From 099f53cb50e45ef617a9f1d63ceec799e489418b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 8 Apr 2009 14:28:37 -0700
Subject: async_tx: rename zero_sum to val

'zero_sum' does not properly describe the operation of generating parity
and checking that it validates against an existing buffer.  Change the
name of the operation to 'val' (for 'validate').  This is in
anticipation of the p+q case where it is a requirement to identify the
target parity buffers separately from the source buffers, because the
target parity buffers will not have corresponding pq coefficients.

Reviewed-by: Andre Noll <maan@systemlinux.org>
Acked-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/async_tx.h  | 2 +-
 include/linux/dmaengine.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 5fc2ef8d97fa..513150d8c25b 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -117,7 +117,7 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	dma_async_tx_callback cb_fn, void *cb_fn_param);
 
 struct dma_async_tx_descriptor *
-async_xor_zero_sum(struct page *dest, struct page **src_list,
+async_xor_val(struct page *dest, struct page **src_list,
 	unsigned int offset, int src_cnt, size_t len,
 	u32 *result, enum async_tx_flags flags,
 	struct dma_async_tx_descriptor *depend_tx,
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 2e2aa3df170c..6768727d00d7 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -55,8 +55,8 @@ enum dma_transaction_type {
 	DMA_PQ_XOR,
 	DMA_DUAL_XOR,
 	DMA_PQ_UPDATE,
-	DMA_ZERO_SUM,
-	DMA_PQ_ZERO_SUM,
+	DMA_XOR_VAL,
+	DMA_PQ_VAL,
 	DMA_MEMSET,
 	DMA_MEMCPY_CRC32C,
 	DMA_INTERRUPT,
@@ -214,7 +214,7 @@ struct dma_async_tx_descriptor {
  * @device_free_chan_resources: release DMA channel's resources
  * @device_prep_dma_memcpy: prepares a memcpy operation
  * @device_prep_dma_xor: prepares a xor operation
- * @device_prep_dma_zero_sum: prepares a zero_sum operation
+ * @device_prep_dma_xor_val: prepares a xor validation operation
  * @device_prep_dma_memset: prepares a memset operation
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
@@ -243,7 +243,7 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
 		struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
 		unsigned int src_cnt, size_t len, unsigned long flags);
-	struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
+	struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)(
 		struct dma_chan *chan, dma_addr_t *src,	unsigned int src_cnt,
 		size_t len, u32 *result, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
-- 
cgit v1.2.3-71-gd317


From 88ba2aa586c874681c072101287e15d40de7e6e2 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 9 Apr 2009 16:16:18 -0700
Subject: async_tx: kill ASYNC_TX_DEP_ACK flag

In support of inter-channel chaining async_tx utilizes an ack flag to
gate whether a dependent operation can be chained to another.  While the
flag is not set the chain can be considered open for appending.  Setting
the ack flag closes the chain and flags the descriptor for garbage
collection.  The ASYNC_TX_DEP_ACK flag essentially means "close the
chain after adding this dependency".  Since each operation can only have
one child the api now implicitly sets the ack flag at dependency
submission time.  This removes an unnecessary management burden from
clients of the api.

[ Impact: clean up and enforce one dependency per operation ]

Reviewed-by: Andre Noll <maan@systemlinux.org>
Acked-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/crypto/async-tx-api.txt |  9 ++++-----
 crypto/async_tx/async_memcpy.c        |  2 +-
 crypto/async_tx/async_memset.c        |  2 +-
 crypto/async_tx/async_tx.c            |  4 ++--
 crypto/async_tx/async_xor.c           |  6 ++----
 drivers/md/raid5.c                    | 25 +++++++++++--------------
 include/linux/async_tx.h              |  4 +---
 7 files changed, 22 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 4af12180d191..76feda8541dc 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -80,8 +80,8 @@ acknowledged by the application before the offload engine driver is allowed to
 recycle (or free) the descriptor.  A descriptor can be acked by one of the
 following methods:
 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
-2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent
-   descriptor of a new operation.
+2/ submitting an unacknowledged descriptor as a dependency to another
+   async_tx call will implicitly set the acknowledged state.
 3/ calling async_tx_ack() on the descriptor.
 
 3.4 When does the operation execute?
@@ -136,10 +136,9 @@ int run_xor_copy_xor(struct page **xor_srcs,
 
 	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
 		       ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL);
-	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len,
-			  ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, tx, NULL, NULL);
 	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
-		       ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK,
+		       ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK,
 		       tx, complete_xor_copy_xor, NULL);
 
 	async_tx_issue_pending_all();
diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
index ddccfb01c416..7117ec6f1b74 100644
--- a/crypto/async_tx/async_memcpy.c
+++ b/crypto/async_tx/async_memcpy.c
@@ -35,7 +35,7 @@
  * @src: src page
  * @offset: offset in pages to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK,
+ * @flags: ASYNC_TX_ACK
  * @depend_tx: memcpy depends on the result of this transaction
  * @cb_fn: function to call when the memcpy completes
  * @cb_param: parameter to pass to the callback routine
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
index 5b5eb99bb244..b2f133885b7f 100644
--- a/crypto/async_tx/async_memset.c
+++ b/crypto/async_tx/async_memset.c
@@ -35,7 +35,7 @@
  * @val: fill value
  * @offset: offset in pages to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @flags: ASYNC_TX_ACK
  * @depend_tx: memset depends on the result of this transaction
  * @cb_fn: function to call when the memcpy completes
  * @cb_param: parameter to pass to the callback routine
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
index 06eb6cc09fef..3766bc3d7d89 100644
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -223,7 +223,7 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
 	if (flags & ASYNC_TX_ACK)
 		async_tx_ack(tx);
 
-	if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+	if (depend_tx)
 		async_tx_ack(depend_tx);
 }
 EXPORT_SYMBOL_GPL(async_tx_submit);
@@ -231,7 +231,7 @@ EXPORT_SYMBOL_GPL(async_tx_submit);
 /**
  * async_trigger_callback - schedules the callback function to be run after
  * any dependent operations have been completed.
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @flags: ASYNC_TX_ACK
  * @depend_tx: 'callback' requires the completion of this transaction
  * @cb_fn: function to call after depend_tx completes
  * @cb_param: parameter to pass to the callback routine
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index e0580b0ea533..3cc5dc763b54 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -105,7 +105,6 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 				_cb_param);
 
 		depend_tx = tx;
-		flags |= ASYNC_TX_DEP_ACK;
 
 		if (src_cnt > xor_src_cnt) {
 			/* drop completed sources */
@@ -168,8 +167,7 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
  * @offset: offset in pages to start transaction
  * @src_cnt: number of source pages
  * @len: length in bytes
- * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST,
- *	ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST, ASYNC_TX_ACK
  * @depend_tx: xor depends on the result of this transaction.
  * @cb_fn: function to call when the xor completes
  * @cb_param: parameter to pass to the callback routine
@@ -230,7 +228,7 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len)
  * @src_cnt: number of source pages
  * @len: length in bytes
  * @result: 0 if sum == 0 else non-zero
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @flags: ASYNC_TX_ACK
  * @depend_tx: xor depends on the result of this transaction.
  * @cb_fn: function to call when the xor completes
  * @cb_param: parameter to pass to the callback routine
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8d2d35ed298..0ef5362c8d02 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -525,14 +525,12 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 			bio_page = bio_iovec_idx(bio, i)->bv_page;
 			if (frombio)
 				tx = async_memcpy(page, bio_page, page_offset,
-					b_offset, clen,
-					ASYNC_TX_DEP_ACK,
-					tx, NULL, NULL);
+						  b_offset, clen, 0,
+						  tx, NULL, NULL);
 			else
 				tx = async_memcpy(bio_page, page, b_offset,
-					page_offset, clen,
-					ASYNC_TX_DEP_ACK,
-					tx, NULL, NULL);
+						  page_offset, clen, 0,
+						  tx, NULL, NULL);
 		}
 		if (clen < len) /* hit end of page */
 			break;
@@ -615,8 +613,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 	}
 
 	atomic_inc(&sh->count);
-	async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
-		ops_complete_biofill, sh);
+	async_trigger_callback(ASYNC_TX_ACK, tx, ops_complete_biofill, sh);
 }
 
 static void ops_complete_compute5(void *stripe_head_ref)
@@ -701,8 +698,8 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	}
 
 	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-		ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
-		ops_complete_prexor, sh);
+		       ASYNC_TX_XOR_DROP_DST, tx,
+		       ops_complete_prexor, sh);
 
 	return tx;
 }
@@ -809,7 +806,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
 	 * for the synchronous xor case
 	 */
-	flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
+	flags = ASYNC_TX_ACK |
 		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
 
 	atomic_inc(&sh->count);
@@ -858,7 +855,7 @@ static void ops_run_check(struct stripe_head *sh)
 		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
 
 	atomic_inc(&sh->count);
-	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
+	tx = async_trigger_callback(ASYNC_TX_ACK, tx,
 		ops_complete_check, sh);
 }
 
@@ -2687,8 +2684,8 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 
 			/* place all the copies on one channel */
 			tx = async_memcpy(sh2->dev[dd_idx].page,
-				sh->dev[i].page, 0, 0, STRIPE_SIZE,
-				ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
+					  0, tx, NULL, NULL);
 
 			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
 			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 513150d8c25b..9f14cd540cd2 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -58,13 +58,11 @@ struct dma_chan_ref {
  * array.
  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
  * dependency chain
- * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
  */
 enum async_tx_flags {
 	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
 	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
-	ASYNC_TX_ACK		 = (1 << 3),
-	ASYNC_TX_DEP_ACK	 = (1 << 4),
+	ASYNC_TX_ACK		 = (1 << 2),
 };
 
 #ifdef CONFIG_DMA_ENGINE
-- 
cgit v1.2.3-71-gd317


From a08abd8ca890a377521d65d493d174bebcaf694b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 3 Jun 2009 11:43:59 -0700
Subject: async_tx: structify submission arguments, add scribble

Prepare the api for the arrival of a new parameter, 'scribble'.  This
will allow callers to identify scratchpad memory for dma address or page
address conversions.  As this adds yet another parameter, take this
opportunity to convert the common submission parameters (flags,
dependency, callback, and callback argument) into an object that is
passed by reference.

Also, take this opportunity to fix up the kerneldoc and add notes about
the relevant ASYNC_TX_* flags for each routine.

[ Impact: moves api pass-by-value parameters to a pass-by-reference struct ]

Signed-off-by: Andre Noll <maan@systemlinux.org>
Acked-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/crypto/async-tx-api.txt |   6 +-
 crypto/async_tx/async_memcpy.c        |  26 +++----
 crypto/async_tx/async_memset.c        |  25 +++----
 crypto/async_tx/async_tx.c            |  51 +++++++-------
 crypto/async_tx/async_xor.c           | 123 +++++++++++++++++-----------------
 drivers/md/raid5.c                    |  59 +++++++++-------
 include/linux/async_tx.h              |  84 ++++++++++++++---------
 7 files changed, 200 insertions(+), 174 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 76feda8541dc..dfe0475f7919 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -54,11 +54,7 @@ features surfaced as a result:
 
 3.1 General format of the API:
 struct dma_async_tx_descriptor *
-async_<operation>(<op specific parameters>,
-		  enum async_tx_flags flags,
-        	  struct dma_async_tx_descriptor *dependency,
-        	  dma_async_tx_callback callback_routine,
-		  void *callback_parameter);
+async_<operation>(<op specific parameters>, struct async_submit ctl *submit)
 
 3.2 Supported operations:
 memcpy  - memory copy between a source and a destination buffer
diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
index 7117ec6f1b74..89e05556f3df 100644
--- a/crypto/async_tx/async_memcpy.c
+++ b/crypto/async_tx/async_memcpy.c
@@ -33,28 +33,28 @@
  * async_memcpy - attempt to copy memory with a dma engine.
  * @dest: destination page
  * @src: src page
- * @offset: offset in pages to start transaction
+ * @dest_offset: offset into 'dest' to start transaction
+ * @src_offset: offset into 'src' to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK
- * @depend_tx: memcpy depends on the result of this transaction
- * @cb_fn: function to call when the memcpy completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK
  */
 struct dma_async_tx_descriptor *
 async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
-	unsigned int src_offset, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+	     unsigned int src_offset, size_t len,
+	     struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMCPY,
 						      &dest, 1, &src, 1, len);
 	struct dma_device *device = chan ? chan->device : NULL;
 	struct dma_async_tx_descriptor *tx = NULL;
 
 	if (device) {
 		dma_addr_t dma_dest, dma_src;
-		unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+		unsigned long dma_prep_flags;
 
+		dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
 		dma_dest = dma_map_page(device->dev, dest, dest_offset, len,
 					DMA_FROM_DEVICE);
 
@@ -67,13 +67,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 
 	if (tx) {
 		pr_debug("%s: (async) len: %zu\n", __func__, len);
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else {
 		void *dest_buf, *src_buf;
 		pr_debug("%s: (sync) len: %zu\n", __func__, len);
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
 		dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
 		src_buf = kmap_atomic(src, KM_USER1) + src_offset;
@@ -83,7 +83,7 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 		kunmap_atomic(dest_buf, KM_USER0);
 		kunmap_atomic(src_buf, KM_USER1);
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
 	}
 
 	return tx;
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
index b2f133885b7f..c14437238f4c 100644
--- a/crypto/async_tx/async_memset.c
+++ b/crypto/async_tx/async_memset.c
@@ -35,26 +35,23 @@
  * @val: fill value
  * @offset: offset in pages to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK
- * @depend_tx: memset depends on the result of this transaction
- * @cb_fn: function to call when the memcpy completes
- * @cb_param: parameter to pass to the callback routine
+ *
+ * honored flags: ASYNC_TX_ACK
  */
 struct dma_async_tx_descriptor *
-async_memset(struct page *dest, int val, unsigned int offset,
-	size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+async_memset(struct page *dest, int val, unsigned int offset, size_t len,
+	     struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMSET,
 						      &dest, 1, NULL, 0, len);
 	struct dma_device *device = chan ? chan->device : NULL;
 	struct dma_async_tx_descriptor *tx = NULL;
 
 	if (device) {
 		dma_addr_t dma_dest;
-		unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+		unsigned long dma_prep_flags;
 
+		dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
 		dma_dest = dma_map_page(device->dev, dest, offset, len,
 					DMA_FROM_DEVICE);
 
@@ -64,19 +61,19 @@ async_memset(struct page *dest, int val, unsigned int offset,
 
 	if (tx) {
 		pr_debug("%s: (async) len: %zu\n", __func__, len);
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else { /* run the memset synchronously */
 		void *dest_buf;
 		pr_debug("%s: (sync) len: %zu\n", __func__, len);
 
-		dest_buf = (void *) (((char *) page_address(dest)) + offset);
+		dest_buf = page_address(dest) + offset;
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
 		memset(dest_buf, val, len);
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
 	}
 
 	return tx;
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
index 3766bc3d7d89..802a5ce437d9 100644
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -45,13 +45,15 @@ static void __exit async_tx_exit(void)
 /**
  * __async_tx_find_channel - find a channel to carry out the operation or let
  *	the transaction execute synchronously
- * @depend_tx: transaction dependency
+ * @submit: transaction dependency and submission modifiers
  * @tx_type: transaction type
  */
 struct dma_chan *
-__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-	enum dma_transaction_type tx_type)
+__async_tx_find_channel(struct async_submit_ctl *submit,
+			enum dma_transaction_type tx_type)
 {
+	struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
+
 	/* see if we can keep the chain on one channel */
 	if (depend_tx &&
 	    dma_has_cap(tx_type, depend_tx->chan->device->cap_mask))
@@ -144,13 +146,14 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 
 
 /**
- * submit_disposition - while holding depend_tx->lock we must avoid submitting
- * 	new operations to prevent a circular locking dependency with
- * 	drivers that already hold a channel lock when calling
- * 	async_tx_run_dependencies.
+ * submit_disposition - flags for routing an incoming operation
  * @ASYNC_TX_SUBMITTED: we were able to append the new operation under the lock
  * @ASYNC_TX_CHANNEL_SWITCH: when the lock is dropped schedule a channel switch
  * @ASYNC_TX_DIRECT_SUBMIT: when the lock is dropped submit directly
+ *
+ * while holding depend_tx->lock we must avoid submitting new operations
+ * to prevent a circular locking dependency with drivers that already
+ * hold a channel lock when calling async_tx_run_dependencies.
  */
 enum submit_disposition {
 	ASYNC_TX_SUBMITTED,
@@ -160,11 +163,12 @@ enum submit_disposition {
 
 void
 async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
-	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+		struct async_submit_ctl *submit)
 {
-	tx->callback = cb_fn;
-	tx->callback_param = cb_param;
+	struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
+
+	tx->callback = submit->cb_fn;
+	tx->callback_param = submit->cb_param;
 
 	if (depend_tx) {
 		enum submit_disposition s;
@@ -220,7 +224,7 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
 		tx->tx_submit(tx);
 	}
 
-	if (flags & ASYNC_TX_ACK)
+	if (submit->flags & ASYNC_TX_ACK)
 		async_tx_ack(tx);
 
 	if (depend_tx)
@@ -229,21 +233,20 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
 EXPORT_SYMBOL_GPL(async_tx_submit);
 
 /**
- * async_trigger_callback - schedules the callback function to be run after
- * any dependent operations have been completed.
- * @flags: ASYNC_TX_ACK
- * @depend_tx: 'callback' requires the completion of this transaction
- * @cb_fn: function to call after depend_tx completes
- * @cb_param: parameter to pass to the callback routine
+ * async_trigger_callback - schedules the callback function to be run
+ * @submit: submission and completion parameters
+ *
+ * honored flags: ASYNC_TX_ACK
+ *
+ * The callback is run after any dependent operations have completed.
  */
 struct dma_async_tx_descriptor *
-async_trigger_callback(enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+async_trigger_callback(struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan;
 	struct dma_device *device;
 	struct dma_async_tx_descriptor *tx;
+	struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
 
 	if (depend_tx) {
 		chan = depend_tx->chan;
@@ -262,14 +265,14 @@ async_trigger_callback(enum async_tx_flags flags,
 	if (tx) {
 		pr_debug("%s: (async)\n", __func__);
 
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else {
 		pr_debug("%s: (sync)\n", __func__);
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
 	}
 
 	return tx;
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 3cc5dc763b54..691fa98a18c4 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -34,18 +34,16 @@
 static __async_inline struct dma_async_tx_descriptor *
 do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 	     unsigned int offset, int src_cnt, size_t len,
-	     enum async_tx_flags flags,
-	     struct dma_async_tx_descriptor *depend_tx,
-	     dma_async_tx_callback cb_fn, void *cb_param)
+	     struct async_submit_ctl *submit)
 {
 	struct dma_device *dma = chan->device;
 	dma_addr_t *dma_src = (dma_addr_t *) src_list;
 	struct dma_async_tx_descriptor *tx = NULL;
 	int src_off = 0;
 	int i;
-	dma_async_tx_callback _cb_fn;
-	void *_cb_param;
-	enum async_tx_flags async_flags;
+	dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+	void *cb_param_orig = submit->cb_param;
+	enum async_tx_flags flags_orig = submit->flags;
 	enum dma_ctrl_flags dma_flags;
 	int xor_src_cnt;
 	dma_addr_t dma_dest;
@@ -63,7 +61,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 	}
 
 	while (src_cnt) {
-		async_flags = flags;
+		submit->flags = flags_orig;
 		dma_flags = 0;
 		xor_src_cnt = min(src_cnt, dma->max_xor);
 		/* if we are submitting additional xors, leave the chain open,
@@ -71,15 +69,15 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 		 * buffer mapped
 		 */
 		if (src_cnt > xor_src_cnt) {
-			async_flags &= ~ASYNC_TX_ACK;
+			submit->flags &= ~ASYNC_TX_ACK;
 			dma_flags = DMA_COMPL_SKIP_DEST_UNMAP;
-			_cb_fn = NULL;
-			_cb_param = NULL;
+			submit->cb_fn = NULL;
+			submit->cb_param = NULL;
 		} else {
-			_cb_fn = cb_fn;
-			_cb_param = cb_param;
+			submit->cb_fn = cb_fn_orig;
+			submit->cb_param = cb_param_orig;
 		}
-		if (_cb_fn)
+		if (submit->cb_fn)
 			dma_flags |= DMA_PREP_INTERRUPT;
 
 		/* Since we have clobbered the src_list we are committed
@@ -90,7 +88,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 					      xor_src_cnt, len, dma_flags);
 
 		if (unlikely(!tx))
-			async_tx_quiesce(&depend_tx);
+			async_tx_quiesce(&submit->depend_tx);
 
 		/* spin wait for the preceeding transactions to complete */
 		while (unlikely(!tx)) {
@@ -101,10 +99,8 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 						      dma_flags);
 		}
 
-		async_tx_submit(chan, tx, async_flags, depend_tx, _cb_fn,
-				_cb_param);
-
-		depend_tx = tx;
+		async_tx_submit(chan, tx, submit);
+		submit->depend_tx = tx;
 
 		if (src_cnt > xor_src_cnt) {
 			/* drop completed sources */
@@ -123,8 +119,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 
 static void
 do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
-	    int src_cnt, size_t len, enum async_tx_flags flags,
-	    dma_async_tx_callback cb_fn, void *cb_param)
+	    int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
 	int i;
 	int xor_src_cnt;
@@ -139,7 +134,7 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	/* set destination address */
 	dest_buf = page_address(dest) + offset;
 
-	if (flags & ASYNC_TX_XOR_ZERO_DST)
+	if (submit->flags & ASYNC_TX_XOR_ZERO_DST)
 		memset(dest_buf, 0, len);
 
 	while (src_cnt > 0) {
@@ -152,33 +147,35 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
 		src_off += xor_src_cnt;
 	}
 
-	async_tx_sync_epilog(cb_fn, cb_param);
+	async_tx_sync_epilog(submit);
 }
 
 /**
  * async_xor - attempt to xor a set of blocks with a dma engine.
- *	xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST
- *	flag must be set to not include dest data in the calculation.  The
- *	assumption with dma eninges is that they only use the destination
- *	buffer as a source when it is explicity specified in the source list.
  * @dest: destination page
- * @src_list: array of source pages (if the dest is also a source it must be
- *	at index zero).  The contents of this array may be overwritten.
- * @offset: offset in pages to start transaction
+ * @src_list: array of source pages
+ * @offset: common src/dst offset to start transaction
  * @src_cnt: number of source pages
  * @len: length in bytes
- * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST, ASYNC_TX_ACK
- * @depend_tx: xor depends on the result of this transaction.
- * @cb_fn: function to call when the xor completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST
+ *
+ * xor_blocks always uses the dest as a source so the
+ * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in
+ * the calculation.  The assumption with dma eninges is that they only
+ * use the destination buffer as a source when it is explicity specified
+ * in the source list.
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
  */
 struct dma_async_tx_descriptor *
 async_xor(struct page *dest, struct page **src_list, unsigned int offset,
-	int src_cnt, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+	  int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR,
 						      &dest, 1, src_list,
 						      src_cnt, len);
 	BUG_ON(src_cnt <= 1);
@@ -188,7 +185,7 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 		pr_debug("%s (async): len: %zu\n", __func__, len);
 
 		return do_async_xor(chan, dest, src_list, offset, src_cnt, len,
-				    flags, depend_tx, cb_fn, cb_param);
+				    submit);
 	} else {
 		/* run the xor synchronously */
 		pr_debug("%s (sync): len: %zu\n", __func__, len);
@@ -196,16 +193,15 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 		/* in the sync case the dest is an implied source
 		 * (assumes the dest is the first source)
 		 */
-		if (flags & ASYNC_TX_XOR_DROP_DST) {
+		if (submit->flags & ASYNC_TX_XOR_DROP_DST) {
 			src_cnt--;
 			src_list++;
 		}
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
-		do_sync_xor(dest, src_list, offset, src_cnt, len,
-			    flags, cb_fn, cb_param);
+		do_sync_xor(dest, src_list, offset, src_cnt, len, submit);
 
 		return NULL;
 	}
@@ -222,25 +218,25 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len)
 /**
  * async_xor_val - attempt a xor parity check with a dma engine.
  * @dest: destination page used if the xor is performed synchronously
- * @src_list: array of source pages.  The dest page must be listed as a source
- * 	at index zero.  The contents of this array may be overwritten.
+ * @src_list: array of source pages
  * @offset: offset in pages to start transaction
  * @src_cnt: number of source pages
  * @len: length in bytes
  * @result: 0 if sum == 0 else non-zero
- * @flags: ASYNC_TX_ACK
- * @depend_tx: xor depends on the result of this transaction.
- * @cb_fn: function to call when the xor completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
  */
 struct dma_async_tx_descriptor *
-async_xor_val(struct page *dest, struct page **src_list,
-	unsigned int offset, int src_cnt, size_t len,
-	u32 *result, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
+	      int src_cnt, size_t len, u32 *result,
+	      struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR_VAL,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL,
 						      &dest, 1, src_list,
 						      src_cnt, len);
 	struct dma_device *device = chan ? chan->device : NULL;
@@ -250,11 +246,12 @@ async_xor_val(struct page *dest, struct page **src_list,
 
 	if (device && src_cnt <= device->max_xor) {
 		dma_addr_t *dma_src = (dma_addr_t *) src_list;
-		unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+		unsigned long dma_prep_flags;
 		int i;
 
 		pr_debug("%s: (async) len: %zu\n", __func__, len);
 
+		dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
 		for (i = 0; i < src_cnt; i++)
 			dma_src[i] = dma_map_page(device->dev, src_list[i],
 						  offset, len, DMA_TO_DEVICE);
@@ -263,7 +260,7 @@ async_xor_val(struct page *dest, struct page **src_list,
 						     len, result,
 						     dma_prep_flags);
 		if (unlikely(!tx)) {
-			async_tx_quiesce(&depend_tx);
+			async_tx_quiesce(&submit->depend_tx);
 
 			while (!tx) {
 				dma_async_issue_pending(chan);
@@ -273,23 +270,23 @@ async_xor_val(struct page *dest, struct page **src_list,
 			}
 		}
 
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else {
-		unsigned long xor_flags = flags;
+		enum async_tx_flags flags_orig = submit->flags;
 
 		pr_debug("%s: (sync) len: %zu\n", __func__, len);
 
-		xor_flags |= ASYNC_TX_XOR_DROP_DST;
-		xor_flags &= ~ASYNC_TX_ACK;
+		submit->flags |= ASYNC_TX_XOR_DROP_DST;
+		submit->flags &= ~ASYNC_TX_ACK;
 
-		tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags,
-			depend_tx, NULL, NULL);
+		tx = async_xor(dest, src_list, offset, src_cnt, len, submit);
 
 		async_tx_quiesce(&tx);
 
 		*result = page_is_zero(dest, offset, len) ? 0 : 1;
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
+		submit->flags = flags_orig;
 	}
 
 	return tx;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0ef5362c8d02..e1920f23579f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -499,11 +499,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 	struct page *bio_page;
 	int i;
 	int page_offset;
+	struct async_submit_ctl submit;
 
 	if (bio->bi_sector >= sector)
 		page_offset = (signed)(bio->bi_sector - sector) * 512;
 	else
 		page_offset = (signed)(sector - bio->bi_sector) * -512;
+
+	init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
 	bio_for_each_segment(bvl, bio, i) {
 		int len = bio_iovec_idx(bio, i)->bv_len;
 		int clen;
@@ -525,13 +528,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 			bio_page = bio_iovec_idx(bio, i)->bv_page;
 			if (frombio)
 				tx = async_memcpy(page, bio_page, page_offset,
-						  b_offset, clen, 0,
-						  tx, NULL, NULL);
+						  b_offset, clen, &submit);
 			else
 				tx = async_memcpy(bio_page, page, b_offset,
-						  page_offset, clen, 0,
-						  tx, NULL, NULL);
+						  page_offset, clen, &submit);
 		}
+		/* chain the operations */
+		submit.depend_tx = tx;
+
 		if (clen < len) /* hit end of page */
 			break;
 		page_offset +=  len;
@@ -590,6 +594,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
 	raid5_conf_t *conf = sh->raid_conf;
+	struct async_submit_ctl submit;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -613,7 +618,8 @@ static void ops_run_biofill(struct stripe_head *sh)
 	}
 
 	atomic_inc(&sh->count);
-	async_trigger_callback(ASYNC_TX_ACK, tx, ops_complete_biofill, sh);
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
+	async_trigger_callback(&submit);
 }
 
 static void ops_complete_compute5(void *stripe_head_ref)
@@ -645,6 +651,7 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
 	struct page *xor_dest = tgt->page;
 	int count = 0;
 	struct dma_async_tx_descriptor *tx;
+	struct async_submit_ctl submit;
 	int i;
 
 	pr_debug("%s: stripe %llu block: %d\n",
@@ -657,13 +664,12 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
 
 	atomic_inc(&sh->count);
 
+	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+			  ops_complete_compute5, sh, NULL);
 	if (unlikely(count == 1))
-		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-			0, NULL, ops_complete_compute5, sh);
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
-		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-			ASYNC_TX_XOR_ZERO_DST, NULL,
-			ops_complete_compute5, sh);
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
 	return tx;
 }
@@ -683,6 +689,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	int disks = sh->disks;
 	struct page *xor_srcs[disks];
 	int count = 0, pd_idx = sh->pd_idx, i;
+	struct async_submit_ctl submit;
 
 	/* existing parity data subtracted */
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -697,9 +704,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			xor_srcs[count++] = dev->page;
 	}
 
-	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-		       ASYNC_TX_XOR_DROP_DST, tx,
-		       ops_complete_prexor, sh);
+	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
+			  ops_complete_prexor, sh, NULL);
+	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
 	return tx;
 }
@@ -772,7 +779,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
 	struct page *xor_srcs[disks];
-
+	struct async_submit_ctl submit;
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
 	int prexor = 0;
@@ -811,13 +818,11 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 	atomic_inc(&sh->count);
 
-	if (unlikely(count == 1)) {
-		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
-		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-			flags, tx, ops_complete_postxor, sh);
-	} else
-		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-			flags, tx, ops_complete_postxor, sh);
+	init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, NULL);
+	if (unlikely(count == 1))
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+	else
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -838,6 +843,7 @@ static void ops_run_check(struct stripe_head *sh)
 	int disks = sh->disks;
 	struct page *xor_srcs[disks];
 	struct dma_async_tx_descriptor *tx;
+	struct async_submit_ctl submit;
 
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -851,12 +857,13 @@ static void ops_run_check(struct stripe_head *sh)
 			xor_srcs[count++] = dev->page;
 	}
 
+	init_async_submit(&submit, 0, NULL, NULL, NULL, NULL);
 	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
+			   &sh->ops.zero_sum_result, &submit);
 
 	atomic_inc(&sh->count);
-	tx = async_trigger_callback(ASYNC_TX_ACK, tx,
-		ops_complete_check, sh);
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
+	tx = async_trigger_callback(&submit);
 }
 
 static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
@@ -2664,6 +2671,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 		if (i != sh->pd_idx && i != sh->qd_idx) {
 			int dd_idx, j;
 			struct stripe_head *sh2;
+			struct async_submit_ctl submit;
 
 			sector_t bn = compute_blocknr(sh, i, 1);
 			sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2683,9 +2691,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 			}
 
 			/* place all the copies on one channel */
+			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
 			tx = async_memcpy(sh2->dev[dd_idx].page,
 					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
-					  0, tx, NULL, NULL);
+					  &submit);
 
 			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
 			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 9f14cd540cd2..00cfb637ddf2 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -65,6 +65,22 @@ enum async_tx_flags {
 	ASYNC_TX_ACK		 = (1 << 2),
 };
 
+/**
+ * struct async_submit_ctl - async_tx submission/completion modifiers
+ * @flags: submission modifiers
+ * @depend_tx: parent dependency of the current operation being submitted
+ * @cb_fn: callback routine to run at operation completion
+ * @cb_param: parameter for the callback routine
+ * @scribble: caller provided space for dma/page address conversions
+ */
+struct async_submit_ctl {
+	enum async_tx_flags flags;
+	struct dma_async_tx_descriptor *depend_tx;
+	dma_async_tx_callback cb_fn;
+	void *cb_param;
+	void *scribble;
+};
+
 #ifdef CONFIG_DMA_ENGINE
 #define async_tx_issue_pending_all dma_issue_pending_all
 #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL
@@ -73,8 +89,8 @@ enum async_tx_flags {
 #define async_tx_find_channel(dep, type, dst, dst_count, src, src_count, len) \
 	 __async_tx_find_channel(dep, type)
 struct dma_chan *
-__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-	enum dma_transaction_type tx_type);
+__async_tx_find_channel(struct async_submit_ctl *submit,
+			enum dma_transaction_type tx_type);
 #endif /* CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL */
 #else
 static inline void async_tx_issue_pending_all(void)
@@ -83,9 +99,10 @@ static inline void async_tx_issue_pending_all(void)
 }
 
 static inline struct dma_chan *
-async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-	enum dma_transaction_type tx_type, struct page **dst, int dst_count,
-	struct page **src, int src_count, size_t len)
+async_tx_find_channel(struct async_submit_ctl *submit,
+		      enum dma_transaction_type tx_type, struct page **dst,
+		      int dst_count, struct page **src, int src_count,
+		      size_t len)
 {
 	return NULL;
 }
@@ -97,46 +114,53 @@ async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
  * @cb_fn_param: parameter to pass to the callback routine
  */
 static inline void
-async_tx_sync_epilog(dma_async_tx_callback cb_fn, void *cb_fn_param)
+async_tx_sync_epilog(struct async_submit_ctl *submit)
+{
+	if (submit->cb_fn)
+		submit->cb_fn(submit->cb_param);
+}
+
+typedef union {
+	unsigned long addr;
+	struct page *page;
+	dma_addr_t dma;
+} addr_conv_t;
+
+static inline void
+init_async_submit(struct async_submit_ctl *args, enum async_tx_flags flags,
+		  struct dma_async_tx_descriptor *tx,
+		  dma_async_tx_callback cb_fn, void *cb_param,
+		  addr_conv_t *scribble)
 {
-	if (cb_fn)
-		cb_fn(cb_fn_param);
+	args->flags = flags;
+	args->depend_tx = tx;
+	args->cb_fn = cb_fn;
+	args->cb_param = cb_param;
+	args->scribble = scribble;
 }
 
-void
-async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
-	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+		     struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_xor(struct page *dest, struct page **src_list, unsigned int offset,
-	int src_cnt, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+	  int src_cnt, size_t len, struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
-async_xor_val(struct page *dest, struct page **src_list,
-	unsigned int offset, int src_cnt, size_t len,
-	u32 *result, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
+	      int src_cnt, size_t len, u32 *result,
+	      struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
-	unsigned int src_offset, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+	     unsigned int src_offset, size_t len,
+	     struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_memset(struct page *dest, int val, unsigned int offset,
-	size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+	     size_t len, struct async_submit_ctl *submit);
 
-struct dma_async_tx_descriptor *
-async_trigger_callback(enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit);
 
 void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
 #endif /* _ASYNC_TX_H_ */
-- 
cgit v1.2.3-71-gd317


From a635f9dd83f3382577f4544a96df12356e951a40 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 12 Jun 2009 15:20:55 +0200
Subject: HID: use debugfs for report dumping descriptor

It is a little bit inconvenient for people who have some non-standard
HID hardware (usually violating the HID specification) to have to
recompile kernel with CONFIG_HID_DEBUG to be able to see kernel's perspective
of the HID report descriptor and observe the parsed events. Plus the messages
are then mixed up inconveniently with the rest of the dmesg stuff.

This patch implements /sys/kernel/debug/hid/<device>/rdesc file, which
represents the kernel's view of report descriptor (both the raw report
descriptor data and parsed contents).

With all the device-specific debug data being available through debugfs, there
is no need for keeping CONFIG_HID_DEBUG, as the 'debug' parameter to the
hid module will now only output only driver-specific debugging options, which has
absolutely minimal memory footprint, just a few error messages and one global
flag (hid_debug).

We use the current set of output formatting functions. The ones that need to be
used both for one-shot rdesc seq_file and also for continuous flow of data
(individual reports, as being sent by the device) distinguish according to the
passed seq_file parameter, and if it is NULL, it still output to kernel ringbuffer,
otherwise the corresponding seq_file is used for output.

The format of the output is preserved.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/Kconfig           |  15 ---
 drivers/hid/Makefile          |   5 +-
 drivers/hid/hid-core.c        |  13 ++-
 drivers/hid/hid-debug.c       | 227 +++++++++++++++++++++++++++++-------------
 drivers/hid/hid-input.c       |  13 +--
 drivers/hid/usbhid/hid-core.c |   8 +-
 include/linux/hid-debug.h     |  32 +++---
 include/linux/hid.h           |   4 +
 8 files changed, 195 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
index 7e67dcb3d4f6..05950a783560 100644
--- a/drivers/hid/Kconfig
+++ b/drivers/hid/Kconfig
@@ -31,21 +31,6 @@ config HID
 
 	  If unsure, say Y.
 
-config HID_DEBUG
-	bool "HID debugging support"
-	default y
-	depends on HID
-	---help---
-	This option lets the HID layer output diagnostics about its internal
-	state, resolve HID usages, dump HID fields, etc. Individual HID drivers
-	use this debugging facility to output information about individual HID
-	devices, etc.
-
-	This feature is useful for those who are either debugging the HID parser
-	or any HID hardware device.
-
-	If unsure, say Y.
-
 config HIDRAW
 	bool "/dev/hidraw raw HID device support"
 	depends on HID
diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
index 1f7cb0fd4505..cf3687d1ed63 100644
--- a/drivers/hid/Makefile
+++ b/drivers/hid/Makefile
@@ -3,9 +3,12 @@
 #
 hid-objs			:= hid-core.o hid-input.o
 
+ifdef CONFIG_DEBUG_FS
+	hid-objs		+= hid-debug.o
+endif
+
 obj-$(CONFIG_HID)		+= hid.o
 
-hid-$(CONFIG_HID_DEBUG)		+= hid-debug.o
 hid-$(CONFIG_HIDRAW)		+= hidraw.o
 
 hid-logitech-objs		:= hid-lg.o
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 8551693d645f..d4317db85b54 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -44,12 +44,10 @@
 #define DRIVER_DESC "HID core driver"
 #define DRIVER_LICENSE "GPL"
 
-#ifdef CONFIG_HID_DEBUG
 int hid_debug = 0;
 module_param_named(debug, hid_debug, int, 0600);
 MODULE_PARM_DESC(debug, "HID debugging (0=off, 1=probing info, 2=continuous data dumping)");
 EXPORT_SYMBOL_GPL(hid_debug);
-#endif
 
 /*
  * Register a new report for a device.
@@ -987,7 +985,6 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value)
 
 	if (offset >= field->report_count) {
 		dbg_hid("offset (%d) exceeds report_count (%d)\n", offset, field->report_count);
-		hid_dump_field(field, 8);
 		return -1;
 	}
 	if (field->logical_minimum < 0) {
@@ -1721,6 +1718,8 @@ int hid_add_device(struct hid_device *hdev)
 	if (!ret)
 		hdev->status |= HID_STAT_ADDED;
 
+	hid_debug_register(hdev, dev_name(&hdev->dev));
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(hid_add_device);
@@ -1768,6 +1767,7 @@ static void hid_remove_device(struct hid_device *hdev)
 {
 	if (hdev->status & HID_STAT_ADDED) {
 		device_del(&hdev->dev);
+		hid_debug_unregister(hdev);
 		hdev->status &= ~HID_STAT_ADDED;
 	}
 }
@@ -1843,6 +1843,10 @@ static int __init hid_init(void)
 {
 	int ret;
 
+	if (hid_debug)
+		printk(KERN_WARNING "HID: hid_debug parameter has been deprecated. "
+				"Debugging data are now provided via debugfs\n");
+
 	ret = bus_register(&hid_bus_type);
 	if (ret) {
 		printk(KERN_ERR "HID: can't register hid bus\n");
@@ -1853,6 +1857,8 @@ static int __init hid_init(void)
 	if (ret)
 		goto err_bus;
 
+	hid_debug_init();
+
 	return 0;
 err_bus:
 	bus_unregister(&hid_bus_type);
@@ -1862,6 +1868,7 @@ err:
 
 static void __exit hid_exit(void)
 {
+	hid_debug_exit();
 	hidraw_exit();
 	bus_unregister(&hid_bus_type);
 }
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index 47ac1a7d66e1..067e173aa3e4 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -1,9 +1,9 @@
 /*
  *  (c) 1999 Andreas Gal		<gal@cs.uni-magdeburg.de>
  *  (c) 2000-2001 Vojtech Pavlik	<vojtech@ucw.cz>
- *  (c) 2007 Jiri Kosina
+ *  (c) 2007-2009 Jiri Kosina
  *
- *  Some debug stuff for the HID parser.
+ *  HID debugging support
  */
 
 /*
@@ -26,9 +26,13 @@
  * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <linux/hid.h>
 #include <linux/hid-debug.h>
 
+static struct dentry *hid_debug_root;
+
 struct hid_usage_entry {
 	unsigned  page;
 	unsigned  usage;
@@ -331,72 +335,83 @@ static const struct hid_usage_entry hid_usage_table[] = {
   { 0, 0, NULL }
 };
 
-static void resolv_usage_page(unsigned page) {
+static void resolv_usage_page(unsigned page, struct seq_file *f) {
 	const struct hid_usage_entry *p;
 
 	for (p = hid_usage_table; p->description; p++)
 		if (p->page == page) {
-			printk("%s", p->description);
+			if (!f)
+				printk("%s", p->description);
+			else
+				seq_printf(f, "%s", p->description);
 			return;
 		}
-	printk("%04x", page);
+	if (!f)
+		printk("%04x", page);
+	else
+		seq_printf(f, "%04x", page);
 }
 
-void hid_resolv_usage(unsigned usage) {
+void hid_resolv_usage(unsigned usage, struct seq_file *f) {
 	const struct hid_usage_entry *p;
 
-	if (!hid_debug)
-		return;
-
-	resolv_usage_page(usage >> 16);
-	printk(".");
+	resolv_usage_page(usage >> 16, f);
+	if (!f)
+		printk(".");
+	else
+		seq_printf(f, ".");
 	for (p = hid_usage_table; p->description; p++)
 		if (p->page == (usage >> 16)) {
 			for(++p; p->description && p->usage != 0; p++)
 				if (p->usage == (usage & 0xffff)) {
-					printk("%s", p->description);
+					if (!f)
+						printk("%s", p->description);
+					else
+						seq_printf(f,
+							"%s",
+							p->description);
 					return;
 				}
 			break;
 		}
-	printk("%04x", usage & 0xffff);
+	if (!f)
+		printk("%04x", usage & 0xffff);
+	else
+		seq_printf(f, "%04x", usage & 0xffff);
 }
 EXPORT_SYMBOL_GPL(hid_resolv_usage);
 
-static void tab(int n) {
-	printk(KERN_DEBUG "%*s", n, "");
+static void tab(int n, struct seq_file *f) {
+	seq_printf(f, "%*s", n, "");
 }
 
-void hid_dump_field(struct hid_field *field, int n) {
+void hid_dump_field(struct hid_field *field, int n, struct seq_file *f) {
 	int j;
 
-	if (!hid_debug)
-		return;
-
 	if (field->physical) {
-		tab(n);
-		printk("Physical(");
-		hid_resolv_usage(field->physical); printk(")\n");
+		tab(n, f);
+		seq_printf(f, "Physical(");
+		hid_resolv_usage(field->physical, f); seq_printf(f, ")\n");
 	}
 	if (field->logical) {
-		tab(n);
-		printk("Logical(");
-		hid_resolv_usage(field->logical); printk(")\n");
+		tab(n, f);
+		seq_printf(f, "Logical(");
+		hid_resolv_usage(field->logical, f); seq_printf(f, ")\n");
 	}
-	tab(n); printk("Usage(%d)\n", field->maxusage);
+	tab(n, f); seq_printf(f, "Usage(%d)\n", field->maxusage);
 	for (j = 0; j < field->maxusage; j++) {
-		tab(n+2); hid_resolv_usage(field->usage[j].hid); printk("\n");
+		tab(n+2, f); hid_resolv_usage(field->usage[j].hid, f); seq_printf(f, "\n");
 	}
 	if (field->logical_minimum != field->logical_maximum) {
-		tab(n); printk("Logical Minimum(%d)\n", field->logical_minimum);
-		tab(n); printk("Logical Maximum(%d)\n", field->logical_maximum);
+		tab(n, f); seq_printf(f, "Logical Minimum(%d)\n", field->logical_minimum);
+		tab(n, f); seq_printf(f, "Logical Maximum(%d)\n", field->logical_maximum);
 	}
 	if (field->physical_minimum != field->physical_maximum) {
-		tab(n); printk("Physical Minimum(%d)\n", field->physical_minimum);
-		tab(n); printk("Physical Maximum(%d)\n", field->physical_maximum);
+		tab(n, f); seq_printf(f, "Physical Minimum(%d)\n", field->physical_minimum);
+		tab(n, f); seq_printf(f, "Physical Maximum(%d)\n", field->physical_maximum);
 	}
 	if (field->unit_exponent) {
-		tab(n); printk("Unit Exponent(%d)\n", field->unit_exponent);
+		tab(n, f); seq_printf(f, "Unit Exponent(%d)\n", field->unit_exponent);
 	}
 	if (field->unit) {
 		static const char *systems[5] = { "None", "SI Linear", "SI Rotation", "English Linear", "English Rotation" };
@@ -417,77 +432,75 @@ void hid_dump_field(struct hid_field *field, int n) {
 		data >>= 4;
 
 		if(sys > 4) {
-			tab(n); printk("Unit(Invalid)\n");
+			tab(n, f); seq_printf(f, "Unit(Invalid)\n");
 		}
 		else {
 			int earlier_unit = 0;
 
-			tab(n); printk("Unit(%s : ", systems[sys]);
+			tab(n, f); seq_printf(f, "Unit(%s : ", systems[sys]);
 
 			for (i=1 ; i<sizeof(__u32)*2 ; i++) {
 				char nibble = data & 0xf;
 				data >>= 4;
 				if (nibble != 0) {
 					if(earlier_unit++ > 0)
-						printk("*");
-					printk("%s", units[sys][i]);
+						seq_printf(f, "*");
+					seq_printf(f, "%s", units[sys][i]);
 					if(nibble != 1) {
 						/* This is a _signed_ nibble(!) */
 
 						int val = nibble & 0x7;
 						if(nibble & 0x08)
 							val = -((0x7 & ~val) +1);
-						printk("^%d", val);
+						seq_printf(f, "^%d", val);
 					}
 				}
 			}
-			printk(")\n");
+			seq_printf(f, ")\n");
 		}
 	}
-	tab(n); printk("Report Size(%u)\n", field->report_size);
-	tab(n); printk("Report Count(%u)\n", field->report_count);
-	tab(n); printk("Report Offset(%u)\n", field->report_offset);
+	tab(n, f); seq_printf(f, "Report Size(%u)\n", field->report_size);
+	tab(n, f); seq_printf(f, "Report Count(%u)\n", field->report_count);
+	tab(n, f); seq_printf(f, "Report Offset(%u)\n", field->report_offset);
 
-	tab(n); printk("Flags( ");
+	tab(n, f); seq_printf(f, "Flags( ");
 	j = field->flags;
-	printk("%s", HID_MAIN_ITEM_CONSTANT & j ? "Constant " : "");
-	printk("%s", HID_MAIN_ITEM_VARIABLE & j ? "Variable " : "Array ");
-	printk("%s", HID_MAIN_ITEM_RELATIVE & j ? "Relative " : "Absolute ");
-	printk("%s", HID_MAIN_ITEM_WRAP & j ? "Wrap " : "");
-	printk("%s", HID_MAIN_ITEM_NONLINEAR & j ? "NonLinear " : "");
-	printk("%s", HID_MAIN_ITEM_NO_PREFERRED & j ? "NoPreferredState " : "");
-	printk("%s", HID_MAIN_ITEM_NULL_STATE & j ? "NullState " : "");
-	printk("%s", HID_MAIN_ITEM_VOLATILE & j ? "Volatile " : "");
-	printk("%s", HID_MAIN_ITEM_BUFFERED_BYTE & j ? "BufferedByte " : "");
-	printk(")\n");
+	seq_printf(f, "%s", HID_MAIN_ITEM_CONSTANT & j ? "Constant " : "");
+	seq_printf(f, "%s", HID_MAIN_ITEM_VARIABLE & j ? "Variable " : "Array ");
+	seq_printf(f, "%s", HID_MAIN_ITEM_RELATIVE & j ? "Relative " : "Absolute ");
+	seq_printf(f, "%s", HID_MAIN_ITEM_WRAP & j ? "Wrap " : "");
+	seq_printf(f, "%s", HID_MAIN_ITEM_NONLINEAR & j ? "NonLinear " : "");
+	seq_printf(f, "%s", HID_MAIN_ITEM_NO_PREFERRED & j ? "NoPreferredState " : "");
+	seq_printf(f, "%s", HID_MAIN_ITEM_NULL_STATE & j ? "NullState " : "");
+	seq_printf(f, "%s", HID_MAIN_ITEM_VOLATILE & j ? "Volatile " : "");
+	seq_printf(f, "%s", HID_MAIN_ITEM_BUFFERED_BYTE & j ? "BufferedByte " : "");
+	seq_printf(f, ")\n");
 }
 EXPORT_SYMBOL_GPL(hid_dump_field);
 
-void hid_dump_device(struct hid_device *device) {
+void hid_dump_device(struct hid_device *device, struct seq_file *f)
+{
 	struct hid_report_enum *report_enum;
 	struct hid_report *report;
 	struct list_head *list;
 	unsigned i,k;
 	static const char *table[] = {"INPUT", "OUTPUT", "FEATURE"};
 
-	if (!hid_debug)
-		return;
-
 	for (i = 0; i < HID_REPORT_TYPES; i++) {
 		report_enum = device->report_enum + i;
 		list = report_enum->report_list.next;
 		while (list != &report_enum->report_list) {
 			report = (struct hid_report *) list;
-			tab(2);
-			printk("%s", table[i]);
+			tab(2, f);
+			seq_printf(f, "%s", table[i]);
 			if (report->id)
-				printk("(%d)", report->id);
-			printk("[%s]", table[report->type]);
-			printk("\n");
+				seq_printf(f, "(%d)", report->id);
+			seq_printf(f, "[%s]", table[report->type]);
+			seq_printf(f, "\n");
 			for (k = 0; k < report->maxfield; k++) {
-				tab(4);
-				printk("Field(%d)\n", k);
-				hid_dump_field(report->field[k], 6);
+				tab(4, f);
+				seq_printf(f, "Field(%d)\n", k);
+				hid_dump_field(report->field[k], 6, f);
 			}
 			list = list->next;
 		}
@@ -500,7 +513,7 @@ void hid_dump_input(struct hid_usage *usage, __s32 value) {
 		return;
 
 	printk(KERN_DEBUG "hid-debug: input ");
-	hid_resolv_usage(usage->hid);
+	hid_resolv_usage(usage->hid, NULL);
 	printk(" = %d\n", value);
 }
 EXPORT_SYMBOL_GPL(hid_dump_input);
@@ -767,12 +780,84 @@ static const char **names[EV_MAX + 1] = {
 	[EV_SND] = sounds,			[EV_REP] = repeats,
 };
 
-void hid_resolv_event(__u8 type, __u16 code) {
-
-	if (!hid_debug)
-		return;
+void hid_resolv_event(__u8 type, __u16 code, struct seq_file *f) {
 
-	printk("%s.%s", events[type] ? events[type] : "?",
+	seq_printf(f, "%s.%s", events[type] ? events[type] : "?",
 		names[type] ? (names[type][code] ? names[type][code] : "?") : "?");
 }
-EXPORT_SYMBOL_GPL(hid_resolv_event);
+
+void hid_dump_input_mapping(struct hid_device *hid, struct seq_file *f)
+{
+	int i, j, k;
+	struct hid_report *report;
+	struct hid_usage *usage;
+
+	for (k = HID_INPUT_REPORT; k <= HID_OUTPUT_REPORT; k++) {
+		list_for_each_entry(report, &hid->report_enum[k].report_list, list) {
+			for (i = 0; i < report->maxfield; i++) {
+				for ( j = 0; j < report->field[i]->maxusage; j++) {
+					usage = report->field[i]->usage + j;
+					hid_resolv_usage(usage->hid, f);
+					seq_printf(f, " ---> ");
+					hid_resolv_event(usage->type, usage->code, f);
+					seq_printf(f, "\n");
+				}
+			}
+		}
+	}
+
+}
+
+static int hid_debug_rdesc_show(struct seq_file *f, void *p)
+{
+	struct hid_device *hdev = f->private;
+	int i;
+
+	/* dump HID report descriptor */
+	for (i = 0; i < hdev->rsize; i++)
+		seq_printf(f, "%02x ", hdev->rdesc[i]);
+	seq_printf(f, "\n\n");
+
+	/* dump parsed data and input mappings */
+	hid_dump_device(hdev, f);
+	seq_printf(f, "\n");
+	hid_dump_input_mapping(hdev, f);
+
+	return 0;
+}
+
+static int hid_debug_rdesc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, hid_debug_rdesc_show, inode->i_private);
+}
+
+static const struct file_operations hid_debug_rdesc_fops = {
+	.open           = hid_debug_rdesc_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = single_release,
+};
+
+void hid_debug_register(struct hid_device *hdev, const char *name)
+{
+	hdev->debug_dir = debugfs_create_dir(name, hid_debug_root);
+	hdev->debug_rdesc = debugfs_create_file("rdesc", 0400,
+			hdev->debug_dir, hdev, &hid_debug_rdesc_fops);
+}
+
+void hid_debug_unregister(struct hid_device *hdev)
+{
+	debugfs_remove(hdev->debug_rdesc);
+	debugfs_remove(hdev->debug_dir);
+}
+
+void hid_debug_init(void)
+{
+	hid_debug_root = debugfs_create_dir("hid", NULL);
+}
+
+void hid_debug_exit(void)
+{
+	debugfs_remove_recursive(hid_debug_root);
+}
+
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 7f183b7147e1..5862b0f3b55d 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -159,17 +159,12 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel
 
 	field->hidinput = hidinput;
 
-	dbg_hid("Mapping: ");
-	hid_resolv_usage(usage->hid);
-	dbg_hid_line(" ---> ");
-
 	if (field->flags & HID_MAIN_ITEM_CONSTANT)
 		goto ignore;
 
 	/* only LED usages are supported in output fields */
 	if (field->report_type == HID_OUTPUT_REPORT &&
 			(usage->hid & HID_USAGE_PAGE) != HID_UP_LED) {
-		dbg_hid_line(" [non-LED output field] ");
 		goto ignore;
 	}
 
@@ -561,15 +556,9 @@ mapped:
 		set_bit(MSC_SCAN, input->mscbit);
 	}
 
-	hid_resolv_event(usage->type, usage->code);
-
-	dbg_hid_line("\n");
-
-	return;
-
 ignore:
-	dbg_hid_line("IGNORED\n");
 	return;
+
 }
 
 void hidinput_hid_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value)
diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index ac8049b5f1e9..708aa52d0753 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -4,8 +4,8 @@
  *  Copyright (c) 1999 Andreas Gal
  *  Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz>
  *  Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc
- *  Copyright (c) 2006-2008 Jiri Kosina
  *  Copyright (c) 2007-2008 Oliver Neukum
+ *  Copyright (c) 2006-2009 Jiri Kosina
  */
 
 /*
@@ -886,11 +886,6 @@ static int usbhid_parse(struct hid_device *hid)
 		goto err;
 	}
 
-	dbg_hid("report descriptor (size %u, read %d) = ", rsize, n);
-	for (n = 0; n < rsize; n++)
-		dbg_hid_line(" %02x", (unsigned char) rdesc[n]);
-	dbg_hid_line("\n");
-
 	ret = hid_parse_report(hid, rdesc, rsize);
 	kfree(rdesc);
 	if (ret) {
@@ -1005,7 +1000,6 @@ static int usbhid_start(struct hid_device *hid)
 	usbhid->urbctrl->transfer_flags |= (URB_NO_TRANSFER_DMA_MAP | URB_NO_SETUP_DMA_MAP);
 
 	usbhid_init_reports(hid);
-	hid_dump_device(hid);
 
 	set_bit(HID_STARTED, &usbhid->iofl);
 
diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h
index 50d568ec178a..516e12c33235 100644
--- a/include/linux/hid-debug.h
+++ b/include/linux/hid-debug.h
@@ -2,7 +2,7 @@
 #define __HID_DEBUG_H
 
 /*
- *  Copyright (c) 2007	Jiri Kosina
+ *  Copyright (c) 2007-2009	Jiri Kosina
  */
 
 /*
@@ -22,24 +22,30 @@
  *
  */
 
-#ifdef CONFIG_HID_DEBUG
+#ifdef CONFIG_DEBUG_FS
 
 void hid_dump_input(struct hid_usage *, __s32);
-void hid_dump_device(struct hid_device *);
-void hid_dump_field(struct hid_field *, int);
-void hid_resolv_usage(unsigned);
-void hid_resolv_event(__u8, __u16);
+void hid_dump_device(struct hid_device *, struct seq_file *);
+void hid_dump_field(struct hid_field *, int, struct seq_file *);
+void hid_resolv_usage(unsigned, struct seq_file *);
+void hid_debug_register(struct hid_device *, const char *);
+void hid_debug_unregister(struct hid_device *);
+void hid_debug_init(void);
+void hid_debug_exit(void);
 
 #else
 
-#define hid_dump_input(a,b)     do { } while (0)
-#define hid_dump_device(c)      do { } while (0)
-#define hid_dump_field(a,b)     do { } while (0)
-#define hid_resolv_usage(a)         do { } while (0)
-#define hid_resolv_event(a,b)       do { } while (0)
-
-#endif /* CONFIG_HID_DEBUG */
+#define hid_dump_input(a,b)		do { } while (0)
+#define hid_dump_device(c)		do { } while (0)
+#define hid_dump_field(a,b)		do { } while (0)
+#define hid_resolv_usage(a)		do { } while (0)
+#define hid_resolv_event(a,b)		do { } while (0)
+#define hid_debug_register(a, b)	do { } while (0)
+#define hid_debug_unregister(a)		do { } while (0)
+#define hid_debug_init()		do { } while (0)
+#define hid_debug_exit()		do { } while (0)
 
+#endif
 
 #endif
 
diff --git a/include/linux/hid.h b/include/linux/hid.h
index a72876e43589..da09ab140ef1 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -451,6 +451,10 @@ struct hid_device {							/* device report descriptor */
 	char phys[64];							/* Device physical location */
 	char uniq[64];							/* Device unique identifier (serial #) */
 
+	/* debugfs */
+	struct dentry *debug_dir;
+	struct dentry *debug_rdesc;
+
 	void *driver_data;
 
 	/* temporary hid_ff handling (until moved to the drivers) */
-- 
cgit v1.2.3-71-gd317


From cd667ce24796700e1a0e6e7528efc61c96ff832e Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 12 Jun 2009 15:20:57 +0200
Subject: HID: use debugfs for events/reports dumping

This is a followup patch to the one implemeting rdesc representation in debugfs
rather than being dependent on compile-time CONFIG_HID_DEBUG setting.

The API of the appropriate formatting functions is slightly modified -- if
they are passed seq_file pointer, the one-shot output for 'rdesc' file mode
is used, and therefore the message is formatted into the corresponding seq_file
immediately.

Otherwise the called function allocated a new buffer, formats the text into the
buffer and returns the pointer to it, so that it can be queued into the ring-buffer
of the processess blocked waiting on input on 'events' file in debugfs.

'debug' parameter to the 'hid' module is now used solely for the prupose of inetrnal
driver state debugging (parser, transport, etc).

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c    |  42 ++++++--
 drivers/hid/hid-debug.c   | 239 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/hid-debug.h |  18 +++-
 include/linux/hid.h       |  26 ++---
 4 files changed, 276 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index d4317db85b54..449bd747d116 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -46,7 +46,7 @@
 
 int hid_debug = 0;
 module_param_named(debug, hid_debug, int, 0600);
-MODULE_PARM_DESC(debug, "HID debugging (0=off, 1=probing info, 2=continuous data dumping)");
+MODULE_PARM_DESC(debug, "toggle HID debugging messages");
 EXPORT_SYMBOL_GPL(hid_debug);
 
 /*
@@ -859,7 +859,7 @@ static void hid_process_event(struct hid_device *hid, struct hid_field *field,
 	struct hid_driver *hdrv = hid->driver;
 	int ret;
 
-	hid_dump_input(usage, value);
+	hid_dump_input(hid, usage, value);
 
 	if (hdrv && hdrv->event && hid_match_usage(hid, usage)) {
 		ret = hdrv->event(hid, field, usage, value);
@@ -981,7 +981,7 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value)
 {
 	unsigned size = field->report_size;
 
-	hid_dump_input(field->usage + offset, value);
+	hid_dump_input(field->report->device, field->usage + offset, value);
 
 	if (offset >= field->report_count) {
 		dbg_hid("offset (%d) exceeds report_count (%d)\n", offset, field->report_count);
@@ -1075,6 +1075,7 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i
 	struct hid_report_enum *report_enum = hid->report_enum + type;
 	struct hid_driver *hdrv = hid->driver;
 	struct hid_report *report;
+	char *buf;
 	unsigned int i;
 	int ret;
 
@@ -1086,18 +1087,36 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i
 		return -1;
 	}
 
-	dbg_hid("report (size %u) (%snumbered)\n", size, report_enum->numbered ? "" : "un");
+	buf = kmalloc(sizeof(char) * HID_DEBUG_BUFSIZE,
+			interrupt ? GFP_ATOMIC : GFP_KERNEL);
+
+	if (!buf) {
+		report = hid_get_report(report_enum, data);
+		goto nomem;
+	}
+
+	snprintf(buf, HID_DEBUG_BUFSIZE - 1,
+			"\nreport (size %u) (%snumbered)\n", size, report_enum->numbered ? "" : "un");
+	hid_debug_event(hid, buf);
 
 	report = hid_get_report(report_enum, data);
 	if (!report)
 		return -1;
 
 	/* dump the report */
-	dbg_hid("report %d (size %u) = ", report->id, size);
-	for (i = 0; i < size; i++)
-		dbg_hid_line(" %02x", data[i]);
-	dbg_hid_line("\n");
+	snprintf(buf, HID_DEBUG_BUFSIZE - 1,
+			"report %d (size %u) = ", report->id, size);
+	hid_debug_event(hid, buf);
+	for (i = 0; i < size; i++) {
+		snprintf(buf, HID_DEBUG_BUFSIZE - 1,
+				" %02x", data[i]);
+		hid_debug_event(hid, buf);
+	}
+	hid_debug_event(hid, "\n");
+
+	kfree(buf);
 
+nomem:
 	if (hdrv && hdrv->raw_event && hid_match_report(hid, report)) {
 		ret = hdrv->raw_event(hid, report, data, size);
 		if (ret != 0)
@@ -1756,6 +1775,9 @@ struct hid_device *hid_allocate_device(void)
 	for (i = 0; i < HID_REPORT_TYPES; i++)
 		INIT_LIST_HEAD(&hdev->report_enum[i].report_list);
 
+	init_waitqueue_head(&hdev->debug_wait);
+	INIT_LIST_HEAD(&hdev->debug_list);
+
 	return hdev;
 err:
 	put_device(&hdev->dev);
@@ -1844,8 +1866,8 @@ static int __init hid_init(void)
 	int ret;
 
 	if (hid_debug)
-		printk(KERN_WARNING "HID: hid_debug parameter has been deprecated. "
-				"Debugging data are now provided via debugfs\n");
+		printk(KERN_WARNING "HID: hid_debug is now used solely for parser and driver debugging.\n"
+				"HID: debugfs is now used for inspecting the device (report descriptor, reports)\n");
 
 	ret = bus_register(&hid_bus_type);
 	if (ret) {
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index 067e173aa3e4..a331a1821e85 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -28,6 +28,10 @@
 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+
 #include <linux/hid.h>
 #include <linux/hid-debug.h>
 
@@ -335,49 +339,86 @@ static const struct hid_usage_entry hid_usage_table[] = {
   { 0, 0, NULL }
 };
 
-static void resolv_usage_page(unsigned page, struct seq_file *f) {
+/* Either output directly into simple seq_file, or (if f == NULL)
+ * allocate a separate buffer that will then be passed to the 'events'
+ * ringbuffer.
+ *
+ * This is because these functions can be called both for "one-shot"
+ * "rdesc" while resolving, or for blocking "events".
+ *
+ * This holds both for resolv_usage_page() and hid_resolv_usage().
+ */
+static char *resolv_usage_page(unsigned page, struct seq_file *f) {
 	const struct hid_usage_entry *p;
+	char *buf = NULL;
+
+	if (!f) {
+		buf = kzalloc(sizeof(char) * HID_DEBUG_BUFSIZE, GFP_ATOMIC);
+		if (!buf)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	for (p = hid_usage_table; p->description; p++)
 		if (p->page == page) {
-			if (!f)
-				printk("%s", p->description);
-			else
+			if (!f) {
+				snprintf(buf, HID_DEBUG_BUFSIZE, "%s",
+						p->description);
+				return buf;
+			}
+			else {
 				seq_printf(f, "%s", p->description);
-			return;
+				return NULL;
+			}
 		}
 	if (!f)
-		printk("%04x", page);
+		snprintf(buf, HID_DEBUG_BUFSIZE, "%04x", page);
 	else
 		seq_printf(f, "%04x", page);
+	return buf;
 }
 
-void hid_resolv_usage(unsigned usage, struct seq_file *f) {
+char *hid_resolv_usage(unsigned usage, struct seq_file *f) {
 	const struct hid_usage_entry *p;
+	char *buf = NULL;
+	int len = 0;
 
-	resolv_usage_page(usage >> 16, f);
-	if (!f)
-		printk(".");
-	else
+	buf = resolv_usage_page(usage >> 16, f);
+	if (IS_ERR(buf)) {
+		printk(KERN_ERR "error allocating HID debug buffer\n");
+		return NULL;
+	}
+
+
+	if (!f) {
+		len = strlen(buf);
+		snprintf(buf+len, max(0, HID_DEBUG_BUFSIZE - len), ".");
+		len++;
+	}
+	else {
 		seq_printf(f, ".");
+	}
 	for (p = hid_usage_table; p->description; p++)
 		if (p->page == (usage >> 16)) {
 			for(++p; p->description && p->usage != 0; p++)
 				if (p->usage == (usage & 0xffff)) {
 					if (!f)
-						printk("%s", p->description);
+						snprintf(buf + len,
+							max(0,HID_DEBUG_BUFSIZE - len - 1),
+							"%s", p->description);
 					else
 						seq_printf(f,
 							"%s",
 							p->description);
-					return;
+					return buf;
 				}
 			break;
 		}
 	if (!f)
-		printk("%04x", usage & 0xffff);
+		snprintf(buf + len, max(0, HID_DEBUG_BUFSIZE - len - 1),
+				"%04x", usage & 0xffff);
 	else
 		seq_printf(f, "%04x", usage & 0xffff);
+	return buf;
 }
 EXPORT_SYMBOL_GPL(hid_resolv_usage);
 
@@ -508,13 +549,37 @@ void hid_dump_device(struct hid_device *device, struct seq_file *f)
 }
 EXPORT_SYMBOL_GPL(hid_dump_device);
 
-void hid_dump_input(struct hid_usage *usage, __s32 value) {
-	if (hid_debug < 2)
+/* enqueue string to 'events' ring buffer */
+void hid_debug_event(struct hid_device *hdev, char *buf)
+{
+	int i;
+	struct hid_debug_list *list;
+
+	list_for_each_entry(list, &hdev->debug_list, node) {
+		for (i = 0; i <= strlen(buf); i++)
+			list->hid_debug_buf[(list->tail + i) % (HID_DEBUG_BUFSIZE - 1)] =
+				buf[i];
+		list->tail = (list->tail + i) % (HID_DEBUG_BUFSIZE - 1);
+        }
+}
+EXPORT_SYMBOL_GPL(hid_debug_event);
+
+void hid_dump_input(struct hid_device *hdev, struct hid_usage *usage, __s32 value)
+{
+	char *buf;
+	int len;
+
+	buf = hid_resolv_usage(usage->hid, NULL);
+	if (!buf)
 		return;
+	len = strlen(buf);
+	snprintf(buf + len, HID_DEBUG_BUFSIZE - len - 1, " = %d\n", value);
+
+	hid_debug_event(hdev, buf);
+
+	kfree(buf);
+        wake_up_interruptible(&hdev->debug_wait);
 
-	printk(KERN_DEBUG "hid-debug: input ");
-	hid_resolv_usage(usage->hid, NULL);
-	printk(" = %d\n", value);
 }
 EXPORT_SYMBOL_GPL(hid_dump_input);
 
@@ -808,6 +873,7 @@ void hid_dump_input_mapping(struct hid_device *hid, struct seq_file *f)
 
 }
 
+
 static int hid_debug_rdesc_show(struct seq_file *f, void *p)
 {
 	struct hid_device *hdev = f->private;
@@ -831,6 +897,126 @@ static int hid_debug_rdesc_open(struct inode *inode, struct file *file)
 	return single_open(file, hid_debug_rdesc_show, inode->i_private);
 }
 
+static int hid_debug_events_open(struct inode *inode, struct file *file)
+{
+	int err = 0;
+	struct hid_debug_list *list;
+
+	if (!(list = kzalloc(sizeof(struct hid_debug_list), GFP_KERNEL))) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (!(list->hid_debug_buf = kzalloc(sizeof(char) * HID_DEBUG_BUFSIZE, GFP_KERNEL))) {
+		err = -ENOMEM;
+		goto out;
+	}
+	list->hdev = (struct hid_device *) inode->i_private;
+	file->private_data = list;
+	mutex_init(&list->read_mutex);
+
+	list_add_tail(&list->node, &list->hdev->debug_list);
+
+out:
+	return err;
+}
+
+static ssize_t hid_debug_events_read(struct file *file, char __user *buffer,
+		size_t count, loff_t *ppos)
+{
+	struct hid_debug_list *list = file->private_data;
+	int ret = 0, len;
+	DECLARE_WAITQUEUE(wait, current);
+
+	while (ret == 0) {
+		mutex_lock(&list->read_mutex);
+		if (list->head == list->tail) {
+			add_wait_queue(&list->hdev->debug_wait, &wait);
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			while (list->head == list->tail) {
+				if (file->f_flags & O_NONBLOCK) {
+					ret = -EAGAIN;
+					break;
+				}
+				if (signal_pending(current)) {
+					ret = -ERESTARTSYS;
+					break;
+				}
+
+				if (!list->hdev || !list->hdev->debug) {
+					ret = -EIO;
+					break;
+				}
+
+				/* allow O_NONBLOCK from other threads */
+				mutex_unlock(&list->read_mutex);
+				schedule();
+				mutex_lock(&list->read_mutex);
+				set_current_state(TASK_INTERRUPTIBLE);
+			}
+
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&list->hdev->debug_wait, &wait);
+		}
+
+		if (ret)
+			goto out;
+
+		/* pass the ringbuffer contents to userspace */
+copy_rest:
+		if (list->tail == list->head)
+			goto out;
+		if (list->tail > list->head) {
+			len = list->tail - list->head;
+
+			if (copy_to_user(buffer + ret, &list->hid_debug_buf[list->head], len)) {
+				ret = -EFAULT;
+				goto out;
+			}
+			ret += len;
+			list->head += len;
+		} else {
+			len = HID_DEBUG_BUFSIZE - list->head;
+
+			if (copy_to_user(buffer, &list->hid_debug_buf[list->head], len)) {
+				ret = -EFAULT;
+				goto out;
+			}
+			list->head = 0;
+			ret += len;
+			goto copy_rest;
+		}
+
+	}
+out:
+	mutex_unlock(&list->read_mutex);
+	return ret;
+}
+
+static unsigned int hid_debug_events_poll(struct file *file, poll_table *wait)
+{
+	struct hid_debug_list *list = file->private_data;
+
+	poll_wait(file, &list->hdev->debug_wait, wait);
+	if (list->head != list->tail)
+		return POLLIN | POLLRDNORM;
+	if (!list->hdev->debug)
+		return POLLERR | POLLHUP;
+	return 0;
+}
+
+static int hid_debug_events_release(struct inode *inode, struct file *file)
+{
+	struct hid_debug_list *list = file->private_data;
+
+	list_del(&list->node);
+	kfree(list->hid_debug_buf);
+	kfree(list);
+
+	return 0;
+}
+
 static const struct file_operations hid_debug_rdesc_fops = {
 	.open           = hid_debug_rdesc_open,
 	.read           = seq_read,
@@ -838,16 +1024,31 @@ static const struct file_operations hid_debug_rdesc_fops = {
 	.release        = single_release,
 };
 
+static const struct file_operations hid_debug_events_fops = {
+	.owner =        THIS_MODULE,
+	.open           = hid_debug_events_open,
+	.read           = hid_debug_events_read,
+	.poll		= hid_debug_events_poll,
+	.release        = hid_debug_events_release,
+};
+
+
 void hid_debug_register(struct hid_device *hdev, const char *name)
 {
 	hdev->debug_dir = debugfs_create_dir(name, hid_debug_root);
 	hdev->debug_rdesc = debugfs_create_file("rdesc", 0400,
 			hdev->debug_dir, hdev, &hid_debug_rdesc_fops);
+	hdev->debug_events = debugfs_create_file("events", 0400,
+			hdev->debug_dir, hdev, &hid_debug_events_fops);
+	hdev->debug = 1;
 }
 
 void hid_debug_unregister(struct hid_device *hdev)
 {
+	hdev->debug = 0;
+	wake_up_interruptible(&hdev->debug_wait);
 	debugfs_remove(hdev->debug_rdesc);
+	debugfs_remove(hdev->debug_events);
 	debugfs_remove(hdev->debug_dir);
 }
 
diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h
index 516e12c33235..ec08ac1ad687 100644
--- a/include/linux/hid-debug.h
+++ b/include/linux/hid-debug.h
@@ -24,14 +24,27 @@
 
 #ifdef CONFIG_DEBUG_FS
 
-void hid_dump_input(struct hid_usage *, __s32);
+void hid_dump_input(struct hid_device *, struct hid_usage *, __s32);
 void hid_dump_device(struct hid_device *, struct seq_file *);
 void hid_dump_field(struct hid_field *, int, struct seq_file *);
-void hid_resolv_usage(unsigned, struct seq_file *);
+char *hid_resolv_usage(unsigned, struct seq_file *);
 void hid_debug_register(struct hid_device *, const char *);
 void hid_debug_unregister(struct hid_device *);
 void hid_debug_init(void);
 void hid_debug_exit(void);
+void hid_debug_event(struct hid_device *, char *);
+
+#define HID_DEBUG_BUFSIZE 512
+
+struct hid_debug_list {
+	char *hid_debug_buf;
+	int head;
+	int tail;
+	struct fasync_struct *fasync;
+	struct hid_device *hdev;
+	struct list_head node;
+	struct mutex read_mutex;
+};
 
 #else
 
@@ -44,6 +57,7 @@ void hid_debug_exit(void);
 #define hid_debug_unregister(a)		do { } while (0)
 #define hid_debug_init()		do { } while (0)
 #define hid_debug_exit()		do { } while (0)
+#define hid_debug_event(a,b)		do { } while (0)
 
 #endif
 
diff --git a/include/linux/hid.h b/include/linux/hid.h
index da09ab140ef1..60fa52913f89 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -451,10 +451,6 @@ struct hid_device {							/* device report descriptor */
 	char phys[64];							/* Device physical location */
 	char uniq[64];							/* Device unique identifier (serial #) */
 
-	/* debugfs */
-	struct dentry *debug_dir;
-	struct dentry *debug_rdesc;
-
 	void *driver_data;
 
 	/* temporary hid_ff handling (until moved to the drivers) */
@@ -468,6 +464,14 @@ struct hid_device {							/* device report descriptor */
 
 	/* handler for raw output data, used by hidraw */
 	int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t);
+
+	/* debugging support via debugfs */
+	unsigned short debug;
+	struct dentry *debug_dir;
+	struct dentry *debug_rdesc;
+	struct dentry *debug_events;
+	struct list_head debug_list;
+	wait_queue_head_t debug_wait;
 };
 
 static inline void *hid_get_drvdata(struct hid_device *hdev)
@@ -625,9 +629,7 @@ struct hid_ll_driver {
 
 /* HID core API */
 
-#ifdef CONFIG_HID_DEBUG
 extern int hid_debug;
-#endif
 
 extern int hid_add_device(struct hid_device *);
 extern void hid_destroy_device(struct hid_device *);
@@ -783,21 +785,9 @@ int hid_pidff_init(struct hid_device *hid);
 #define hid_pidff_init NULL
 #endif
 
-#ifdef CONFIG_HID_DEBUG
 #define dbg_hid(format, arg...) if (hid_debug) \
 				printk(KERN_DEBUG "%s: " format ,\
 				__FILE__ , ## arg)
-#define dbg_hid_line(format, arg...) if (hid_debug) \
-				printk(format, ## arg)
-#else
-static inline int __attribute__((format(printf, 1, 2)))
-dbg_hid(const char *fmt, ...)
-{
-	return 0;
-}
-#define dbg_hid_line dbg_hid
-#endif /* HID_DEBUG */
-
 #define err_hid(format, arg...) printk(KERN_ERR "%s: " format "\n" , \
 		__FILE__ , ## arg)
 #endif /* HID_FF */
-- 
cgit v1.2.3-71-gd317


From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 30 Mar 2009 19:07:44 +0900
Subject: percpu: use dynamic percpu allocator as the default percpu allocator

This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use
dynamic percpu allocator.  The first chunk is allocated using
embedding helper and 8k is reserved for modules.  This ensures that
the new allocator behaves almost identically to the original allocator
as long as static percpu variables are concerned, so it shouldn't
introduce much breakage.

s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing
range limit the addressing model imposes.  Unfortunately, this breaks
if the address is specified using a variable, so for now, the two
archs aren't converted.

The following architectures are affected by this change.

* sh
* arm
* cris
* mips
* sparc(32)
* blackfin
* avr32
* parisc (broken, under investigation)
* m32r
* powerpc(32)

As this change makes the dynamic allocator the default one,
CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert -
CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted
archs.  These archs implement their own setup_per_cpu_areas() and the
conversion is not trivial.

* powerpc(64)
* sparc(64)
* ia64
* alpha
* s390

Boot and batch alloc/free tests on x86_32 with debug code (x86_32
doesn't use default first chunk initialization).  Compile tested on
sparc(32), powerpc(32), arm and alpha.

Kyle McMartin reported that this change breaks parisc.  The problem is
still under investigation and he is okay with pushing this patch
forward and fixing parisc later.

[ Impact: use dynamic allocator for most archs w/o custom percpu setup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Bryan Wu <cooloney@kernel.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/Kconfig     |  3 +++
 arch/ia64/Kconfig      |  3 +++
 arch/powerpc/Kconfig   |  3 +++
 arch/s390/Kconfig      |  3 +++
 arch/sparc/Kconfig     |  3 +++
 arch/x86/Kconfig       |  3 ---
 include/linux/percpu.h | 12 +++++++++---
 init/main.c            | 24 ------------------------
 kernel/module.c        |  6 +++---
 mm/Makefile            |  2 +-
 mm/allocpercpu.c       | 28 ++++++++++++++++++++++++++++
 mm/percpu.c            | 40 +++++++++++++++++++++++++++++++++++++++-
 12 files changed, 95 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9fb8aae5c391..05d86407188c 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY
 	depends on SMP
 	default y
 
+config HAVE_LEGACY_PER_CPU_AREA
+	def_bool y
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 170042b420d4..328d2f8b8c3f 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL
 	bool
 	default y
 
+config HAVE_LEGACY_PER_CPU_AREA
+	def_bool y
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bf6cedfa05db..a774c2acbe69 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
 	bool
 	default y
 
+config HAVE_LEGACY_PER_CPU_AREA
+	def_bool PPC64
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool PPC64
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a14dba0e4d67..f4a3cc62d28f 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	def_bool y
 
+config HAVE_LEGACY_PER_CPU_AREA
+	def_bool y
+
 mainmenu "Linux Kernel Configuration"
 
 config S390
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eabd..7a8698b913fe 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -92,6 +92,9 @@ config AUDIT_ARCH
 	bool
 	default y
 
+config HAVE_LEGACY_PER_CPU_AREA
+	def_bool y if SPARC64
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y if SPARC64
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d1430ef6b4f9..a48a90076d83 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
-config HAVE_DYNAMIC_PER_CPU_AREA
-	def_bool y
-
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 26fd9d12f050..e5000343dd61 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -34,7 +34,7 @@
 
 #ifdef CONFIG_SMP
 
-#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
 
 /* minimum unit size, also is the maximum supported allocation size */
 #define PCPU_MIN_UNIT_SIZE		PFN_ALIGN(64 << 10)
@@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk(
 
 extern void *__alloc_reserved_percpu(size_t size, size_t align);
 
-#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
 
 struct percpu_data {
 	void *ptrs[1];
@@ -99,11 +99,15 @@ struct percpu_data {
         (__typeof__(ptr))__p->ptrs[(cpu)];				\
 })
 
-#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
 
 extern void *__alloc_percpu(size_t size, size_t align);
 extern void free_percpu(void *__pdata);
 
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+extern void __init setup_per_cpu_areas(void);
+#endif
+
 #else /* CONFIG_SMP */
 
 #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
@@ -124,6 +128,8 @@ static inline void free_percpu(void *p)
 	kfree(p);
 }
 
+static inline void __init setup_per_cpu_areas(void) { }
+
 #endif /* CONFIG_SMP */
 
 #define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type), \
diff --git a/init/main.c b/init/main.c
index 09131ec090c1..602d724afa5c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -357,7 +357,6 @@ static void __init smp_init(void)
 #define smp_init()	do { } while (0)
 #endif
 
-static inline void setup_per_cpu_areas(void) { }
 static inline void setup_nr_cpu_ids(void) { }
 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 
@@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void)
 	nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
 }
 
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-
-EXPORT_SYMBOL(__per_cpu_offset);
-
-static void __init setup_per_cpu_areas(void)
-{
-	unsigned long size, i;
-	char *ptr;
-	unsigned long nr_possible_cpus = num_possible_cpus();
-
-	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
-	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
-
-	for_each_possible_cpu(i) {
-		__per_cpu_offset[i] = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-		ptr += size;
-	}
-}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
-
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
 {
diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2b..f5934954fa99 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module);
 
 #ifdef CONFIG_SMP
 
-#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
 
 static void *percpu_modalloc(unsigned long size, unsigned long align,
 			     const char *name)
@@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme)
 	free_percpu(freeme);
 }
 
-#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
 
 /* Number of blocks used and allocated. */
 static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +535,7 @@ static int percpu_modinit(void)
 }
 __initcall(percpu_modinit);
 
-#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
 
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 				 Elf_Shdr *sechdrs,
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..c77c6487552f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
 obj-$(CONFIG_SMP) += percpu.o
 else
 obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a47359..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
  */
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/bootmem.h>
+#include <asm/sections.h>
 
 #ifndef cache_line_size
 #define cache_line_size()	L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
 	kfree(__percpu_disguise(__pdata));
 }
 EXPORT_SYMBOL_GPL(free_percpu);
+
+/*
+ * Generic percpu area setup.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+	unsigned long size, i;
+	char *ptr;
+	unsigned long nr_possible_cpus = num_possible_cpus();
+
+	/* Copy section for each CPU (we discard the original) */
+	size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
+
+	for_each_possible_cpu(i) {
+		__per_cpu_offset[i] = ptr - __per_cpu_start;
+		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		ptr += size;
+	}
+}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/percpu.c b/mm/percpu.c
index b70f2acd8853..b14984566f5a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -43,7 +43,7 @@
  *
  * To use this allocator, arch code should do the followings.
  *
- * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
  *
  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  *   regular address to percpu pointer and back if they need to be
@@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 				      reserved_size, dyn_size,
 				      pcpue_unit_size, pcpue_ptr, NULL);
 }
+
+/*
+ * Generic percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup.  This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location.  As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because percpu area can piggy back
+ * on the physical linear memory mapping which uses large page
+ * mappings on applicable archs.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+	size_t static_size = __per_cpu_end - __per_cpu_start;
+	ssize_t unit_size;
+	unsigned long delta;
+	unsigned int cpu;
+
+	/*
+	 * Always reserve area for module percpu variables.  That's
+	 * what the legacy allocator did.
+	 */
+	unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
+					   PERCPU_DYNAMIC_RESERVE, -1);
+	if (unit_size < 0)
+		panic("Failed to initialized percpu areas.");
+
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu)
+		__per_cpu_offset[cpu] = delta + cpu * unit_size;
+}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
-- 
cgit v1.2.3-71-gd317


From 7c756e6e19e71f0327760d8955f7077118ebb2b1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Jun 2009 15:13:50 +0900
Subject: percpu: implement optional weak percpu definitions

Some archs (alpha and s390) need to use weak definitions for percpu
variables in modules so that the compiler generates external
references for them.

This patch implements weak percpu definitions which arch can enable by
defining ARCH_NEEDS_WEAK_PER_CPU in arch percpu header file.  This
weak definition adds the following two restrictions on percpu variable
definitions.

  1. percpu symbols must be unique whether static or not
  2. percpu variables can't be defined inside a function

To ensure that these restrictions are observed in generic code, config
option DEBUG_FORCE_WEAK_PER_CPU enables weak percpu definitions for
all cases.

This patch is inspired by Ivan Kokshaysky's alpha percpu patch.

[ Impact: stricter rules for percpu variables, one more debug config option ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Howells <dhowells@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
---
 include/linux/percpu-defs.h | 65 ++++++++++++++++++++++++++++++++++++++-------
 lib/Kconfig.debug           | 15 +++++++++++
 2 files changed, 71 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f921d74f49f..cf32838ad0fa 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -10,21 +10,68 @@
 /*
  * Base implementations of per-CPU variable declarations and definitions, where
  * the section in which the variable is to be placed is provided by the
- * 'section' argument.  This may be used to affect the parameters governing the
+ * 'sec' argument.  This may be used to affect the parameters governing the
  * variable's storage.
  *
  * NOTE!  The sections for the DECLARE and for the DEFINE must match, lest
  * linkage errors occur due the compiler generating the wrong code to access
  * that section.
  */
-#define DECLARE_PER_CPU_SECTION(type, name, section)			\
-	extern								\
-	__attribute__((__section__(PER_CPU_BASE_SECTION section)))	\
-	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SECTION(type, name, section)			\
-	__attribute__((__section__(PER_CPU_BASE_SECTION section)))	\
-	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+#define __PCPU_ATTRS(sec)						\
+	__attribute__((section(PER_CPU_BASE_SECTION sec)))		\
+	PER_CPU_ATTRIBUTES
+
+#define __PCPU_DUMMY_ATTRS						\
+	__attribute__((section(".discard"), unused))
+
+/*
+ * s390 and alpha modules require percpu variables to be defined as
+ * weak to force the compiler to generate GOT based external
+ * references for them.  This is necessary because percpu sections
+ * will be located outside of the usually addressable area.
+ *
+ * This definition puts the following two extra restrictions when
+ * defining percpu variables.
+ *
+ * 1. The symbol must be globally unique, even the static ones.
+ * 2. Static percpu variables cannot be defined inside a function.
+ *
+ * Archs which need weak percpu definitions should define
+ * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary.
+ *
+ * To ensure that the generic code observes the above two
+ * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak
+ * definition is used for all cases.
+ */
+#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
+/*
+ * __pcpu_scope_* dummy variable is used to enforce scope.  It
+ * receives the static modifier when it's used in front of
+ * DEFINE_PER_CPU() and will trigger build failure if
+ * DECLARE_PER_CPU() is used for the same variable.
+ *
+ * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness
+ * such that hidden weak symbol collision, which will cause unrelated
+ * variables to share the same address, can be detected during build.
+ */
+#define DECLARE_PER_CPU_SECTION(type, name, sec)			\
+	extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name;		\
+	extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name
+
+#define DEFINE_PER_CPU_SECTION(type, name, sec)				\
+	__PCPU_DUMMY_ATTRS char __pcpu_scope_##name;			\
+	__PCPU_DUMMY_ATTRS char __pcpu_unique_##name;			\
+	__PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name
+#else
+/*
+ * Normal declaration and definition macros.
+ */
+#define DECLARE_PER_CPU_SECTION(type, name, sec)			\
+	extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
+
+#define DEFINE_PER_CPU_SECTION(type, name, sec)				\
+	__PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
+#endif
 
 /*
  * Variant on the per-CPU variable declaration/definition theme used for
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 23067ab1a73c..77e0d8b1b7c5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -777,6 +777,21 @@ config DEBUG_BLOCK_EXT_DEVT
 
 	  Say N if you are unsure.
 
+config DEBUG_FORCE_WEAK_PER_CPU
+	bool "Force weak per-cpu definitions"
+	depends on DEBUG_KERNEL
+	help
+	  s390 and alpha require percpu variables in modules to be
+	  defined weak to work around addressing range issue which
+	  puts the following two restrictions on percpu variable
+	  definitions.
+
+	  1. percpu symbols must be unique whether static or not
+	  2. percpu variables can't be defined inside a function
+
+	  To ensure that generic code follows the above rules, this
+	  option forces all percpu variables to be defined as weak.
+
 config LKDTM
 	tristate "Linux Kernel Dump Test Tool Module"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3-71-gd317


From 38b7f49a0654cb52cac61c6455807248eee3059d Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 26 Jun 2009 10:48:34 +0200
Subject: HID: fix debugfs build with !CONFIG_DEBUG_FS

Fix the debug function prototypes to be correct even in the
!CONFIG_DEBUG_FS case.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid-debug.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h
index ec08ac1ad687..53744fa1c8b7 100644
--- a/include/linux/hid-debug.h
+++ b/include/linux/hid-debug.h
@@ -22,6 +22,8 @@
  *
  */
 
+#define HID_DEBUG_BUFSIZE 512
+
 #ifdef CONFIG_DEBUG_FS
 
 void hid_dump_input(struct hid_device *, struct hid_usage *, __s32);
@@ -34,7 +36,6 @@ void hid_debug_init(void);
 void hid_debug_exit(void);
 void hid_debug_event(struct hid_device *, char *);
 
-#define HID_DEBUG_BUFSIZE 512
 
 struct hid_debug_list {
 	char *hid_debug_buf;
@@ -48,11 +49,10 @@ struct hid_debug_list {
 
 #else
 
-#define hid_dump_input(a,b)		do { } while (0)
-#define hid_dump_device(c)		do { } while (0)
-#define hid_dump_field(a,b)		do { } while (0)
-#define hid_resolv_usage(a)		do { } while (0)
-#define hid_resolv_event(a,b)		do { } while (0)
+#define hid_dump_input(a,b,c)		do { } while (0)
+#define hid_dump_device(a,b)		do { } while (0)
+#define hid_dump_field(a,b,c)		do { } while (0)
+#define hid_resolv_usage(a,b)		do { } while (0)
 #define hid_debug_register(a, b)	do { } while (0)
 #define hid_debug_unregister(a)		do { } while (0)
 #define hid_debug_init()		do { } while (0)
-- 
cgit v1.2.3-71-gd317


From 1a8dd307cc0a2119be4e578c517795464e6dabba Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 29 Jun 2009 17:45:39 +0900
Subject: percpu: use __weak only in the definition of weak percpu variables

__weak is necessary only for definition and might even not work in
declaration.  Drop it from declaration.

This change was suggested by Ivan Kokshaysky.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
---
 include/linux/percpu-defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index cf32838ad0fa..9b7a53cc16eb 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -56,7 +56,7 @@
  */
 #define DECLARE_PER_CPU_SECTION(type, name, sec)			\
 	extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name;		\
-	extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name
+	extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
 
 #define DEFINE_PER_CPU_SECTION(type, name, sec)				\
 	__PCPU_DUMMY_ATTRS char __pcpu_scope_##name;			\
-- 
cgit v1.2.3-71-gd317


From b294a290d24d1196d68399cc3a9b8c50bfb55abd Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@collabora.co.uk>
Date: Tue, 30 Jun 2009 02:13:01 -0400
Subject: Revert "power: remove POWER_SUPPLY_PROP_CAPACITY_LEVEL"

This reverts commit 8efe444038a205e79b38b7ad03878824901849a8 and
4cbc76eadf56399cd11fb736b33c53aec9caab8c.

Richard@laptop.org was apparently using CAPACITY_LEVEL for debugging
battery/EC problems, and was upset that it was removed.  This readds it.

Conflicts:

	Documentation/power_supply_class.txt

Signed-off-by: Andres Salomon <dilinger@collabora.co.uk>
Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
---
 Documentation/power/power_supply_class.txt |  2 ++
 drivers/power/olpc_battery.c               |  9 +++++++++
 drivers/power/power_supply_sysfs.c         |  6 ++++++
 include/linux/power_supply.h               | 10 ++++++++++
 4 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
index c6cd4956047c..709d95571d7b 100644
--- a/Documentation/power/power_supply_class.txt
+++ b/Documentation/power/power_supply_class.txt
@@ -108,6 +108,8 @@ relative, time-based measurements.
 ENERGY_FULL, ENERGY_EMPTY - same as above but for energy.
 
 CAPACITY - capacity in percents.
+CAPACITY_LEVEL - capacity level. This corresponds to
+POWER_SUPPLY_CAPACITY_LEVEL_*.
 
 TEMP - temperature of the power supply.
 TEMP_AMBIENT - ambient temperature.
diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c
index 58e419299cd6..3a589df09376 100644
--- a/drivers/power/olpc_battery.c
+++ b/drivers/power/olpc_battery.c
@@ -276,6 +276,14 @@ static int olpc_bat_get_property(struct power_supply *psy,
 			return ret;
 		val->intval = ec_byte;
 		break;
+	case POWER_SUPPLY_PROP_CAPACITY_LEVEL:
+		if (ec_byte & BAT_STAT_FULL)
+			val->intval = POWER_SUPPLY_CAPACITY_LEVEL_FULL;
+		else if (ec_byte & BAT_STAT_LOW)
+			val->intval = POWER_SUPPLY_CAPACITY_LEVEL_LOW;
+		else
+			val->intval = POWER_SUPPLY_CAPACITY_LEVEL_NORMAL;
+		break;
 	case POWER_SUPPLY_PROP_TEMP:
 		ret = olpc_ec_cmd(EC_BAT_TEMP, NULL, 0, (void *)&ec_word, 2);
 		if (ret)
@@ -321,6 +329,7 @@ static enum power_supply_property olpc_bat_props[] = {
 	POWER_SUPPLY_PROP_VOLTAGE_AVG,
 	POWER_SUPPLY_PROP_CURRENT_AVG,
 	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
 	POWER_SUPPLY_PROP_TEMP,
 	POWER_SUPPLY_PROP_TEMP_AMBIENT,
 	POWER_SUPPLY_PROP_MANUFACTURER,
diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index da73591017f9..9deabbde6fd6 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -51,6 +51,9 @@ static ssize_t power_supply_show_property(struct device *dev,
 		"Unknown", "NiMH", "Li-ion", "Li-poly", "LiFe", "NiCd",
 		"LiMn"
 	};
+	static char *capacity_level_text[] = {
+		"Unknown", "Critical", "Low", "Normal", "High", "Full"
+	};
 	ssize_t ret;
 	struct power_supply *psy = dev_get_drvdata(dev);
 	const ptrdiff_t off = attr - power_supply_attrs;
@@ -71,6 +74,8 @@ static ssize_t power_supply_show_property(struct device *dev,
 		return sprintf(buf, "%s\n", health_text[value.intval]);
 	else if (off == POWER_SUPPLY_PROP_TECHNOLOGY)
 		return sprintf(buf, "%s\n", technology_text[value.intval]);
+	else if (off == POWER_SUPPLY_PROP_CAPACITY_LEVEL)
+		return sprintf(buf, "%s\n", capacity_level_text[value.intval]);
 	else if (off >= POWER_SUPPLY_PROP_MODEL_NAME)
 		return sprintf(buf, "%s\n", value.strval);
 
@@ -109,6 +114,7 @@ static struct device_attribute power_supply_attrs[] = {
 	POWER_SUPPLY_ATTR(energy_now),
 	POWER_SUPPLY_ATTR(energy_avg),
 	POWER_SUPPLY_ATTR(capacity),
+	POWER_SUPPLY_ATTR(capacity_level),
 	POWER_SUPPLY_ATTR(temp),
 	POWER_SUPPLY_ATTR(temp_ambient),
 	POWER_SUPPLY_ATTR(time_to_empty_now),
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 594c494ac3f0..0ab6aa171241 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -58,6 +58,15 @@ enum {
 	POWER_SUPPLY_TECHNOLOGY_LiMn,
 };
 
+enum {
+	POWER_SUPPLY_CAPACITY_LEVEL_UNKNOWN = 0,
+	POWER_SUPPLY_CAPACITY_LEVEL_CRITICAL,
+	POWER_SUPPLY_CAPACITY_LEVEL_LOW,
+	POWER_SUPPLY_CAPACITY_LEVEL_NORMAL,
+	POWER_SUPPLY_CAPACITY_LEVEL_HIGH,
+	POWER_SUPPLY_CAPACITY_LEVEL_FULL,
+};
+
 enum power_supply_property {
 	/* Properties of type `int' */
 	POWER_SUPPLY_PROP_STATUS = 0,
@@ -89,6 +98,7 @@ enum power_supply_property {
 	POWER_SUPPLY_PROP_ENERGY_NOW,
 	POWER_SUPPLY_PROP_ENERGY_AVG,
 	POWER_SUPPLY_PROP_CAPACITY, /* in percents! */
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
 	POWER_SUPPLY_PROP_TEMP,
 	POWER_SUPPLY_PROP_TEMP_AMBIENT,
 	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
-- 
cgit v1.2.3-71-gd317


From ee8076ed3e1cdd0cd1e61318386932669c90b92f Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@collabora.co.uk>
Date: Thu, 2 Jul 2009 09:45:18 -0400
Subject: power_supply: Add a charge_type property, and use it for olpc driver

This adds a new sysfs file called 'charge_type' which displays the
type of charging (unknown, n/a, trickle charge, or fast charging).

This allows things like battery diagnostics to determine what the
battery/EC is doing without resorting to changing the 'status' sysfs
output.

Signed-off-by: Andres Salomon <dilinger@collabora.co.uk>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
---
 Documentation/power/power_supply_class.txt | 5 +++++
 drivers/power/olpc_battery.c               | 9 +++++++++
 drivers/power/power_supply_sysfs.c         | 6 ++++++
 include/linux/power_supply.h               | 8 ++++++++
 4 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
index 709d95571d7b..9f16c5178b66 100644
--- a/Documentation/power/power_supply_class.txt
+++ b/Documentation/power/power_supply_class.txt
@@ -76,6 +76,11 @@ STATUS - this attribute represents operating status (charging, full,
 discharging (i.e. powering a load), etc.). This corresponds to
 BATTERY_STATUS_* values, as defined in battery.h.
 
+CHARGE_TYPE - batteries can typically charge at different rates.
+This defines trickle and fast charges.  For batteries that
+are already charged or discharging, 'n/a' can be displayed (or
+'unknown', if the status is not known).
+
 HEALTH - represents health of the battery, values corresponds to
 POWER_SUPPLY_HEALTH_*, defined in battery.h.
 
diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c
index 602bbd008f78..8fefe5a73558 100644
--- a/drivers/power/olpc_battery.c
+++ b/drivers/power/olpc_battery.c
@@ -233,6 +233,14 @@ static int olpc_bat_get_property(struct power_supply *psy,
 		if (ret)
 			return ret;
 		break;
+	case POWER_SUPPLY_PROP_CHARGE_TYPE:
+		if (ec_byte & BAT_STAT_TRICKLE)
+			val->intval = POWER_SUPPLY_CHARGE_TYPE_TRICKLE;
+		else if (ec_byte & BAT_STAT_CHARGING)
+			val->intval = POWER_SUPPLY_CHARGE_TYPE_FAST;
+		else
+			val->intval = POWER_SUPPLY_CHARGE_TYPE_NONE;
+		break;
 	case POWER_SUPPLY_PROP_PRESENT:
 		val->intval = !!(ec_byte & (BAT_STAT_PRESENT |
 					    BAT_STAT_TRICKLE));
@@ -325,6 +333,7 @@ static int olpc_bat_get_property(struct power_supply *psy,
 
 static enum power_supply_property olpc_bat_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_CHARGE_TYPE,
 	POWER_SUPPLY_PROP_PRESENT,
 	POWER_SUPPLY_PROP_HEALTH,
 	POWER_SUPPLY_PROP_TECHNOLOGY,
diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index 9deabbde6fd6..08144393d64b 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -43,6 +43,9 @@ static ssize_t power_supply_show_property(struct device *dev,
 	static char *status_text[] = {
 		"Unknown", "Charging", "Discharging", "Not charging", "Full"
 	};
+	static char *charge_type[] = {
+		"Unknown", "N/A", "Trickle", "Fast"
+	};
 	static char *health_text[] = {
 		"Unknown", "Good", "Overheat", "Dead", "Over voltage",
 		"Unspecified failure", "Cold",
@@ -70,6 +73,8 @@ static ssize_t power_supply_show_property(struct device *dev,
 
 	if (off == POWER_SUPPLY_PROP_STATUS)
 		return sprintf(buf, "%s\n", status_text[value.intval]);
+	else if (off == POWER_SUPPLY_PROP_CHARGE_TYPE)
+		return sprintf(buf, "%s\n", charge_type[value.intval]);
 	else if (off == POWER_SUPPLY_PROP_HEALTH)
 		return sprintf(buf, "%s\n", health_text[value.intval]);
 	else if (off == POWER_SUPPLY_PROP_TECHNOLOGY)
@@ -86,6 +91,7 @@ static ssize_t power_supply_show_property(struct device *dev,
 static struct device_attribute power_supply_attrs[] = {
 	/* Properties of type `int' */
 	POWER_SUPPLY_ATTR(status),
+	POWER_SUPPLY_ATTR(charge_type),
 	POWER_SUPPLY_ATTR(health),
 	POWER_SUPPLY_ATTR(present),
 	POWER_SUPPLY_ATTR(online),
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 0ab6aa171241..4c7c6fc35487 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -38,6 +38,13 @@ enum {
 	POWER_SUPPLY_STATUS_FULL,
 };
 
+enum {
+	POWER_SUPPLY_CHARGE_TYPE_UNKNOWN = 0,
+	POWER_SUPPLY_CHARGE_TYPE_NONE,
+	POWER_SUPPLY_CHARGE_TYPE_TRICKLE,
+	POWER_SUPPLY_CHARGE_TYPE_FAST,
+};
+
 enum {
 	POWER_SUPPLY_HEALTH_UNKNOWN = 0,
 	POWER_SUPPLY_HEALTH_GOOD,
@@ -70,6 +77,7 @@ enum {
 enum power_supply_property {
 	/* Properties of type `int' */
 	POWER_SUPPLY_PROP_STATUS = 0,
+	POWER_SUPPLY_PROP_CHARGE_TYPE,
 	POWER_SUPPLY_PROP_HEALTH,
 	POWER_SUPPLY_PROP_PRESENT,
 	POWER_SUPPLY_PROP_ONLINE,
-- 
cgit v1.2.3-71-gd317


From 788e5abc5441e9046dd91c995c6f1f75bbd144bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:10:58 +0900
Subject: percpu: drop @unit_size from embed first chunk allocator

The only extra feature @unit_size provides is making dead space at the
end of the first chunk which doesn't have any valid usecase.  Drop the
parameter.  This will increase consistency with generalized 4k
allocator.

James Bottomley spotted missing conversion for the default
setup_per_cpu_areas() which caused build breakage on all arcsh which
use it.

[ Impact: drop unused code path ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c |  2 +-
 include/linux/percpu.h         |  2 +-
 mm/percpu.c                    | 18 ++++++------------
 3 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 29a3eef7cf4a..14728206fb52 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -342,7 +342,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
 		return -EINVAL;
 
 	return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
-				      reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
+				      reserve - PERCPU_FIRST_CHUNK_RESERVE);
 }
 
 /*
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index e5000343dd61..83bff053bd1c 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -69,7 +69,7 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
 extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, ssize_t unit_size);
+				ssize_t dyn_size);
 
 /*
  * Use this to get to a cpu's version of the per-cpu object
diff --git a/mm/percpu.c b/mm/percpu.c
index 19dd83b5cbdc..fc6babe6e554 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1207,7 +1207,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
  * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
  *
  * This is a helper to ease setting up embedded first percpu chunk and
  * can be called where pcpu_setup_first_chunk() is expected.
@@ -1219,9 +1218,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
  * page size.
  *
  * When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment.  Also, when @dyn_size is auto,
- * @dyn_size does not fill the whole first chunk but only what's
- * necessary for page alignment after static and reserved areas.
+ * specified to fill page alignment.  When @dyn_size is auto,
+ * @dyn_size is just big enough to fill page alignment after static
+ * and reserved areas.
  *
  * If the needed size is smaller than the minimum or specified unit
  * size, the leftover is returned to the bootmem allocator.
@@ -1231,7 +1230,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
  * percpu access on success, -errno on failure.
  */
 ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
-				      ssize_t dyn_size, ssize_t unit_size)
+				      ssize_t dyn_size)
 {
 	size_t chunk_size;
 	unsigned int cpu;
@@ -1242,12 +1241,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	if (dyn_size != 0)
 		dyn_size = pcpue_size - static_size - reserved_size;
 
-	if (unit_size >= 0) {
-		BUG_ON(unit_size < pcpue_size);
-		pcpue_unit_size = unit_size;
-	} else
-		pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-
+	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 	chunk_size = pcpue_unit_size * num_possible_cpus();
 
 	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
@@ -1304,7 +1298,7 @@ void __init setup_per_cpu_areas(void)
 	 * what the legacy allocator did.
 	 */
 	unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
-					   PERCPU_DYNAMIC_RESERVE, -1);
+					   PERCPU_DYNAMIC_RESERVE);
 	if (unit_size < 0)
 		panic("Failed to initialized percpu areas.");
 
-- 
cgit v1.2.3-71-gd317


From d4b95f80399471e4bce5e992700ff7f06ef91f6a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:10:59 +0900
Subject: x86,percpu: generalize 4k first chunk allocator

Generalize and move x86 setup_pcpu_4k() into pcpu_4k_first_chunk().
setup_pcpu_4k() now is a simple wrapper around the generalized
version.  Other than taking size parameters and using arch supplied
callbacks to allocate/free memory, pcpu_4k_first_chunk() is identical
to the original implementation.

This simplifies arch code and will help converting more archs to
dynamic percpu allocator.

While at it, s/pcpu_populate_pte_fn_t/pcpu_fc_populate_pte_fn_t/ for
consistency.

[ Impact: code reorganization and generalization ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c | 78 ++++++++++----------------------------
 include/linux/percpu.h         | 12 +++++-
 mm/percpu.c                    | 85 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 113 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 14728206fb52..ab896b31e80b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -123,6 +123,19 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 #endif
 }
 
+/*
+ * Helpers for first chunk memory allocation
+ */
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size)
+{
+	return pcpu_alloc_bootmem(cpu, size, size);
+}
+
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+	free_bootmem(__pa(ptr), size);
+}
+
 /*
  * Large page remap allocator
  *
@@ -346,22 +359,11 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
 }
 
 /*
- * 4k page allocator
+ * 4k allocator
  *
- * This is the basic allocator.  Static percpu area is allocated
- * page-by-page and most of initialization is done by the generic
- * setup function.
+ * Boring fallback 4k allocator.  This allocator puts more pressure on
+ * PTE TLBs but other than that behaves nicely on both UMA and NUMA.
  */
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_nr_static_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
-{
-	if (pageno < pcpu4k_nr_static_pages)
-		return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
-	return NULL;
-}
-
 static void __init pcpu4k_populate_pte(unsigned long addr)
 {
 	populate_extra_pte(addr);
@@ -369,51 +371,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr)
 
 static ssize_t __init setup_pcpu_4k(size_t static_size)
 {
-	size_t pages_size;
-	unsigned int cpu;
-	int i, j;
-	ssize_t ret;
-
-	pcpu4k_nr_static_pages = PFN_UP(static_size);
-
-	/* unaligned allocations can't be freed, round up to page size */
-	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
-			       * sizeof(pcpu4k_pages[0]));
-	pcpu4k_pages = alloc_bootmem(pages_size);
-
-	/* allocate and copy */
-	j = 0;
-	for_each_possible_cpu(cpu)
-		for (i = 0; i < pcpu4k_nr_static_pages; i++) {
-			void *ptr;
-
-			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
-			if (!ptr) {
-				pr_warning("PERCPU: failed to allocate "
-					   "4k page for cpu%u\n", cpu);
-				goto enomem;
-			}
-
-			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
-			pcpu4k_pages[j++] = virt_to_page(ptr);
-		}
-
-	/* we're ready, commit */
-	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
-		pcpu4k_nr_static_pages, static_size);
-
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
-				     PERCPU_FIRST_CHUNK_RESERVE, -1,
-				     -1, NULL, pcpu4k_populate_pte);
-	goto out_free_ar;
-
-enomem:
-	while (--j >= 0)
-		free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
-	ret = -ENOMEM;
-out_free_ar:
-	free_bootmem(__pa(pcpu4k_pages), pages_size);
-	return ret;
+	return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				   pcpu_fc_alloc, pcpu_fc_free,
+				   pcpu4k_populate_pte);
 }
 
 /* for explicit first chunk allocator selection */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 83bff053bd1c..41b5bfab4195 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -59,18 +59,26 @@
 extern void *pcpu_base_addr;
 
 typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
-typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
+typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
+typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
+typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size, ssize_t unit_size,
 				void *base_addr,
-				pcpu_populate_pte_fn_t populate_pte_fn);
+				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 
 extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size);
 
+extern ssize_t __init pcpu_4k_first_chunk(
+				size_t static_size, size_t reserved_size,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn,
+				pcpu_fc_populate_pte_fn_t populate_pte_fn);
+
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
diff --git a/mm/percpu.c b/mm/percpu.c
index fc6babe6e554..27b0f40a3ea8 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1037,7 +1037,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     size_t static_size, size_t reserved_size,
 				     ssize_t dyn_size, ssize_t unit_size,
 				     void *base_addr,
-				     pcpu_populate_pte_fn_t populate_pte_fn)
+				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
@@ -1270,6 +1270,89 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 				      pcpue_unit_size, pcpue_ptr, NULL);
 }
 
+/*
+ * 4k page first chunk setup helper.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
+
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+{
+	if (pageno < pcpu4k_nr_static_pages)
+		return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+	return NULL;
+}
+
+/**
+ * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up embedded first percpu chunk and
+ * can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page into vmalloc area.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
+				   pcpu_fc_alloc_fn_t alloc_fn,
+				   pcpu_fc_free_fn_t free_fn,
+				   pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+	size_t pages_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+	/* unaligned allocations can't be freed, round up to page size */
+	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() *
+			       sizeof(pcpu4k_pages[0]));
+	pcpu4k_pages = alloc_bootmem(pages_size);
+
+	/* allocate and copy */
+	j = 0;
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+			void *ptr;
+
+			ptr = alloc_fn(cpu, PAGE_SIZE);
+			if (!ptr) {
+				pr_warning("PERCPU: failed to allocate "
+					   "4k page for cpu%u\n", cpu);
+				goto enomem;
+			}
+
+			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+			pcpu4k_pages[j++] = virt_to_page(ptr);
+		}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+		pcpu4k_nr_static_pages, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+				     reserved_size, -1,
+				     -1, NULL, populate_pte_fn);
+	goto out_free_ar;
+
+enomem:
+	while (--j >= 0)
+		free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpu4k_pages), pages_size);
+	return ret;
+}
+
 /*
  * Generic percpu area setup.
  *
-- 
cgit v1.2.3-71-gd317


From 8c4bfc6e8801616ab2e01c38140b2159b388d2ff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:10:59 +0900
Subject: x86,percpu: generalize lpage first chunk allocator

Generalize and move x86 setup_pcpu_lpage() into
pcpu_lpage_first_chunk().  setup_pcpu_lpage() now is a simple wrapper
around the generalized version.  Other than taking size parameters and
using arch supplied callbacks to allocate/free/map memory,
pcpu_lpage_first_chunk() is identical to the original implementation.

This simplifies arch code and will help converting more archs to
dynamic percpu allocator.

While at it, factor out pcpu_calc_fc_sizes() which is common to
pcpu_embed_first_chunk() and pcpu_lpage_first_chunk().

[ Impact: code reorganization and generalization ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/percpu.h  |   9 --
 arch/x86/kernel/setup_percpu.c | 169 +++------------------------------
 arch/x86/mm/pageattr.c         |   1 +
 include/linux/percpu.h         |  27 ++++++
 mm/percpu.c                    | 209 ++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 244 insertions(+), 171 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 103f1ddb0d85..a18c038a3079 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -156,15 +156,6 @@ do {							\
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-void *pcpu_lpage_remapped(void *kaddr);
-#else
-static inline void *pcpu_lpage_remapped(void *kaddr)
-{
-	return NULL;
-}
-#endif
-
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ab896b31e80b..4f2e0ac9130b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
 }
 
 /*
- * Large page remap allocator
- *
- * This allocator uses PMD page as unit.  A PMD page is allocated for
- * each cpu and each is remapped into vmalloc area using PMD mapping.
- * As PMD page is quite large, only part of it is used for the first
- * chunk.  Unused part is returned to the bootmem allocator.
- *
- * So, the PMD pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk.  The double
- * mapping does add one more PMD TLB entry pressure but still is much
- * better than only using 4k mappings while still being NUMA friendly.
+ * Large page remapping allocator
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pcpul_ent {
-	unsigned int	cpu;
-	void		*ptr;
-};
-
-static size_t pcpul_size;
-static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
-
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+static void __init pcpul_map(void *ptr, size_t size, void *addr)
 {
-	size_t off = (size_t)pageno << PAGE_SHIFT;
+	pmd_t *pmd, pmd_v;
 
-	if (off >= pcpul_size)
-		return NULL;
-
-	return virt_to_page(pcpul_map[cpu].ptr + off);
+	pmd = populate_extra_pmd((unsigned long)addr);
+	pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
+	set_pmd(pmd, pmd_v);
 }
 
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 {
-	size_t map_size, dyn_size;
-	unsigned int cpu;
-	int i, j;
-	ssize_t ret;
+	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
@@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 		return -EINVAL;
 	}
 
-	/*
-	 * Currently supports only single page.  Supporting multiple
-	 * pages won't be too difficult if it ever becomes necessary.
-	 */
-	pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
-			       PERCPU_DYNAMIC_RESERVE);
-	if (pcpul_size > PMD_SIZE) {
-		pr_warning("PERCPU: static data is larger than large page, "
-			   "can't use large page\n");
-		return -EINVAL;
-	}
-	dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
-	/* allocate pointer array and alloc large pages */
-	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
-	pcpul_map = alloc_bootmem(map_size);
-
-	for_each_possible_cpu(cpu) {
-		pcpul_map[cpu].cpu = cpu;
-		pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
-							PMD_SIZE);
-		if (!pcpul_map[cpu].ptr) {
-			pr_warning("PERCPU: failed to allocate large page "
-				   "for cpu%u\n", cpu);
-			goto enomem;
-		}
-
-		/*
-		 * Only use pcpul_size bytes and give back the rest.
-		 *
-		 * Ingo: The 2MB up-rounding bootmem is needed to make
-		 * sure the partial 2MB page is still fully RAM - it's
-		 * not well-specified to have a PAT-incompatible area
-		 * (unmapped RAM, device memory, etc.) in that hole.
-		 */
-		free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
-			     PMD_SIZE - pcpul_size);
-
-		memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
-	}
-
-	/* allocate address and map */
-	pcpul_vm.flags = VM_ALLOC;
-	pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
-	vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
-	for_each_possible_cpu(cpu) {
-		pmd_t *pmd, pmd_v;
-
-		pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
-					 cpu * PMD_SIZE);
-		pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
-				PAGE_KERNEL_LARGE);
-		set_pmd(pmd, pmd_v);
-	}
-
-	/* we're ready, commit */
-	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", pcpul_vm.addr, static_size);
-
-	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
-				     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				     PMD_SIZE, pcpul_vm.addr, NULL);
-
-	/* sort pcpul_map array for pcpu_lpage_remapped() */
-	for (i = 0; i < num_possible_cpus() - 1; i++)
-		for (j = i + 1; j < num_possible_cpus(); j++)
-			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
-				struct pcpul_ent tmp = pcpul_map[i];
-				pcpul_map[i] = pcpul_map[j];
-				pcpul_map[j] = tmp;
-			}
-
-	return ret;
-
-enomem:
-	for_each_possible_cpu(cpu)
-		if (pcpul_map[cpu].ptr)
-			free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
-	free_bootmem(__pa(pcpul_map), map_size);
-	return -ENOMEM;
-}
-
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area.  This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
-{
-	void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
-	unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
-	int left = 0, right = num_possible_cpus() - 1;
-	int pos;
-
-	/* pcpul in use at all? */
-	if (!pcpul_map)
-		return NULL;
-
-	/* okay, perform binary search */
-	while (left <= right) {
-		pos = (left + right) / 2;
-
-		if (pcpul_map[pos].ptr < pmd_addr)
-			left = pos + 1;
-		else if (pcpul_map[pos].ptr > pmd_addr)
-			right = pos - 1;
-		else {
-			/* it shouldn't be in the area for the first chunk */
-			WARN_ON(offset < pcpul_size);
-
-			return pcpul_vm.addr +
-				pcpul_map[pos].cpu * PMD_SIZE + offset;
-		}
-	}
-
-	return NULL;
+	return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
+				      PMD_SIZE,
+				      pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 }
 #else
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1b734d7a8966..c106f7852424 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/pfn.h>
+#include <linux/percpu.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 41b5bfab4195..9f6bfd7d4b92 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
+typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				size_t static_size, size_t reserved_size,
@@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk(
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+extern ssize_t __init pcpu_lpage_first_chunk(
+				size_t static_size, size_t reserved_size,
+				ssize_t dyn_size, size_t lpage_size,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn,
+				pcpu_fc_map_fn_t map_fn);
+
+extern void *pcpu_lpage_remapped(void *kaddr);
+#else
+static inline ssize_t __init pcpu_lpage_first_chunk(
+				size_t static_size, size_t reserved_size,
+				ssize_t dyn_size, size_t lpage_size,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn,
+				pcpu_fc_map_fn_t map_fn)
+{
+	return -EINVAL;
+}
+
+static inline void *pcpu_lpage_remapped(void *kaddr)
+{
+	return NULL;
+}
+#endif
+
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
diff --git a/mm/percpu.c b/mm/percpu.c
index f3fe7bc7378f..17db527ee2e2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	return pcpu_unit_size;
 }
 
+static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
+				 ssize_t *dyn_sizep)
+{
+	size_t size_sum;
+
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
+	if (*dyn_sizep != 0)
+		*dyn_sizep = size_sum - static_size - reserved_size;
+
+	return size_sum;
+}
+
 /*
  * Embedding first chunk setup helper.
  */
@@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	unsigned int cpu;
 
 	/* determine parameters and allocate */
-	pcpue_size = PFN_ALIGN(static_size + reserved_size +
-			       (dyn_size >= 0 ? dyn_size : 0));
-	if (dyn_size != 0)
-		dyn_size = pcpue_size - static_size - reserved_size;
+	pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 
 	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 	chunk_size = pcpue_unit_size * num_possible_cpus();
@@ -1390,6 +1400,197 @@ out_free_ar:
 	return ret;
 }
 
+/*
+ * Large page remapping first chunk setup helper
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+struct pcpul_ent {
+	unsigned int	cpu;
+	void		*ptr;
+};
+
+static size_t pcpul_size;
+static size_t pcpul_unit_size;
+static struct pcpul_ent *pcpul_map;
+static struct vm_struct pcpul_vm;
+
+static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpul_size)
+		return NULL;
+
+	return virt_to_page(pcpul_map[cpu].ptr + off);
+}
+
+/**
+ * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @lpage_size: the size of a large page
+ * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
+ * @free_fn: function to free percpu memory, @size <= lpage_size
+ * @map_fn: function to map percpu lpage, always called with lpage_size
+ *
+ * This allocator uses large page as unit.  A large page is allocated
+ * for each cpu and each is remapped into vmalloc area using large
+ * page mapping.  As large page can be quite large, only part of it is
+ * used for the first chunk.  Unused part is returned to the bootmem
+ * allocator.
+ *
+ * So, the large pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk.  The double
+ * mapping does add one more large TLB entry pressure but still is
+ * much better than only using 4k mappings while still being NUMA
+ * friendly.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
+				      ssize_t dyn_size, size_t lpage_size,
+				      pcpu_fc_alloc_fn_t alloc_fn,
+				      pcpu_fc_free_fn_t free_fn,
+				      pcpu_fc_map_fn_t map_fn)
+{
+	size_t size_sum;
+	size_t map_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	/*
+	 * Currently supports only single page.  Supporting multiple
+	 * pages won't be too difficult if it ever becomes necessary.
+	 */
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+
+	pcpul_unit_size = lpage_size;
+	pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+	if (pcpul_size > pcpul_unit_size) {
+		pr_warning("PERCPU: static data is larger than large page, "
+			   "can't use large page\n");
+		return -EINVAL;
+	}
+
+	/* allocate pointer array and alloc large pages */
+	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+	pcpul_map = alloc_bootmem(map_size);
+
+	for_each_possible_cpu(cpu) {
+		void *ptr;
+
+		ptr = alloc_fn(cpu, lpage_size);
+		if (!ptr) {
+			pr_warning("PERCPU: failed to allocate large page "
+				   "for cpu%u\n", cpu);
+			goto enomem;
+		}
+
+		/*
+		 * Only use pcpul_size bytes and give back the rest.
+		 *
+		 * Ingo: The lpage_size up-rounding bootmem is needed
+		 * to make sure the partial lpage is still fully RAM -
+		 * it's not well-specified to have a incompatible area
+		 * (unmapped RAM, device memory, etc.) in that hole.
+		 */
+		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);
+
+		pcpul_map[cpu].cpu = cpu;
+		pcpul_map[cpu].ptr = ptr;
+
+		memcpy(ptr, __per_cpu_load, static_size);
+	}
+
+	/* allocate address and map */
+	pcpul_vm.flags = VM_ALLOC;
+	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
+	vm_area_register_early(&pcpul_vm, pcpul_unit_size);
+
+	for_each_possible_cpu(cpu)
+		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
+		       pcpul_vm.addr + cpu * pcpul_unit_size);
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Remapped at %p with large pages, static data "
+		"%zu bytes\n", pcpul_vm.addr, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
+				     reserved_size, dyn_size, pcpul_unit_size,
+				     pcpul_vm.addr, NULL);
+
+	/* sort pcpul_map array for pcpu_lpage_remapped() */
+	for (i = 0; i < num_possible_cpus() - 1; i++)
+		for (j = i + 1; j < num_possible_cpus(); j++)
+			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
+				struct pcpul_ent tmp = pcpul_map[i];
+				pcpul_map[i] = pcpul_map[j];
+				pcpul_map[j] = tmp;
+			}
+
+	return ret;
+
+enomem:
+	for_each_possible_cpu(cpu)
+		if (pcpul_map[cpu].ptr)
+			free_fn(pcpul_map[cpu].ptr, pcpul_size);
+	free_bootmem(__pa(pcpul_map), map_size);
+	return -ENOMEM;
+}
+
+/**
+ * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpul recycled area.  This is
+ * used by pageattr to detect VM aliases and break up the pcpu large
+ * page mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used large
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_lpage_remapped(void *kaddr)
+{
+	unsigned long unit_mask = pcpul_unit_size - 1;
+	void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
+	unsigned long offset = (unsigned long)kaddr & unit_mask;
+	int left = 0, right = num_possible_cpus() - 1;
+	int pos;
+
+	/* pcpul in use at all? */
+	if (!pcpul_map)
+		return NULL;
+
+	/* okay, perform binary search */
+	while (left <= right) {
+		pos = (left + right) / 2;
+
+		if (pcpul_map[pos].ptr < lpage_addr)
+			left = pos + 1;
+		else if (pcpul_map[pos].ptr > lpage_addr)
+			right = pos - 1;
+		else {
+			/* it shouldn't be in the area for the first chunk */
+			WARN_ON(offset < pcpul_size);
+
+			return pcpul_vm.addr +
+				pcpul_map[pos].cpu * pcpul_unit_size + offset;
+		}
+	}
+
+	return NULL;
+}
+#endif
+
 /*
  * Generic percpu area setup.
  *
-- 
cgit v1.2.3-71-gd317


From 38a6be525460f52ac6f2de1c3f73c5615a8853cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:10:59 +0900
Subject: percpu: simplify pcpu_setup_first_chunk()

Now that all first chunk allocator helpers allocate and map the first
chunk themselves, there's no need to have optional default alloc/map
in pcpu_setup_first_chunk().  Drop @populate_pte_fn and only leave
@dyn_size optional and make all other params mandatory.

This makes it much easier to follow what pcpu_setup_first_chunk() is
doing and what actual differences tweaking each parameter results in.

[ Impact: drop unused code path ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/kernel/smp_64.c |   2 +-
 include/linux/percpu.h     |   5 +--
 mm/percpu.c                | 104 +++++++++++++--------------------------------
 3 files changed, 33 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index fa44eaf8d897..ccad7b20ae75 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1528,7 +1528,7 @@ void __init setup_per_cpu_areas(void)
 
 	pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
 						PERCPU_MODULE_RESERVE, dyn_size,
-						PCPU_CHUNK_SIZE, vm.addr, NULL);
+						PCPU_CHUNK_SIZE, vm.addr);
 
 	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9f6bfd7d4b92..ec64357e1762 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -66,9 +66,8 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, ssize_t unit_size,
-				void *base_addr,
-				pcpu_fc_populate_pte_fn_t populate_pte_fn);
+				ssize_t dyn_size, size_t unit_size,
+				void *base_addr);
 
 extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
diff --git a/mm/percpu.c b/mm/percpu.c
index 17db527ee2e2..21d938a10662 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -983,24 +983,22 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
- * @reserved_size: the size of reserved percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes, 0 for none
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
- * @base_addr: mapped address, NULL for auto
- * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
+ * @base_addr: mapped address
  *
  * Initialize the first percpu chunk which contains the kernel static
  * perpcu area.  This function is to be called from arch percpu area
- * setup path.  The first two parameters are mandatory.  The rest are
- * optional.
+ * setup path.
  *
  * @get_page_fn() should return pointer to percpu page given cpu
  * number and page number.  It should at least return enough pages to
  * cover the static area.  The returned pages for static area should
- * have been initialized with valid data.  If @unit_size is specified,
- * it can also return pages after the static area.  NULL return
- * indicates end of pages for the cpu.  Note that @get_page_fn() must
- * return the same number of pages for all cpus.
+ * have been initialized with valid data.  It can also return pages
+ * after the static area.  NULL return indicates end of pages for the
+ * cpu.  Note that @get_page_fn() must return the same number of pages
+ * for all cpus.
  *
  * @reserved_size, if non-zero, specifies the amount of bytes to
  * reserve after the static area in the first chunk.  This reserves
@@ -1015,17 +1013,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * non-negative value makes percpu leave alone the area beyond
  * @static_size + @reserved_size + @dyn_size.
  *
- * @unit_size, if non-negative, specifies unit size and must be
- * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @reserved_size + if non-negative, @dyn_size.
- *
- * Non-null @base_addr means that the caller already allocated virtual
- * region for the first chunk and mapped it.  percpu must not mess
- * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
- * @populate_pte_fn doesn't make any sense.
+ * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
+ * equal to or larger than @static_size + @reserved_size + if
+ * non-negative, @dyn_size.
  *
- * @populate_pte_fn is used to populate the pagetable.  NULL means the
- * caller already populated the pagetable.
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
  *
  * If the first chunk ends up with both reserved and dynamic areas, it
  * is served by two chunks - one to serve the core static and reserved
@@ -1040,9 +1033,8 @@ EXPORT_SYMBOL_GPL(free_percpu);
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     size_t static_size, size_t reserved_size,
-				     ssize_t dyn_size, ssize_t unit_size,
-				     void *base_addr,
-				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
+				     ssize_t dyn_size, size_t unit_size,
+				     void *base_addr)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
@@ -1050,27 +1042,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 			  (dyn_size >= 0 ? dyn_size : 0);
 	struct pcpu_chunk *schunk, *dchunk = NULL;
 	unsigned int cpu;
-	int nr_pages;
-	int err, i;
+	int i, nr_pages;
 
 	/* santiy checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
 		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
-	if (unit_size >= 0) {
-		BUG_ON(unit_size < size_sum);
-		BUG_ON(unit_size & ~PAGE_MASK);
-		BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
-	} else
-		BUG_ON(base_addr);
-	BUG_ON(base_addr && populate_pte_fn);
-
-	if (unit_size >= 0)
-		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
-	else
-		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
-					PFN_UP(size_sum));
+	BUG_ON(!base_addr);
+	BUG_ON(unit_size < size_sum);
+	BUG_ON(unit_size & ~PAGE_MASK);
+	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 
+	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
@@ -1079,6 +1062,10 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	if (dyn_size < 0)
 		dyn_size = pcpu_unit_size - static_size - reserved_size;
 
+	first_vm.flags = VM_ALLOC;
+	first_vm.size = pcpu_chunk_size;
+	first_vm.addr = base_addr;
+
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
@@ -1101,6 +1088,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->page = schunk->page_ar;
+	schunk->immutable = true;
 
 	if (reserved_size) {
 		schunk->free_size = reserved_size;
@@ -1124,31 +1112,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		dchunk->map = dmap;
 		dchunk->map_alloc = ARRAY_SIZE(dmap);
 		dchunk->page = schunk->page_ar;	/* share page map with schunk */
+		dchunk->immutable = true;
 
 		dchunk->contig_hint = dchunk->free_size = dyn_size;
 		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
 		dchunk->map[dchunk->map_used++] = dchunk->free_size;
 	}
 
-	/* allocate vm address */
-	first_vm.flags = VM_ALLOC;
-	first_vm.size = pcpu_chunk_size;
-
-	if (!base_addr)
-		vm_area_register_early(&first_vm, PAGE_SIZE);
-	else {
-		/*
-		 * Pages already mapped.  No need to remap into
-		 * vmalloc area.  In this case the first chunks can't
-		 * be mapped or unmapped by percpu and are marked
-		 * immutable.
-		 */
-		first_vm.addr = base_addr;
-		schunk->immutable = true;
-		if (dchunk)
-			dchunk->immutable = true;
-	}
-
 	/* assign pages */
 	nr_pages = -1;
 	for_each_possible_cpu(cpu) {
@@ -1168,19 +1138,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 			BUG_ON(nr_pages != i);
 	}
 
-	/* map them */
-	if (populate_pte_fn) {
-		for_each_possible_cpu(cpu)
-			for (i = 0; i < nr_pages; i++)
-				populate_pte_fn(pcpu_chunk_addr(schunk,
-								cpu, i));
-
-		err = pcpu_map(schunk, 0, nr_pages);
-		if (err)
-			panic("failed to setup static percpu area, err=%d\n",
-			      err);
-	}
-
 	/* link the first chunk in */
 	pcpu_first_chunk = dchunk ?: schunk;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
@@ -1282,7 +1239,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 
 	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
 				      reserved_size, dyn_size,
-				      pcpue_unit_size, pcpue_ptr, NULL);
+				      pcpue_unit_size, pcpue_ptr);
 }
 
 /*
@@ -1387,8 +1344,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 
 	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
 				     reserved_size, -1,
-				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr,
-				     NULL);
+				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
 	goto out_free_ar;
 
 enomem:
@@ -1521,7 +1477,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 
 	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
 				     reserved_size, dyn_size, pcpul_unit_size,
-				     pcpul_vm.addr, NULL);
+				     pcpul_vm.addr);
 
 	/* sort pcpul_map array for pcpu_lpage_remapped() */
 	for (i = 0; i < num_possible_cpus() - 1; i++)
-- 
cgit v1.2.3-71-gd317


From ce3141a277ff6cc37e51008b8888dc2cb7456ef1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:11:00 +0900
Subject: percpu: drop pcpu_chunk->page[]

percpu core doesn't need to tack all the allocated pages.  It needs to
know whether certain pages are populated and a way to reverse map
address to page when freeing.  This patch drops pcpu_chunk->page[] and
use populated bitmap and vmalloc_to_page() lookup instead.  Using
vmalloc_to_page() exclusively is also possible but complicates first
chunk handling, inflates cache footprint and prevents non-standard
memory allocation for percpu memory.

pcpu_chunk->page[] was used to track each page's allocation and
allowed asymmetric population which happens during failure path;
however, with single bitmap for all units, this is no longer possible.
Bite the bullet and rewrite (de)populate functions so that things are
done in clearly separated steps such that asymmetric population
doesn't happen.  This makes the (de)population process much more
modular and will also ease implementing non-standard memory usage in
the future (e.g. large pages).

This makes @get_page_fn parameter to pcpu_setup_first_chunk()
unnecessary.  The parameter is dropped and all first chunk helpers are
updated accordingly.  Please note that despite the volume most changes
to first chunk helpers are symbol renames for variables which don't
need to be referenced outside of the helper anymore.

This change reduces memory usage and cache footprint of pcpu_chunk.
Now only #unit_pages bits are necessary per chunk.

[ Impact: reduced memory usage and cache footprint for bookkeeping ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c |  42 ++--
 include/linux/percpu.h     |   3 +-
 mm/percpu.c                | 604 ++++++++++++++++++++++++++++-----------------
 3 files changed, 400 insertions(+), 249 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index ccad7b20ae75..f2f22ee97a7a 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1415,19 +1415,6 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 #endif
 }
 
-static size_t pcpur_size __initdata;
-static void **pcpur_ptrs __initdata;
-
-static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
-{
-	size_t off = (size_t)pageno << PAGE_SHIFT;
-
-	if (off >= pcpur_size)
-		return NULL;
-
-	return virt_to_page(pcpur_ptrs[cpu] + off);
-}
-
 #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
 
 static void __init pcpu_map_range(unsigned long start, unsigned long end,
@@ -1491,25 +1478,26 @@ void __init setup_per_cpu_areas(void)
 	size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
 	static struct vm_struct vm;
 	unsigned long delta, cpu;
-	size_t pcpu_unit_size;
+	size_t size_sum, pcpu_unit_size;
 	size_t ptrs_size;
+	void **ptrs;
 
-	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
-			       PERCPU_DYNAMIC_RESERVE);
-	dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
+	size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			     PERCPU_DYNAMIC_RESERVE);
+	dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
 
 
-	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
-	pcpur_ptrs = alloc_bootmem(ptrs_size);
+	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
+	ptrs = alloc_bootmem(ptrs_size);
 
 	for_each_possible_cpu(cpu) {
-		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
-						     PCPU_CHUNK_SIZE);
+		ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
+					       PCPU_CHUNK_SIZE);
 
-		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
-			     PCPU_CHUNK_SIZE - pcpur_size);
+		free_bootmem(__pa(ptrs[cpu] + size_sum),
+			     PCPU_CHUNK_SIZE - size_sum);
 
-		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+		memcpy(ptrs[cpu], __per_cpu_load, static_size);
 	}
 
 	/* allocate address and map */
@@ -1523,14 +1511,14 @@ void __init setup_per_cpu_areas(void)
 
 		start += cpu * PCPU_CHUNK_SIZE;
 		end = start + PCPU_CHUNK_SIZE;
-		pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
+		pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
 	}
 
-	pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+	pcpu_unit_size = pcpu_setup_first_chunk(static_size,
 						PERCPU_MODULE_RESERVE, dyn_size,
 						PCPU_CHUNK_SIZE, vm.addr);
 
-	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+	free_bootmem(__pa(ptrs), ptrs_size);
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index ec64357e1762..63c8b7a23e66 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -58,13 +58,12 @@
 
 extern void *pcpu_base_addr;
 
-typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
-extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+extern size_t __init pcpu_setup_first_chunk(
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size, size_t unit_size,
 				void *base_addr);
diff --git a/mm/percpu.c b/mm/percpu.c
index 639fce4d2caf..21756814d99f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,8 +94,7 @@ struct pcpu_chunk {
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
 	bool			immutable;	/* no [de]population allowed */
-	struct page		**page;		/* points to page array */
-	struct page		*page_ar[];	/* #cpus * UNIT_PAGES */
+	unsigned long		populated[];	/* populated bitmap */
 };
 
 static int pcpu_unit_pages __read_mostly;
@@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit;
  * Synchronization rules.
  *
  * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
- * protects allocation/reclaim paths, chunks and chunk->page arrays.
- * The latter is a spinlock and protects the index data structures -
- * chunk slots, chunks and area maps in chunks.
+ * protects allocation/reclaim paths, chunks, populated bitmap and
+ * vmalloc mapping.  The latter is a spinlock and protects the index
+ * data structures - chunk slots, chunks and area maps in chunks.
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
@@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 		(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
 }
 
-static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
-				      unsigned int cpu, int page_idx)
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+				    unsigned int cpu, int page_idx)
 {
-	return &chunk->page[pcpu_page_idx(cpu, page_idx)];
-}
+	/* must not be used on pre-mapped chunk */
+	WARN_ON(chunk->immutable);
 
-static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
-				     int page_idx)
-{
-	return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+	return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
 }
 
 /* set the pointer to a chunk in a page struct */
@@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
 	return (struct pcpu_chunk *)page->index;
 }
 
+static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+	*rs = find_next_zero_bit(chunk->populated, end, *rs);
+	*re = find_next_bit(chunk->populated, end, *rs + 1);
+}
+
+static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+	*rs = find_next_bit(chunk->populated, end, *rs);
+	*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+}
+
+/*
+ * (Un)populated page region iterators.  Iterate over (un)populated
+ * page regions betwen @start and @end in @chunk.  @rs and @re should
+ * be integer variables and will be set to start and end page index of
+ * the current region.
+ */
+#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
+	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
+	     (rs) < (re);						    \
+	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
+	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
+	     (rs) < (re);						    \
+	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+
 /**
  * pcpu_mem_alloc - allocate memory
  * @size: bytes to allocate
@@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 }
 
 /**
- * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
+ * @chunk: chunk of interest
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
+ *
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx().  The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated.  Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
+ */
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+					       unsigned long **bitmapp,
+					       bool may_alloc)
+{
+	static struct page **pages;
+	static unsigned long *bitmap;
+	size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
+			    sizeof(pages[0]);
+	size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+			     sizeof(unsigned long);
+
+	if (!pages || !bitmap) {
+		if (may_alloc && !pages)
+			pages = pcpu_mem_alloc(pages_size);
+		if (may_alloc && !bitmap)
+			bitmap = pcpu_mem_alloc(bitmap_size);
+		if (!pages || !bitmap)
+			return NULL;
+	}
+
+	memset(pages, 0, pages_size);
+	bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
+
+	*bitmapp = bitmap;
+	return pages;
+}
+
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start and @page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, unsigned long *populated,
+			    int page_start, int page_end)
+{
+	unsigned int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+			if (page)
+				__free_page(page);
+		}
+	}
+}
+
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk.  Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, unsigned long *populated,
+			    int page_start, int page_end)
+{
+	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	unsigned int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+			if (!*pagep) {
+				pcpu_free_pages(chunk, pages, populated,
+						page_start, page_end);
+				return -ENOMEM;
+			}
+		}
+	}
+	return 0;
+}
+
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped.  Flush cache.  As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu.  This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+				 int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+
+	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+			   pcpu_chunk_addr(chunk, last, page_end));
+}
+
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+	unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
+
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
  * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
  * @page_start: page index of the first page to unmap
  * @page_end: page index of the last page to unmap + 1
- * @flush_tlb: whether to flush tlb or not
  *
  * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * If @flush is true, vcache is flushed before unmapping and tlb
- * after.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished.  The caller should call
+ * proper pre/post flush functions.
  */
-static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
-		       bool flush_tlb)
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+			     struct page **pages, unsigned long *populated,
+			     int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
 	unsigned int cpu;
+	int i;
 
-	/* unmap must not be done on immutable chunk */
-	WARN_ON(chunk->immutable);
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page *page;
 
-	/*
-	 * Each flushing trial can be very expensive, issue flush on
-	 * the whole region at once rather than doing it for each cpu.
-	 * This could be an overkill but is more scalable.
-	 */
-	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
-			   pcpu_chunk_addr(chunk, last, page_end));
+			page = pcpu_chunk_page(chunk, cpu, i);
+			WARN_ON(!page);
+			pages[pcpu_page_idx(cpu, i)] = page;
+		}
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+				   page_end - page_start);
+	}
 
-	for_each_possible_cpu(cpu)
-		unmap_kernel_range_noflush(
-				pcpu_chunk_addr(chunk, cpu, page_start),
-				(page_end - page_start) << PAGE_SHIFT);
-
-	/* ditto as flush_cache_vunmap() */
-	if (flush_tlb)
-		flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
-				       pcpu_chunk_addr(chunk, last, page_end));
+	for (i = page_start; i < page_end; i++)
+		__clear_bit(i, populated);
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
+ * TLB for the regions.  This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+
+	flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+			       pcpu_chunk_addr(chunk, last, page_end));
 }
 
 static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
 }
 
 /**
- * pcpu_map - map pages into a pcpu_chunk
+ * pcpu_map_pages - map pages into a pcpu_chunk
  * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
  * @page_start: page index of the first page to map
  * @page_end: page index of the last page to map + 1
  *
- * For each cpu, map pages [@page_start,@page_end) into @chunk.
- * vcache is flushed afterwards.
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
  */
-static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+			  struct page **pages, unsigned long *populated,
+			  int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-	unsigned int cpu;
-	int err;
-
-	/* map must not be done on immutable chunk */
-	WARN_ON(chunk->immutable);
+	unsigned int cpu, tcpu;
+	int i, err;
 
 	for_each_possible_cpu(cpu) {
 		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
-				       pcpu_chunk_pagep(chunk, cpu, page_start),
+				       &pages[pcpu_page_idx(cpu, page_start)],
 				       page_end - page_start);
 		if (err < 0)
-			return err;
+			goto err;
 	}
 
+	/* mapping successful, link chunk and mark populated */
+	for (i = page_start; i < page_end; i++) {
+		for_each_possible_cpu(cpu)
+			pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+					    chunk);
+		__set_bit(i, populated);
+	}
+
+	return 0;
+
+err:
+	for_each_possible_cpu(tcpu) {
+		if (tcpu == cpu)
+			break;
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+				   page_end - page_start);
+	}
+	return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+				int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+
 	/* flush at once, please read comments in pcpu_unmap() */
 	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 			 pcpu_chunk_addr(chunk, last, page_end));
-	return 0;
 }
 
 /**
@@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  * CONTEXT:
  * pcpu_alloc_mutex.
  */
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
-				  bool flush)
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
-	int unmap_start = -1;
-	int uninitialized_var(unmap_end);
-	unsigned int cpu;
-	int i;
+	struct page **pages;
+	unsigned long *populated;
+	int rs, re;
+
+	/* quick path, check whether it's empty already */
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+		if (rs == page_start && re == page_end)
+			return;
+		break;
+	}
 
-	for (i = page_start; i < page_end; i++) {
-		for_each_possible_cpu(cpu) {
-			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+	/* immutable chunks can't be depopulated */
+	WARN_ON(chunk->immutable);
 
-			if (!*pagep)
-				continue;
+	/*
+	 * If control reaches here, there must have been at least one
+	 * successful population attempt so the temp pages array must
+	 * be available now.
+	 */
+	pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+	BUG_ON(!pages);
 
-			__free_page(*pagep);
+	/* unmap and free */
+	pcpu_pre_unmap_flush(chunk, page_start, page_end);
 
-			/*
-			 * If it's partial depopulation, it might get
-			 * populated or depopulated again.  Mark the
-			 * page gone.
-			 */
-			*pagep = NULL;
+	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+		pcpu_unmap_pages(chunk, pages, populated, rs, re);
 
-			unmap_start = unmap_start < 0 ? i : unmap_start;
-			unmap_end = i + 1;
-		}
-	}
+	/* no need to flush tlb, vmalloc will handle it lazily */
+
+	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+		pcpu_free_pages(chunk, pages, populated, rs, re);
 
-	if (unmap_start >= 0)
-		pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+	/* commit new bitmap */
+	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
 }
 
 /**
@@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
-	const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
-	int map_start = -1;
-	int uninitialized_var(map_end);
+	int free_end = page_start, unmap_end = page_start;
+	struct page **pages;
+	unsigned long *populated;
 	unsigned int cpu;
-	int i;
+	int rs, re, rc;
 
-	for (i = page_start; i < page_end; i++) {
-		if (pcpu_chunk_page_occupied(chunk, i)) {
-			if (map_start >= 0) {
-				if (pcpu_map(chunk, map_start, map_end))
-					goto err;
-				map_start = -1;
-			}
-			continue;
-		}
+	/* quick path, check whether all pages are already there */
+	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
+		if (rs == page_start && re == page_end)
+			goto clear;
+		break;
+	}
 
-		map_start = map_start < 0 ? i : map_start;
-		map_end = i + 1;
+	/* need to allocate and map pages, this chunk can't be immutable */
+	WARN_ON(chunk->immutable);
 
-		for_each_possible_cpu(cpu) {
-			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+	pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+	if (!pages)
+		return -ENOMEM;
 
-			*pagep = alloc_pages_node(cpu_to_node(cpu),
-						  alloc_mask, 0);
-			if (!*pagep)
-				goto err;
-			pcpu_set_page_chunk(*pagep, chunk);
-		}
+	/* alloc and map */
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+		rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+		if (rc)
+			goto err_free;
+		free_end = re;
 	}
 
-	if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
-		goto err;
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+		rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+		if (rc)
+			goto err_unmap;
+		unmap_end = re;
+	}
+	pcpu_post_map_flush(chunk, page_start, page_end);
 
+	/* commit new bitmap */
+	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
 	for_each_possible_cpu(cpu)
 		memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 		       size);
-
 	return 0;
-err:
-	/* likely under heavy memory pressure, give memory back */
-	pcpu_depopulate_chunk(chunk, off, size, true);
-	return -ENOMEM;
+
+err_unmap:
+	pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+		pcpu_unmap_pages(chunk, pages, populated, rs, re);
+	pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+		pcpu_free_pages(chunk, pages, populated, rs, re);
+	return rc;
 }
 
 static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
-	chunk->page = chunk->page_ar;
 
 	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 	if (!chunk->vm) {
@@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work)
 	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
-		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
 		free_pcpu_chunk(chunk);
 	}
 }
@@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
 
 /**
  * pcpu_setup_first_chunk - initialize the first percpu chunk
- * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes, 0 for none
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
@@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * perpcu area.  This function is to be called from arch percpu area
  * setup path.
  *
- * @get_page_fn() should return pointer to percpu page given cpu
- * number and page number.  It should at least return enough pages to
- * cover the static area.  The returned pages for static area should
- * have been initialized with valid data.  It can also return pages
- * after the static area.  NULL return indicates end of pages for the
- * cpu.  Note that @get_page_fn() must return the same number of pages
- * for all cpus.
- *
  * @reserved_size, if non-zero, specifies the amount of bytes to
  * reserve after the static area in the first chunk.  This reserves
  * the first chunk such that it's available only through reserved
@@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
-size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-				     size_t static_size, size_t reserved_size,
+size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 				     ssize_t dyn_size, size_t unit_size,
 				     void *base_addr)
 {
@@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	size_t size_sum = static_size + reserved_size +
 			  (dyn_size >= 0 ? dyn_size : 0);
 	struct pcpu_chunk *schunk, *dchunk = NULL;
-	unsigned int cpu;
-	int i, nr_pages;
+	int i;
 
 	/* santiy checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
@@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
-	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
-		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 
 	if (dyn_size < 0)
 		dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
-	schunk->page = schunk->page_ar;
 	schunk->immutable = true;
+	bitmap_fill(schunk->populated, pcpu_unit_pages);
 
 	if (reserved_size) {
 		schunk->free_size = reserved_size;
@@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
 	/* init dynamic chunk if necessary */
 	if (dyn_size) {
-		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+		dchunk = alloc_bootmem(pcpu_chunk_struct_size);
 		INIT_LIST_HEAD(&dchunk->list);
 		dchunk->vm = &first_vm;
 		dchunk->map = dmap;
 		dchunk->map_alloc = ARRAY_SIZE(dmap);
-		dchunk->page = schunk->page_ar;	/* share page map with schunk */
 		dchunk->immutable = true;
+		bitmap_fill(dchunk->populated, pcpu_unit_pages);
 
 		dchunk->contig_hint = dchunk->free_size = dyn_size;
 		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
 		dchunk->map[dchunk->map_used++] = dchunk->free_size;
 	}
 
-	/* assign pages */
-	nr_pages = -1;
-	for_each_possible_cpu(cpu) {
-		for (i = 0; i < pcpu_unit_pages; i++) {
-			struct page *page = get_page_fn(cpu, i);
-
-			if (!page)
-				break;
-			*pcpu_chunk_pagep(schunk, cpu, i) = page;
-		}
-
-		BUG_ON(i < PFN_UP(static_size));
-
-		if (nr_pages < 0)
-			nr_pages = i;
-		else
-			BUG_ON(nr_pages != i);
-	}
-
 	/* link the first chunk in */
 	pcpu_first_chunk = dchunk ?: schunk;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
@@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
 	return size_sum;
 }
 
-/*
- * Embedding first chunk setup helper.
- */
-static void *pcpue_ptr __initdata;
-static size_t pcpue_size __initdata;
-static size_t pcpue_unit_size __initdata;
-
-static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
-{
-	size_t off = (size_t)pageno << PAGE_SHIFT;
-
-	if (off >= pcpue_size)
-		return NULL;
-
-	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
-}
-
 /**
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @static_size: the size of static percpu area in bytes
@@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 				      ssize_t dyn_size)
 {
-	size_t chunk_size;
+	size_t size_sum, unit_size, chunk_size;
+	void *base;
 	unsigned int cpu;
 
 	/* determine parameters and allocate */
-	pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 
-	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-	chunk_size = pcpue_unit_size * num_possible_cpus();
+	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+	chunk_size = unit_size * num_possible_cpus();
 
-	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
-					    __pa(MAX_DMA_ADDRESS));
-	if (!pcpue_ptr) {
+	base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+				       __pa(MAX_DMA_ADDRESS));
+	if (!base) {
 		pr_warning("PERCPU: failed to allocate %zu bytes for "
 			   "embedding\n", chunk_size);
 		return -ENOMEM;
@@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 
 	/* return the leftover and copy */
 	for_each_possible_cpu(cpu) {
-		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+		void *ptr = base + cpu * unit_size;
 
-		free_bootmem(__pa(ptr + pcpue_size),
-			     pcpue_unit_size - pcpue_size);
+		free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
 		memcpy(ptr, __per_cpu_load, static_size);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
-		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+		size_sum >> PAGE_SHIFT, base, static_size);
 
-	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
-				      reserved_size, dyn_size,
-				      pcpue_unit_size, pcpue_ptr);
-}
-
-/*
- * 4k page first chunk setup helper.
- */
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_unit_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
-{
-	if (pageno < pcpu4k_unit_pages)
-		return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
-	return NULL;
+	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+				      unit_size, base);
 }
 
 /**
@@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 				   pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
+	int unit_pages;
 	size_t pages_size;
+	struct page **pages;
 	unsigned int cpu;
 	int i, j;
 	ssize_t ret;
 
-	pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
-					 PCPU_MIN_UNIT_SIZE));
+	unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
+				  PCPU_MIN_UNIT_SIZE));
 
 	/* unaligned allocations can't be freed, round up to page size */
-	pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() *
-			       sizeof(pcpu4k_pages[0]));
-	pcpu4k_pages = alloc_bootmem(pages_size);
+	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+			       sizeof(pages[0]));
+	pages = alloc_bootmem(pages_size);
 
 	/* allocate pages */
 	j = 0;
 	for_each_possible_cpu(cpu)
-		for (i = 0; i < pcpu4k_unit_pages; i++) {
+		for (i = 0; i < unit_pages; i++) {
 			void *ptr;
 
 			ptr = alloc_fn(cpu, PAGE_SIZE);
@@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 					   "4k page for cpu%u\n", cpu);
 				goto enomem;
 			}
-			pcpu4k_pages[j++] = virt_to_page(ptr);
+			pages[j++] = virt_to_page(ptr);
 		}
 
 	/* allocate vm area, map the pages and copy static data */
 	vm.flags = VM_ALLOC;
-	vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT;
+	vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
 	vm_area_register_early(&vm, PAGE_SIZE);
 
 	for_each_possible_cpu(cpu) {
 		unsigned long unit_addr = (unsigned long)vm.addr +
-			(cpu * pcpu4k_unit_pages << PAGE_SHIFT);
+			(cpu * unit_pages << PAGE_SHIFT);
 
-		for (i = 0; i < pcpu4k_unit_pages; i++)
+		for (i = 0; i < unit_pages; i++)
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 
 		/* pte already populated, the following shouldn't fail */
-		ret = __pcpu_map_pages(unit_addr,
-				       &pcpu4k_pages[cpu * pcpu4k_unit_pages],
-				       pcpu4k_unit_pages);
+		ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+				       unit_pages);
 		if (ret < 0)
 			panic("failed to map percpu area, err=%zd\n", ret);
 
@@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 
 	/* we're ready, commit */
 	pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
-		pcpu4k_unit_pages, static_size);
+		unit_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
-				     reserved_size, -1,
-				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
+	ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
+				     unit_pages << PAGE_SHIFT, vm.addr);
 	goto out_free_ar;
 
 enomem:
 	while (--j >= 0)
-		free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
+		free_fn(page_address(pages[j]), PAGE_SIZE);
 	ret = -ENOMEM;
 out_free_ar:
-	free_bootmem(__pa(pcpu4k_pages), pages_size);
+	free_bootmem(__pa(pages), pages_size);
 	return ret;
 }
 
@@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size;
 static struct pcpul_ent *pcpul_map;
 static struct vm_struct pcpul_vm;
 
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
-{
-	size_t off = (size_t)pageno << PAGE_SHIFT;
-
-	if (off >= pcpul_size)
-		return NULL;
-
-	return virt_to_page(pcpul_map[cpu].ptr + off);
-}
-
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
  * @static_size: the size of static percpu area in bytes
@@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", pcpul_vm.addr, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
-				     reserved_size, dyn_size, pcpul_unit_size,
-				     pcpul_vm.addr);
+	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+				     pcpul_unit_size, pcpul_vm.addr);
 
 	/* sort pcpul_map array for pcpu_lpage_remapped() */
 	for (i = 0; i < num_possible_cpus() - 1; i++)
-- 
cgit v1.2.3-71-gd317


From 2f39e637ea240efb74cf807d31c93a71a0b89174 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:11:00 +0900
Subject: percpu: allow non-linear / sparse cpu -> unit mapping

Currently cpu and unit are always identity mapped.  To allow more
efficient large page support on NUMA and lazy allocation for possible
but offline cpus, cpu -> unit mapping needs to be non-linear and/or
sparse.  This can be easily implemented by adding a cpu -> unit
mapping array and using it whenever looking up the matching unit for a
cpu.

The only unusal conversion is in pcpu_chunk_addr_search().  The passed
in address is unit0 based and unit0 might not be in use so it needs to
be converted to address of an in-use unit.  This is easily done by
adding the unit offset for the current processor.

[ Impact: allows non-linear/sparse cpu -> unit mapping, no visible change yet ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c |   2 +-
 include/linux/percpu.h     |   3 +-
 mm/percpu.c                | 129 +++++++++++++++++++++++++++++++++------------
 3 files changed, 97 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f2f22ee97a7a..6970333b48b8 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1516,7 +1516,7 @@ void __init setup_per_cpu_areas(void)
 
 	pcpu_unit_size = pcpu_setup_first_chunk(static_size,
 						PERCPU_MODULE_RESERVE, dyn_size,
-						PCPU_CHUNK_SIZE, vm.addr);
+						PCPU_CHUNK_SIZE, vm.addr, NULL);
 
 	free_bootmem(__pa(ptrs), ptrs_size);
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 63c8b7a23e66..1e0e8878dc2a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -57,6 +57,7 @@
 #endif
 
 extern void *pcpu_base_addr;
+extern const int *pcpu_unit_map;
 
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
@@ -66,7 +67,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 extern size_t __init pcpu_setup_first_chunk(
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size, size_t unit_size,
-				void *base_addr);
+				void *base_addr, const int *unit_map);
 
 extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
diff --git a/mm/percpu.c b/mm/percpu.c
index 21756814d99f..2196fae24f00 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
  *
  * This is percpu allocator which can handle both static and dynamic
  * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running).  Unit grows as
- * necessary and all units grow or shrink in unison.  When a chunk is
- * filled up, another chunk is allocated.  ie. in vmalloc area
+ * chunk is consisted of boot-time determined number of units and the
+ * first chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated.  ie. in
+ * vmalloc area
  *
  *  c0                           c1                         c2
  *  -------------------          -------------------        ------------
@@ -22,11 +23,13 @@
  *
  * Allocation is done in offset-size areas of single unit space.  Ie,
  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
- * percpu base registers pcpu_unit_size apart.
+ * c1:u1, c1:u2 and c1:u3.  On UMA, units corresponds directly to
+ * cpus.  On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
  *
- * There are usually many small percpu allocations many of them as
- * small as 4 bytes.  The allocator organizes chunks into lists
+ * There are usually many small percpu allocations many of them being
+ * as small as 4 bytes.  The allocator organizes chunks into lists
  * according to free size and tries to allocate from the fullest one.
  * Each chunk keeps the maximum contiguous area size hint which is
  * guaranteed to be eqaul to or larger than the maximum contiguous
@@ -99,14 +102,22 @@ struct pcpu_chunk {
 
 static int pcpu_unit_pages __read_mostly;
 static int pcpu_unit_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
 static int pcpu_chunk_size __read_mostly;
 static int pcpu_nr_slots __read_mostly;
 static size_t pcpu_chunk_struct_size __read_mostly;
 
+/* cpus with the lowest and highest unit numbers */
+static unsigned int pcpu_first_unit_cpu __read_mostly;
+static unsigned int pcpu_last_unit_cpu __read_mostly;
+
 /* the address of the first chunk which starts with the kernel static area */
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
+/* cpu -> unit map */
+const int *pcpu_unit_map __read_mostly;
+
 /*
  * The first chunk which always exists.  Note that unlike other
  * chunks, this one can be allocated and mapped in several different
@@ -177,7 +188,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 
 static int pcpu_page_idx(unsigned int cpu, int page_idx)
 {
-	return cpu * pcpu_unit_pages + page_idx;
+	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
 }
 
 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
@@ -321,6 +332,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 		return pcpu_first_chunk;
 	}
 
+	/*
+	 * The address is relative to unit0 which might be unused and
+	 * thus unmapped.  Offset the address to the unit space of the
+	 * current processor before looking it up in the vmalloc
+	 * space.  Note that any possible cpu id can be used here, so
+	 * there's no need to worry about preemption or cpu hotplug.
+	 */
+	addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
 	return pcpu_get_page_chunk(vmalloc_to_page(addr));
 }
 
@@ -593,8 +612,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
 {
 	static struct page **pages;
 	static unsigned long *bitmap;
-	size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
-			    sizeof(pages[0]);
+	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
 	size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
 			     sizeof(unsigned long);
 
@@ -692,10 +710,9 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
 static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
 				 int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-
-	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
-			   pcpu_chunk_addr(chunk, last, page_end));
+	flush_cache_vunmap(
+		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 }
 
 static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -756,10 +773,9 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
 static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
 				      int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-
-	flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
-			       pcpu_chunk_addr(chunk, last, page_end));
+	flush_tlb_kernel_range(
+		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 }
 
 static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -835,11 +851,9 @@ err:
 static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
 				int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-
-	/* flush at once, please read comments in pcpu_unmap() */
-	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
-			 pcpu_chunk_addr(chunk, last, page_end));
+	flush_cache_vmap(
+		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 }
 
 /**
@@ -953,8 +967,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
 clear:
 	for_each_possible_cpu(cpu)
-		memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
-		       size);
+		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
 	return 0;
 
 err_unmap:
@@ -1088,6 +1101,7 @@ area_found:
 
 	mutex_unlock(&pcpu_alloc_mutex);
 
+	/* return address relative to unit0 */
 	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
 
 fail_unlock:
@@ -1222,6 +1236,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
  * @base_addr: mapped address
+ * @unit_map: cpu -> unit map, NULL for sequential mapping
  *
  * Initialize the first percpu chunk which contains the kernel static
  * perpcu area.  This function is to be called from arch percpu area
@@ -1260,16 +1275,17 @@ EXPORT_SYMBOL_GPL(free_percpu);
  */
 size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 				     ssize_t dyn_size, size_t unit_size,
-				     void *base_addr)
+				     void *base_addr, const int *unit_map)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
 	size_t size_sum = static_size + reserved_size +
 			  (dyn_size >= 0 ? dyn_size : 0);
 	struct pcpu_chunk *schunk, *dchunk = NULL;
+	unsigned int cpu, tcpu;
 	int i;
 
-	/* santiy checks */
+	/* sanity checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
 		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
@@ -1278,9 +1294,52 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	BUG_ON(unit_size & ~PAGE_MASK);
 	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 
+	/* determine number of units and verify and initialize pcpu_unit_map */
+	if (unit_map) {
+		int first_unit = INT_MAX, last_unit = INT_MIN;
+
+		for_each_possible_cpu(cpu) {
+			int unit = unit_map[cpu];
+
+			BUG_ON(unit < 0);
+			for_each_possible_cpu(tcpu) {
+				if (tcpu == cpu)
+					break;
+				/* the mapping should be one-to-one */
+				BUG_ON(unit_map[tcpu] == unit);
+			}
+
+			if (unit < first_unit) {
+				pcpu_first_unit_cpu = cpu;
+				first_unit = unit;
+			}
+			if (unit > last_unit) {
+				pcpu_last_unit_cpu = cpu;
+				last_unit = unit;
+			}
+		}
+		pcpu_nr_units = last_unit + 1;
+		pcpu_unit_map = unit_map;
+	} else {
+		int *identity_map;
+
+		/* #units == #cpus, identity mapped */
+		identity_map = alloc_bootmem(num_possible_cpus() *
+					     sizeof(identity_map[0]));
+
+		for_each_possible_cpu(cpu)
+			identity_map[cpu] = cpu;
+
+		pcpu_first_unit_cpu = 0;
+		pcpu_last_unit_cpu = pcpu_nr_units - 1;
+		pcpu_nr_units = num_possible_cpus();
+		pcpu_unit_map = identity_map;
+	}
+
+	/* determine basic parameters */
 	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
-	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+	pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 
@@ -1349,7 +1408,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
 	/* we're done */
-	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
+	pcpu_base_addr = schunk->vm->addr;
 	return pcpu_unit_size;
 }
 
@@ -1427,7 +1486,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 		size_sum >> PAGE_SHIFT, base, static_size);
 
 	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				      unit_size, base);
+				      unit_size, base, NULL);
 }
 
 /**
@@ -1519,7 +1578,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 		unit_pages, static_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
-				     unit_pages << PAGE_SHIFT, vm.addr);
+				     unit_pages << PAGE_SHIFT, vm.addr, NULL);
 	goto out_free_ar;
 
 enomem:
@@ -1641,7 +1700,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 		"%zu bytes\n", pcpul_vm.addr, static_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     pcpul_unit_size, pcpul_vm.addr);
+				     pcpul_unit_size, pcpul_vm.addr, NULL);
 
 	/* sort pcpul_map array for pcpu_lpage_remapped() */
 	for (i = 0; i < num_possible_cpus() - 1; i++)
-- 
cgit v1.2.3-71-gd317


From a530b7958612bafe2027e21359083dba84f0b3b4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:11:00 +0900
Subject: percpu: teach large page allocator about NUMA

Large page first chunk allocator is primarily used for NUMA machines;
however, its NUMA handling is extremely simplistic.  Regardless of
their proximity, each cpu is put into separate large page just to
return most of the allocated space back wasting large amount of
vmalloc space and increasing cache footprint.

This patch teachs NUMA details to large page allocator.  Given
processor proximity information, pcpu_lpage_build_unit_map() will find
fitting cpu -> unit mapping in which cpus in LOCAL_DISTANCE share the
same large page and not too much virtual address space is wasted.

This greatly reduces the unit and thus chunk size and wastes much less
address space for the first chunk.  For example, on 4/4 NUMA machine,
the original code occupied 16MB of virtual space for the first chunk
while the new code only uses 4MB - one 2MB page for each node.

[ Impact: much better space efficiency on NUMA machines ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Miller <davem@davemloft.net>
---
 arch/x86/kernel/setup_percpu.c |  72 +++++++--
 include/linux/percpu.h         |  24 ++-
 mm/percpu.c                    | 358 ++++++++++++++++++++++++++++++++---------
 3 files changed, 359 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 4f2e0ac9130b..7501bb14bd51 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr)
 	set_pmd(pmd, pmd_v);
 }
 
+static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
+{
+	if (early_cpu_to_node(from) == early_cpu_to_node(to))
+		return LOCAL_DISTANCE;
+	else
+		return REMOTE_DISTANCE;
+}
+
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
+	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
+	size_t unit_map_size, unit_size;
+	int *unit_map;
+	int nr_units;
+	ssize_t ret;
+
+	/* on non-NUMA, embedding is better */
+	if (!chosen && !pcpu_need_numa())
+		return -EINVAL;
+
+	/* need PSE */
+	if (!cpu_has_pse) {
+		pr_warning("PERCPU: lpage allocator requires PSE\n");
+		return -EINVAL;
+	}
 
+	/* allocate and build unit_map */
+	unit_map_size = num_possible_cpus() * sizeof(int);
+	unit_map = alloc_bootmem_nopanic(unit_map_size);
+	if (!unit_map) {
+		pr_warning("PERCPU: failed to allocate unit_map\n");
+		return -ENOMEM;
+	}
+
+	ret = pcpu_lpage_build_unit_map(static_size,
+					PERCPU_FIRST_CHUNK_RESERVE,
+					&dyn_size, &unit_size, PMD_SIZE,
+					unit_map, pcpu_lpage_cpu_distance);
+	if (ret < 0) {
+		pr_warning("PERCPU: failed to build unit_map\n");
+		goto out_free;
+	}
+	nr_units = ret;
+
+	/* do the parameters look okay? */
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
-		size_t tot_size = num_possible_cpus() * PMD_SIZE;
-
-		/* on non-NUMA, embedding is better */
-		if (!pcpu_need_numa())
-			return -EINVAL;
+		size_t tot_size = nr_units * unit_size;
 
 		/* don't consume more than 20% of vmalloc area */
 		if (tot_size > vm_size / 5) {
 			pr_info("PERCPU: too large chunk size %zuMB for "
 				"large page remap\n", tot_size >> 20);
-			return -EINVAL;
+			ret = -EINVAL;
+			goto out_free;
 		}
 	}
 
-	/* need PSE */
-	if (!cpu_has_pse) {
-		pr_warning("PERCPU: lpage allocator requires PSE\n");
-		return -EINVAL;
-	}
-
-	return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
-				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
-				      PMD_SIZE,
-				      pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
+	ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				     dyn_size, unit_size, PMD_SIZE,
+				     unit_map, nr_units,
+				     pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
+out_free:
+	if (ret < 0)
+		free_bootmem(__pa(unit_map), unit_map_size);
+	return ret;
 }
 #else
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
@@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void)
 	/* alrighty, percpu areas up and running */
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
-		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+		per_cpu_offset(cpu) =
+			delta + pcpu_unit_map[cpu] * pcpu_unit_size;
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 1e0e8878dc2a..8ce91af4aa19 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -62,6 +62,7 @@ extern const int *pcpu_unit_map;
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
+typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(
@@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk(
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+extern int __init pcpu_lpage_build_unit_map(
+				size_t static_size, size_t reserved_size,
+				ssize_t *dyn_sizep, size_t *unit_sizep,
+				size_t lpage_size, int *unit_map,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
+
 extern ssize_t __init pcpu_lpage_first_chunk(
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, size_t lpage_size,
+				size_t dyn_size, size_t unit_size,
+				size_t lpage_size, const int *unit_map,
+				int nr_units,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn);
 
 extern void *pcpu_lpage_remapped(void *kaddr);
 #else
+static inline int pcpu_lpage_build_unit_map(
+				size_t static_size, size_t reserved_size,
+				ssize_t *dyn_sizep, size_t *unit_sizep,
+				size_t lpage_size, int *unit_map,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	return -EINVAL;
+}
+
 static inline ssize_t __init pcpu_lpage_first_chunk(
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, size_t lpage_size,
+				size_t dyn_size, size_t unit_size,
+				size_t lpage_size, const int *unit_map,
+				int nr_units,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn)
diff --git a/mm/percpu.c b/mm/percpu.c
index 2196fae24f00..b3d0bcff8c7c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -59,6 +59,7 @@
 #include <linux/bitmap.h>
 #include <linux/bootmem.h>
 #include <linux/list.h>
+#include <linux/log2.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -1594,75 +1595,259 @@ out_free_ar:
  * Large page remapping first chunk setup helper
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds cpu -> unit map and determine other parameters
+ * considering needed percpu size, large page size and distances
+ * between CPUs in NUMA.
+ *
+ * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
+ * may share units in the same large page.  The returned configuration
+ * is guaranteed to have CPUs on different nodes on different large
+ * pages and >=75% usage of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated.  -errno on failure.
+ */
+int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
+				     ssize_t *dyn_sizep, size_t *unit_sizep,
+				     size_t lpage_size, int *unit_map,
+				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	static int group_map[NR_CPUS] __initdata;
+	static int group_cnt[NR_CPUS] __initdata;
+	int group_cnt_max = 0;
+	size_t size_sum, min_unit_size, alloc_size;
+	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
+	int last_allocs;
+	unsigned int cpu, tcpu;
+	int group, unit;
+
+	/*
+	 * Determine min_unit_size, alloc_size and max_upa such that
+	 * alloc_size is multiple of lpage_size and is the smallest
+	 * which can accomodate 4k aligned segments which are equal to
+	 * or larger than min_unit_size.
+	 */
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+	alloc_size = roundup(min_unit_size, lpage_size);
+	upa = alloc_size / min_unit_size;
+	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+		upa--;
+	max_upa = upa;
+
+	/* group cpus according to their proximity */
+	for_each_possible_cpu(cpu) {
+		group = 0;
+	next_group:
+		for_each_possible_cpu(tcpu) {
+			if (cpu == tcpu)
+				break;
+			if (group_map[tcpu] == group &&
+			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+				group++;
+				goto next_group;
+			}
+		}
+		group_map[cpu] = group;
+		group_cnt[group]++;
+		group_cnt_max = max(group_cnt_max, group_cnt[group]);
+	}
+
+	/*
+	 * Expand unit size until address space usage goes over 75%
+	 * and then as much as possible without using more address
+	 * space.
+	 */
+	last_allocs = INT_MAX;
+	for (upa = max_upa; upa; upa--) {
+		int allocs = 0, wasted = 0;
+
+		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+			continue;
+
+		for (group = 0; group_cnt[group]; group++) {
+			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+			allocs += this_allocs;
+			wasted += this_allocs * upa - group_cnt[group];
+		}
+
+		/*
+		 * Don't accept if wastage is over 25%.  The
+		 * greater-than comparison ensures upa==1 always
+		 * passes the following check.
+		 */
+		if (wasted > num_possible_cpus() / 3)
+			continue;
+
+		/* and then don't consume more memory */
+		if (allocs > last_allocs)
+			break;
+		last_allocs = allocs;
+		best_upa = upa;
+	}
+	*unit_sizep = alloc_size / best_upa;
+
+	/* assign units to cpus accordingly */
+	unit = 0;
+	for (group = 0; group_cnt[group]; group++) {
+		for_each_possible_cpu(cpu)
+			if (group_map[cpu] == group)
+				unit_map[cpu] = unit++;
+		unit = roundup(unit, best_upa);
+	}
+
+	return unit;	/* unit contains aligned number of units */
+}
+
 struct pcpul_ent {
-	unsigned int	cpu;
 	void		*ptr;
+	void		*map_addr;
 };
 
 static size_t pcpul_size;
-static size_t pcpul_unit_size;
+static size_t pcpul_lpage_size;
+static int pcpul_nr_lpages;
 static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
+
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+				     unsigned int *cpup)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (unit_map[cpu] == unit) {
+			if (cpup)
+				*cpup = cpu;
+			return true;
+		}
+
+	return false;
+}
+
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+					size_t reserved_size, size_t dyn_size,
+					size_t unit_size, size_t lpage_size,
+					const int *unit_map, int nr_units)
+{
+	int width = 1, v = nr_units;
+	char empty_str[] = "--------";
+	int upl, lpl;	/* units per lpage, lpage per line */
+	unsigned int cpu;
+	int lpage, unit;
+
+	while (v /= 10)
+		width++;
+	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+	upl = max_t(int, lpage_size / unit_size, 1);
+	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+	       static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+	for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+		if (!(unit % upl)) {
+			if (!(lpage++ % lpl)) {
+				printk("\n");
+				printk("%spcpu-lpage: ", lvl);
+			} else
+				printk("| ");
+		}
+		if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+			printk("%0*d ", width, cpu);
+		else
+			printk("%s ", empty_str);
+	}
+	printk("\n");
+}
 
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
  * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes
+ * @unit_size: unit size in bytes
  * @lpage_size: the size of a large page
+ * @unit_map: cpu -> unit mapping
+ * @nr_units: the number of units
  * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
  * @free_fn: function to free percpu memory, @size <= lpage_size
  * @map_fn: function to map percpu lpage, always called with lpage_size
  *
- * This allocator uses large page as unit.  A large page is allocated
- * for each cpu and each is remapped into vmalloc area using large
- * page mapping.  As large page can be quite large, only part of it is
- * used for the first chunk.  Unused part is returned to the bootmem
- * allocator.
- *
- * So, the large pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk.  The double
- * mapping does add one more large TLB entry pressure but still is
- * much better than only using 4k mappings while still being NUMA
- * friendly.
+ * This allocator uses large page to build and map the first chunk.
+ * Unlike other helpers, the caller should always specify @dyn_size
+ * and @unit_size.  These parameters along with @unit_map and
+ * @nr_units can be determined using pcpu_lpage_build_unit_map().
+ * This two stage initialization is to allow arch code to evaluate the
+ * parameters before committing to it.
+ *
+ * Large pages are allocated as directed by @unit_map and other
+ * parameters and mapped to vmalloc space.  Unused holes are returned
+ * to the page allocator.  Note that these holes end up being actively
+ * mapped twice - once to the physical mapping and to the vmalloc area
+ * for the first percpu chunk.  Depending on architecture, this might
+ * cause problem when changing page attributes of the returned area.
+ * These double mapped areas can be detected using
+ * pcpu_lpage_remapped().
  *
  * RETURNS:
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
 ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
-				      ssize_t dyn_size, size_t lpage_size,
+				      size_t dyn_size, size_t unit_size,
+				      size_t lpage_size, const int *unit_map,
+				      int nr_units,
 				      pcpu_fc_alloc_fn_t alloc_fn,
 				      pcpu_fc_free_fn_t free_fn,
 				      pcpu_fc_map_fn_t map_fn)
 {
-	size_t size_sum;
+	static struct vm_struct vm;
+	size_t chunk_size = unit_size * nr_units;
 	size_t map_size;
 	unsigned int cpu;
-	int i, j;
 	ssize_t ret;
+	int i, j, unit;
 
-	/*
-	 * Currently supports only single page.  Supporting multiple
-	 * pages won't be too difficult if it ever becomes necessary.
-	 */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+	pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
+			     unit_size, lpage_size, unit_map, nr_units);
 
-	pcpul_unit_size = lpage_size;
-	pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-	if (pcpul_size > pcpul_unit_size) {
-		pr_warning("PERCPU: static data is larger than large page, "
-			   "can't use large page\n");
-		return -EINVAL;
-	}
+	BUG_ON(chunk_size % lpage_size);
+
+	pcpul_size = static_size + reserved_size + dyn_size;
+	pcpul_lpage_size = lpage_size;
+	pcpul_nr_lpages = chunk_size / lpage_size;
 
 	/* allocate pointer array and alloc large pages */
-	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+	map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
 	pcpul_map = alloc_bootmem(map_size);
 
-	for_each_possible_cpu(cpu) {
+	/* allocate all pages */
+	for (i = 0; i < pcpul_nr_lpages; i++) {
+		size_t offset = i * lpage_size;
+		int first_unit = offset / unit_size;
+		int last_unit = (offset + lpage_size - 1) / unit_size;
 		void *ptr;
 
+		/* find out which cpu is mapped to this unit */
+		for (unit = first_unit; unit <= last_unit; unit++)
+			if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+				goto found;
+		continue;
+	found:
 		ptr = alloc_fn(cpu, lpage_size);
 		if (!ptr) {
 			pr_warning("PERCPU: failed to allocate large page "
@@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 			goto enomem;
 		}
 
-		/*
-		 * Only use pcpul_size bytes and give back the rest.
-		 *
-		 * Ingo: The lpage_size up-rounding bootmem is needed
-		 * to make sure the partial lpage is still fully RAM -
-		 * it's not well-specified to have a incompatible area
-		 * (unmapped RAM, device memory, etc.) in that hole.
-		 */
-		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);
-
-		pcpul_map[cpu].cpu = cpu;
-		pcpul_map[cpu].ptr = ptr;
+		pcpul_map[i].ptr = ptr;
+	}
 
-		memcpy(ptr, __per_cpu_load, static_size);
+	/* return unused holes */
+	for (unit = 0; unit < nr_units; unit++) {
+		size_t start = unit * unit_size;
+		size_t end = start + unit_size;
+		size_t off, next;
+
+		/* don't free used part of occupied unit */
+		if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+			start += pcpul_size;
+
+		/* unit can span more than one page, punch the holes */
+		for (off = start; off < end; off = next) {
+			void *ptr = pcpul_map[off / lpage_size].ptr;
+			next = min(roundup(off + 1, lpage_size), end);
+			if (ptr)
+				free_fn(ptr + off % lpage_size, next - off);
+		}
 	}
 
-	/* allocate address and map */
-	pcpul_vm.flags = VM_ALLOC;
-	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
-	vm_area_register_early(&pcpul_vm, pcpul_unit_size);
+	/* allocate address, map and copy */
+	vm.flags = VM_ALLOC;
+	vm.size = chunk_size;
+	vm_area_register_early(&vm, unit_size);
+
+	for (i = 0; i < pcpul_nr_lpages; i++) {
+		if (!pcpul_map[i].ptr)
+			continue;
+		pcpul_map[i].map_addr = vm.addr + i * lpage_size;
+		map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
+	}
 
 	for_each_possible_cpu(cpu)
-		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
-		       pcpul_vm.addr + cpu * pcpul_unit_size);
+		memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
+		       static_size);
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", pcpul_vm.addr, static_size);
+		"%zu bytes\n", vm.addr, static_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     pcpul_unit_size, pcpul_vm.addr, NULL);
-
-	/* sort pcpul_map array for pcpu_lpage_remapped() */
-	for (i = 0; i < num_possible_cpus() - 1; i++)
-		for (j = i + 1; j < num_possible_cpus(); j++)
-			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
-				struct pcpul_ent tmp = pcpul_map[i];
-				pcpul_map[i] = pcpul_map[j];
-				pcpul_map[j] = tmp;
-			}
+				     unit_size, vm.addr, unit_map);
+
+	/*
+	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
+	 * lpages are pushed to the end and trimmed.
+	 */
+	for (i = 0; i < pcpul_nr_lpages - 1; i++)
+		for (j = i + 1; j < pcpul_nr_lpages; j++) {
+			struct pcpul_ent tmp;
+
+			if (!pcpul_map[j].ptr)
+				continue;
+			if (pcpul_map[i].ptr &&
+			    pcpul_map[i].ptr < pcpul_map[j].ptr)
+				continue;
+
+			tmp = pcpul_map[i];
+			pcpul_map[i] = pcpul_map[j];
+			pcpul_map[j] = tmp;
+		}
+
+	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
+		pcpul_nr_lpages--;
 
 	return ret;
 
 enomem:
-	for_each_possible_cpu(cpu)
-		if (pcpul_map[cpu].ptr)
-			free_fn(pcpul_map[cpu].ptr, pcpul_size);
+	for (i = 0; i < pcpul_nr_lpages; i++)
+		if (pcpul_map[i].ptr)
+			free_fn(pcpul_map[i].ptr, lpage_size);
 	free_bootmem(__pa(pcpul_map), map_size);
 	return -ENOMEM;
 }
@@ -1739,10 +1950,10 @@ enomem:
  */
 void *pcpu_lpage_remapped(void *kaddr)
 {
-	unsigned long unit_mask = pcpul_unit_size - 1;
-	void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
-	unsigned long offset = (unsigned long)kaddr & unit_mask;
-	int left = 0, right = num_possible_cpus() - 1;
+	unsigned long lpage_mask = pcpul_lpage_size - 1;
+	void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
+	unsigned long offset = (unsigned long)kaddr & lpage_mask;
+	int left = 0, right = pcpul_nr_lpages - 1;
 	int pos;
 
 	/* pcpul in use at all? */
@@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr)
 			left = pos + 1;
 		else if (pcpul_map[pos].ptr > lpage_addr)
 			right = pos - 1;
-		else {
-			/* it shouldn't be in the area for the first chunk */
-			WARN_ON(offset < pcpul_size);
-
-			return pcpul_vm.addr +
-				pcpul_map[pos].cpu * pcpul_unit_size + offset;
-		}
+		else
+			return pcpul_map[pos].map_addr + offset;
 	}
 
 	return NULL;
-- 
cgit v1.2.3-71-gd317


From 7a6d3c8b3049d07123628f2bf57127bba2cc878f Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Wed, 1 Jul 2009 17:28:41 -0700
Subject: fuse: make the number of max background requests and congestion
 threshold tunable

The practical values for these limits depend on the design of the
filesystem server so let userspace set them at initialization time.

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c        | 10 +++++-----
 fs/fuse/fuse_i.h     | 12 ++++++------
 fs/fuse/inode.c      | 14 ++++++++++++++
 include/linux/fuse.h |  9 +++++++--
 4 files changed, 32 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f58ecbc416c8..b152761c1bf6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 
 static void flush_bg_queue(struct fuse_conn *fc)
 {
-	while (fc->active_background < FUSE_MAX_BACKGROUND &&
+	while (fc->active_background < fc->max_background &&
 	       !list_empty(&fc->bg_queue)) {
 		struct fuse_req *req;
 
@@ -280,11 +280,11 @@ __releases(&fc->lock)
 	list_del(&req->intr_entry);
 	req->state = FUSE_REQ_FINISHED;
 	if (req->background) {
-		if (fc->num_background == FUSE_MAX_BACKGROUND) {
+		if (fc->num_background == fc->max_background) {
 			fc->blocked = 0;
 			wake_up_all(&fc->blocked_waitq);
 		}
-		if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+		if (fc->num_background == fc->congestion_threshold &&
 		    fc->connected && fc->bdi_initialized) {
 			clear_bdi_congested(&fc->bdi, READ);
 			clear_bdi_congested(&fc->bdi, WRITE);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 {
 	req->background = 1;
 	fc->num_background++;
-	if (fc->num_background == FUSE_MAX_BACKGROUND)
+	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
-	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+	if (fc->num_background == fc->congestion_threshold &&
 	    fc->bdi_initialized) {
 		set_bdi_congested(&fc->bdi, READ);
 		set_bdi_congested(&fc->bdi, WRITE);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..6bcfab04396f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
 
-/** Maximum number of outstanding background requests */
-#define FUSE_MAX_BACKGROUND 12
-
-/** Congestion starts at 75% of maximum */
-#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
-
 /** Bias for fi->writectr, meaning new writepages must not be sent */
 #define FUSE_NOWRITE INT_MIN
 
@@ -349,6 +343,12 @@ struct fuse_conn {
 	/** rbtree of fuse_files waiting for poll events indexed by ph */
 	struct rb_root polled_files;
 
+	/** Maximum number of outstanding background requests */
+	unsigned max_background;
+
+	/** Number of background requests at which congestion starts */
+	unsigned congestion_threshold;
+
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..9aa6f46d0c32 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -32,6 +32,12 @@ DEFINE_MUTEX(fuse_mutex);
 
 #define FUSE_DEFAULT_BLKSIZE 512
 
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
 struct fuse_mount_data {
 	int fd;
 	unsigned rootmode;
@@ -517,6 +523,8 @@ void fuse_conn_init(struct fuse_conn *fc)
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
 	atomic_set(&fc->num_waiting, 0);
+	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
 	fc->reqctr = 0;
@@ -736,6 +744,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 	else {
 		unsigned long ra_pages;
 
+		if (arg->minor >= 13) {
+			if (arg->max_background)
+				fc->max_background = arg->max_background;
+			if (arg->congestion_threshold)
+				fc->congestion_threshold = arg->congestion_threshold;
+		}
 		if (arg->minor >= 6) {
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index cf593bf9fd32..b3700f0ac268 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -30,6 +30,10 @@
  *  - add umask flag to input argument of open, mknod and mkdir
  *  - add notification messages for invalidation of inodes and
  *    directory entries
+ *
+ * 7.13
+ *  - make max number of background requests and congestion threshold
+ *    tunables
  */
 
 #ifndef _LINUX_FUSE_H
@@ -41,7 +45,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 12
+#define FUSE_KERNEL_MINOR_VERSION 13
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -427,7 +431,8 @@ struct fuse_init_out {
 	__u32	minor;
 	__u32	max_readahead;
 	__u32	flags;
-	__u32	unused;
+	__u16   max_background;
+	__u16   congestion_threshold;
 	__u32	max_write;
 };
 
-- 
cgit v1.2.3-71-gd317


From 37d217f029a56a6d385f99773fb27dfcb51f9a46 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 8 Jul 2009 18:17:58 +0200
Subject: fuse: document protocol version negotiation

Clarify how the protocol version should be negotiated between kernel
and userspace.  Notably libfuse didn't correctly handle the case when
the supported major versions didn't match.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 include/linux/fuse.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index b3700f0ac268..3e2925a34bf0 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -41,6 +41,26 @@
 
 #include <linux/types.h>
 
+/*
+ * Version negotiation:
+ *
+ * Both the kernel and userspace send the version they support in the
+ * INIT request and reply respectively.
+ *
+ * If the major versions match then both shall use the smallest
+ * of the two minor versions for communication.
+ *
+ * If the kernel supports a larger major version, then userspace shall
+ * reply with the major version it supports, ignore the rest of the
+ * INIT message and expect a new INIT message from the kernel with a
+ * matching major version.
+ *
+ * If the library supports a larger major version, then it shall fall
+ * back to the major protocol version sent by the kernel for
+ * communication and reply with that major version (and an arbitrary
+ * supported minor version).
+ */
+
 /** Version number of this interface */
 #define FUSE_KERNEL_VERSION 7
 
-- 
cgit v1.2.3-71-gd317


From 5a421ce3c062a87db0a9e7f2a0a7ee0a5b869aab Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 10 Jul 2009 12:37:40 +0300
Subject: nfsd41: gather and report statistics also for v4.1 ops

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/nfs4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index bd2eba530667..aff924a24abb 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -234,7 +234,7 @@ enum nfs_opnum4 {
 Needs to be updated if more operations are defined in future.*/
 
 #define FIRST_NFS4_OP	OP_ACCESS
-#define LAST_NFS4_OP 	OP_RELEASE_LOCKOWNER
+#define LAST_NFS4_OP 	OP_RECLAIM_COMPLETE
 
 enum nfsstat4 {
 	NFS4_OK = 0,
-- 
cgit v1.2.3-71-gd317


From 4bd9b0f4afc76cf972578c702e1bc1b6f2d10ba5 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 24 Jun 2009 15:37:45 -0400
Subject: nfsd41: use globals for DRC limits

The version 4.1 DRC memory limit and tracking variables are server wide and
session specific. Replace struct svc_serv fields with globals.
Stop using the svc_serv sv_lock.

Add a spinlock to serialize access to the DRC limit management variables which
change on session creation and deletion (usage counter) or (future)
administrative action to adjust the total DRC memory limit.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4state.c        | 10 +++++-----
 fs/nfsd/nfssvc.c           | 19 +++++++++++++++----
 include/linux/nfsd/nfsd.h  |  3 +++
 include/linux/sunrpc/svc.h |  2 --
 4 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..2e6a44e3d2fe 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -430,11 +430,11 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
 	else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
 		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
 
-	spin_lock(&nfsd_serv->sv_lock);
-	if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
-		np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
-	nfsd_serv->sv_drc_pages_used += np;
-	spin_unlock(&nfsd_serv->sv_lock);
+	spin_lock(&nfsd_drc_lock);
+	if (np + nfsd_drc_pages_used > nfsd_drc_max_pages)
+		np = nfsd_drc_max_pages - nfsd_drc_pages_used;
+	nfsd_drc_pages_used += np;
+	spin_unlock(&nfsd_drc_lock);
 
 	if (np <= 0) {
 		status = nfserr_resource;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d4c9884cd54b..78d8fcd883fb 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -67,6 +67,16 @@ struct timeval			nfssvc_boot;
 DEFINE_MUTEX(nfsd_mutex);
 struct svc_serv 		*nfsd_serv;
 
+/*
+ * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
+ * nfsd_drc_max_pages limits the total amount of memory available for
+ * version 4.1 DRC caches.
+ * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
+ */
+spinlock_t	nfsd_drc_lock;
+unsigned int	nfsd_drc_max_pages;
+unsigned int	nfsd_drc_pages_used;
+
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat	nfsd_acl_svcstats;
 static struct svc_version *	nfsd_acl_version[] = {
@@ -238,11 +248,12 @@ static void set_max_drc(void)
 {
 	/* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
 	#define NFSD_DRC_SIZE_SHIFT	7
-	nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
+	nfsd_drc_max_pages = nr_free_buffer_pages()
 						>> NFSD_DRC_SIZE_SHIFT;
-	nfsd_serv->sv_drc_pages_used = 0;
-	dprintk("%s svc_drc_max_pages %u\n", __func__,
-		nfsd_serv->sv_drc_max_pages);
+	nfsd_drc_pages_used = 0;
+	spin_lock_init(&nfsd_drc_lock);
+	dprintk("%s nfsd_drc_max_pages %u\n", __func__,
+		nfsd_drc_max_pages);
 }
 
 int nfsd_create_serv(void)
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 2b49d676d0c9..2571f856908f 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -56,6 +56,9 @@ extern struct svc_version	nfsd_version2, nfsd_version3,
 extern u32			nfsd_supported_minorversion;
 extern struct mutex		nfsd_mutex;
 extern struct svc_serv		*nfsd_serv;
+extern spinlock_t		nfsd_drc_lock;
+extern unsigned int		nfsd_drc_max_pages;
+extern unsigned int		nfsd_drc_pages_used;
 
 extern struct seq_operations nfs_exports_op;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index ea8009695c69..52e8cb0a7569 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -94,8 +94,6 @@ struct svc_serv {
 	struct module *		sv_module;	/* optional module to count when
 						 * adding threads */
 	svc_thread_fn		sv_function;	/* main function for threads */
-	unsigned int		sv_drc_max_pages; /* Total pages for DRC */
-	unsigned int		sv_drc_pages_used;/* DRC pages used */
 #if defined(CONFIG_NFS_V4_1)
 	struct list_head	sv_cb_list;	/* queue for callback requests
 						 * that arrive over the same
-- 
cgit v1.2.3-71-gd317


From 3d39cecc4841e8d4c4abdb401d10180f5faaded0 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 8 Jul 2009 15:23:30 +0100
Subject: intel-iommu: Remove superfluous iova_alloc_lock from IOVA code

We only ever obtain this lock immediately before the iova_rbtree_lock,
and release it immediately after the iova_rbtree_lock. So ditch it and
just use iova_rbtree_lock.

[v2: Remove the lockdep bits this time too]
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c |  3 ---
 drivers/pci/iova.c        | 16 ++++------------
 include/linux/iova.h      |  1 -
 3 files changed, 4 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index c5f7c73cbb55..d6a857397ec3 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1309,7 +1309,6 @@ static void iommu_detach_domain(struct dmar_domain *domain,
 }
 
 static struct iova_domain reserved_iova_list;
-static struct lock_class_key reserved_alloc_key;
 static struct lock_class_key reserved_rbtree_key;
 
 static void dmar_init_reserved_ranges(void)
@@ -1320,8 +1319,6 @@ static void dmar_init_reserved_ranges(void)
 
 	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
 
-	lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
-		&reserved_alloc_key);
 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
 		&reserved_rbtree_key);
 
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
index 46dd440e2315..7914951ef29a 100644
--- a/drivers/pci/iova.c
+++ b/drivers/pci/iova.c
@@ -22,7 +22,6 @@
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
 {
-	spin_lock_init(&iovad->iova_alloc_lock);
 	spin_lock_init(&iovad->iova_rbtree_lock);
 	iovad->rbroot = RB_ROOT;
 	iovad->cached32_node = NULL;
@@ -205,7 +204,6 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 	unsigned long limit_pfn,
 	bool size_aligned)
 {
-	unsigned long flags;
 	struct iova *new_iova;
 	int ret;
 
@@ -219,11 +217,9 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 	if (size_aligned)
 		size = __roundup_pow_of_two(size);
 
-	spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
 	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
 			new_iova, size_aligned);
 
-	spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
 	if (ret) {
 		free_iova_mem(new_iova);
 		return NULL;
@@ -381,8 +377,7 @@ reserve_iova(struct iova_domain *iovad,
 	struct iova *iova;
 	unsigned int overlap = 0;
 
-	spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
-	spin_lock(&iovad->iova_rbtree_lock);
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
 	for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
 		if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
 			iova = container_of(node, struct iova, node);
@@ -402,8 +397,7 @@ reserve_iova(struct iova_domain *iovad,
 	iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
 finish:
 
-	spin_unlock(&iovad->iova_rbtree_lock);
-	spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 	return iova;
 }
 
@@ -420,8 +414,7 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
 	unsigned long flags;
 	struct rb_node *node;
 
-	spin_lock_irqsave(&from->iova_alloc_lock, flags);
-	spin_lock(&from->iova_rbtree_lock);
+	spin_lock_irqsave(&from->iova_rbtree_lock, flags);
 	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
 		struct iova *iova = container_of(node, struct iova, node);
 		struct iova *new_iova;
@@ -430,6 +423,5 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
 			printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
 				iova->pfn_lo, iova->pfn_lo);
 	}
-	spin_unlock(&from->iova_rbtree_lock);
-	spin_unlock_irqrestore(&from->iova_alloc_lock, flags);
+	spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
 }
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 228f6c94b69c..76a0759e88ec 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -28,7 +28,6 @@ struct iova {
 
 /* holds all the iova translations for a domain */
 struct iova_domain {
-	spinlock_t	iova_alloc_lock;/* Lock to protect iova  allocation */
 	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
 	struct rb_root	rbroot;		/* iova domain rbtree root */
 	struct rb_node	*cached32_node; /* Save last alloced node */
-- 
cgit v1.2.3-71-gd317


From a76761b621bcd8336065c4fe3a74f046858bc34c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 Jul 2009 23:35:14 +0900
Subject: percpu: add dummy pcpu_lpage_remapped() for !CONFIG_SMP

!CONFIG_SMP was missing pcpu_lpage_remapped() definition causing build
failure.  Add dummy implementation.  This was discovered by linux-next
testing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
---
 include/linux/percpu.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8ce91af4aa19..e134c8229631 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -184,6 +184,11 @@ static inline void free_percpu(void *p)
 
 static inline void __init setup_per_cpu_areas(void) { }
 
+static inline void *pcpu_lpage_remapped(void *kaddr)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_SMP */
 
 #define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type), \
-- 
cgit v1.2.3-71-gd317


From 719a72b7c75bb239ca6184190ab994b71a31c6dc Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 17 Jul 2009 14:59:55 +0000
Subject: usb: r8a66597-hcd platform data on_chip support

Convert the r8a66597-hcd driver to use the on_chip flag
from platform data to enable on chip behaviour instead
of relying on CONFIG_SUPERH_ON_CHIP_R8A66597 ugliness.

This makes the code cleaner and also allows us to support
both external and internal r8a66597 with the same kernel.

It also makes the Kconfig part more future proof since
we with this patch can add support for new processors
with on-chip r8a66597 without modifying the Kconfig.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-se/7724/setup.c    |   1 +
 arch/sh/kernel/cpu/sh4a/setup-sh7366.c |   2 +-
 arch/sh/kernel/cpu/sh4a/setup-sh7723.c |   2 +-
 drivers/usb/host/Kconfig               |   7 --
 drivers/usb/host/r8a66597-hcd.c        | 187 +++++++++++++++++++--------------
 drivers/usb/host/r8a66597.h            |  76 ++++++--------
 include/linux/usb/r8a66597.h           |   3 +
 7 files changed, 147 insertions(+), 131 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
index 8fed45a2fb85..4fb7e48e2843 100644
--- a/arch/sh/boards/mach-se/7724/setup.c
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -304,6 +304,7 @@ static struct platform_device sh_eth_device = {
 };
 
 static struct r8a66597_platdata sh7724_usb0_host_data = {
+	.on_chip = 1,
 };
 
 static struct resource sh7724_usb0_host_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index c18f7d09281b..f6d208813564 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -40,7 +40,7 @@ static struct platform_device iic_device = {
 };
 
 static struct r8a66597_platdata r8a66597_data = {
-	/* This set zero to all members */
+	.on_chip = 1,
 };
 
 static struct resource usb_host_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index e1bb80b2a27b..28516499a2c4 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -398,7 +398,7 @@ static struct platform_device rtc_device = {
 };
 
 static struct r8a66597_platdata r8a66597_data = {
-	/* This set zero to all members */
+	.on_chip = 1,
 };
 
 static struct resource sh7723_usb_host_resources[] = {
diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
index 1a920c70b5a1..f21ca7d27a43 100644
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig
@@ -336,13 +336,6 @@ config USB_R8A66597_HCD
 	  To compile this driver as a module, choose M here: the
 	  module will be called r8a66597-hcd.
 
-config SUPERH_ON_CHIP_R8A66597
-	boolean "Enable SuperH on-chip R8A66597 USB"
-	depends on USB_R8A66597_HCD && (CPU_SUBTYPE_SH7366 || CPU_SUBTYPE_SH7723 || CPU_SUBTYPE_SH7724)
-	help
-	   This driver enables support for the on-chip R8A66597 in the
-	   SH7366, SH7723 and SH7724 processors.
-
 config USB_WHCI_HCD
 	tristate "Wireless USB Host Controller Interface (WHCI) driver (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c
index 09895a97c10b..82dce3e0d4d7 100644
--- a/drivers/usb/host/r8a66597-hcd.c
+++ b/drivers/usb/host/r8a66597-hcd.c
@@ -91,43 +91,43 @@ static int r8a66597_clock_enable(struct r8a66597 *r8a66597)
 	u16 tmp;
 	int i = 0;
 
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597)
-#if defined(CONFIG_HAVE_CLK)
-	clk_enable(r8a66597->clk);
+	if (r8a66597->pdata->on_chip) {
+#ifdef CONFIG_HAVE_CLK
+		clk_enable(r8a66597->clk);
 #endif
-	do {
-		r8a66597_write(r8a66597, SCKE, SYSCFG0);
-		tmp = r8a66597_read(r8a66597, SYSCFG0);
-		if (i++ > 1000) {
-			printk(KERN_ERR "r8a66597: register access fail.\n");
-			return -ENXIO;
-		}
-	} while ((tmp & SCKE) != SCKE);
-	r8a66597_write(r8a66597, 0x04, 0x02);
-#else
-	do {
-		r8a66597_write(r8a66597, USBE, SYSCFG0);
-		tmp = r8a66597_read(r8a66597, SYSCFG0);
-		if (i++ > 1000) {
-			printk(KERN_ERR "r8a66597: register access fail.\n");
-			return -ENXIO;
-		}
-	} while ((tmp & USBE) != USBE);
-	r8a66597_bclr(r8a66597, USBE, SYSCFG0);
-	r8a66597_mdfy(r8a66597, get_xtal_from_pdata(r8a66597->pdata), XTAL,
-			SYSCFG0);
+		do {
+			r8a66597_write(r8a66597, SCKE, SYSCFG0);
+			tmp = r8a66597_read(r8a66597, SYSCFG0);
+			if (i++ > 1000) {
+				printk(KERN_ERR "r8a66597: reg access fail.\n");
+				return -ENXIO;
+			}
+		} while ((tmp & SCKE) != SCKE);
+		r8a66597_write(r8a66597, 0x04, 0x02);
+	} else {
+		do {
+			r8a66597_write(r8a66597, USBE, SYSCFG0);
+			tmp = r8a66597_read(r8a66597, SYSCFG0);
+			if (i++ > 1000) {
+				printk(KERN_ERR "r8a66597: reg access fail.\n");
+				return -ENXIO;
+			}
+		} while ((tmp & USBE) != USBE);
+		r8a66597_bclr(r8a66597, USBE, SYSCFG0);
+		r8a66597_mdfy(r8a66597, get_xtal_from_pdata(r8a66597->pdata),
+			      XTAL, SYSCFG0);
 
-	i = 0;
-	r8a66597_bset(r8a66597, XCKE, SYSCFG0);
-	do {
-		msleep(1);
-		tmp = r8a66597_read(r8a66597, SYSCFG0);
-		if (i++ > 500) {
-			printk(KERN_ERR "r8a66597: register access fail.\n");
-			return -ENXIO;
-		}
-	} while ((tmp & SCKE) != SCKE);
-#endif	/* #if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) */
+		i = 0;
+		r8a66597_bset(r8a66597, XCKE, SYSCFG0);
+		do {
+			msleep(1);
+			tmp = r8a66597_read(r8a66597, SYSCFG0);
+			if (i++ > 500) {
+				printk(KERN_ERR "r8a66597: reg access fail.\n");
+				return -ENXIO;
+			}
+		} while ((tmp & SCKE) != SCKE);
+	}
 
 	return 0;
 }
@@ -136,15 +136,16 @@ static void r8a66597_clock_disable(struct r8a66597 *r8a66597)
 {
 	r8a66597_bclr(r8a66597, SCKE, SYSCFG0);
 	udelay(1);
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597)
-#if defined(CONFIG_HAVE_CLK)
-	clk_disable(r8a66597->clk);
-#endif
-#else
-	r8a66597_bclr(r8a66597, PLLC, SYSCFG0);
-	r8a66597_bclr(r8a66597, XCKE, SYSCFG0);
-	r8a66597_bclr(r8a66597, USBE, SYSCFG0);
+
+	if (r8a66597->pdata->on_chip) {
+#ifdef CONFIG_HAVE_CLK
+		clk_disable(r8a66597->clk);
 #endif
+	} else {
+		r8a66597_bclr(r8a66597, PLLC, SYSCFG0);
+		r8a66597_bclr(r8a66597, XCKE, SYSCFG0);
+		r8a66597_bclr(r8a66597, USBE, SYSCFG0);
+	}
 }
 
 static void r8a66597_enable_port(struct r8a66597 *r8a66597, int port)
@@ -205,7 +206,7 @@ static int enable_controller(struct r8a66597 *r8a66597)
 
 	r8a66597_bset(r8a66597, SIGNE | SACKE, INTENB1);
 
-	for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++)
+	for (port = 0; port < r8a66597->max_root_hub; port++)
 		r8a66597_enable_port(r8a66597, port);
 
 	return 0;
@@ -218,7 +219,7 @@ static void disable_controller(struct r8a66597 *r8a66597)
 	r8a66597_write(r8a66597, 0, INTENB0);
 	r8a66597_write(r8a66597, 0, INTSTS0);
 
-	for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++)
+	for (port = 0; port < r8a66597->max_root_hub; port++)
 		r8a66597_disable_port(r8a66597, port);
 
 	r8a66597_clock_disable(r8a66597);
@@ -249,11 +250,12 @@ static int is_hub_limit(char *devpath)
 	return ((strlen(devpath) >= 4) ? 1 : 0);
 }
 
-static void get_port_number(char *devpath, u16 *root_port, u16 *hub_port)
+static void get_port_number(struct r8a66597 *r8a66597,
+			    char *devpath, u16 *root_port, u16 *hub_port)
 {
 	if (root_port) {
 		*root_port = (devpath[0] & 0x0F) - 1;
-		if (*root_port >= R8A66597_MAX_ROOT_HUB)
+		if (*root_port >= r8a66597->max_root_hub)
 			printk(KERN_ERR "r8a66597: Illegal root port number.\n");
 	}
 	if (hub_port)
@@ -355,7 +357,8 @@ static int make_r8a66597_device(struct r8a66597 *r8a66597,
 	INIT_LIST_HEAD(&dev->device_list);
 	list_add_tail(&dev->device_list, &r8a66597->child_device);
 
-	get_port_number(urb->dev->devpath, &dev->root_port, &dev->hub_port);
+	get_port_number(r8a66597, urb->dev->devpath,
+			&dev->root_port, &dev->hub_port);
 	if (!is_child_device(urb->dev->devpath))
 		r8a66597->root_hub[dev->root_port].dev = dev;
 
@@ -420,7 +423,7 @@ static void free_usb_address(struct r8a66597 *r8a66597,
 	list_del(&dev->device_list);
 	kfree(dev);
 
-	for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) {
+	for (port = 0; port < r8a66597->max_root_hub; port++) {
 		if (r8a66597->root_hub[port].dev == dev) {
 			r8a66597->root_hub[port].dev = NULL;
 			break;
@@ -495,10 +498,20 @@ static void r8a66597_pipe_toggle(struct r8a66597 *r8a66597,
 		r8a66597_bset(r8a66597, SQCLR, pipe->pipectr);
 }
 
+static inline unsigned short mbw_value(struct r8a66597 *r8a66597)
+{
+	if (r8a66597->pdata->on_chip)
+		return MBW_32;
+	else
+		return MBW_16;
+}
+
 /* this function must be called with interrupt disabled */
 static inline void cfifo_change(struct r8a66597 *r8a66597, u16 pipenum)
 {
-	r8a66597_mdfy(r8a66597, MBW | pipenum, MBW | CURPIPE, CFIFOSEL);
+	unsigned short mbw = mbw_value(r8a66597);
+
+	r8a66597_mdfy(r8a66597, mbw | pipenum, mbw | CURPIPE, CFIFOSEL);
 	r8a66597_reg_wait(r8a66597, CFIFOSEL, CURPIPE, pipenum);
 }
 
@@ -506,11 +519,13 @@ static inline void cfifo_change(struct r8a66597 *r8a66597, u16 pipenum)
 static inline void fifo_change_from_pipe(struct r8a66597 *r8a66597,
 					 struct r8a66597_pipe *pipe)
 {
+	unsigned short mbw = mbw_value(r8a66597);
+
 	cfifo_change(r8a66597, 0);
-	r8a66597_mdfy(r8a66597, MBW | 0, MBW | CURPIPE, D0FIFOSEL);
-	r8a66597_mdfy(r8a66597, MBW | 0, MBW | CURPIPE, D1FIFOSEL);
+	r8a66597_mdfy(r8a66597, mbw | 0, mbw | CURPIPE, D0FIFOSEL);
+	r8a66597_mdfy(r8a66597, mbw | 0, mbw | CURPIPE, D1FIFOSEL);
 
-	r8a66597_mdfy(r8a66597, MBW | pipe->info.pipenum, MBW | CURPIPE,
+	r8a66597_mdfy(r8a66597, mbw | pipe->info.pipenum, mbw | CURPIPE,
 		      pipe->fifosel);
 	r8a66597_reg_wait(r8a66597, pipe->fifosel, CURPIPE, pipe->info.pipenum);
 }
@@ -742,9 +757,13 @@ static void enable_r8a66597_pipe_dma(struct r8a66597 *r8a66597,
 				     struct r8a66597_pipe *pipe,
 				     struct urb *urb)
 {
-#if !defined(CONFIG_SUPERH_ON_CHIP_R8A66597)
 	int i;
 	struct r8a66597_pipe_info *info = &pipe->info;
+	unsigned short mbw = mbw_value(r8a66597);
+
+	/* pipe dma is only for external controlles */
+	if (r8a66597->pdata->on_chip)
+		return;
 
 	if ((pipe->info.pipenum != 0) && (info->type != R8A66597_INT)) {
 		for (i = 0; i < R8A66597_MAX_DMA_CHANNEL; i++) {
@@ -763,8 +782,8 @@ static void enable_r8a66597_pipe_dma(struct r8a66597 *r8a66597,
 			set_pipe_reg_addr(pipe, i);
 
 			cfifo_change(r8a66597, 0);
-			r8a66597_mdfy(r8a66597, MBW | pipe->info.pipenum,
-				      MBW | CURPIPE, pipe->fifosel);
+			r8a66597_mdfy(r8a66597, mbw | pipe->info.pipenum,
+				      mbw | CURPIPE, pipe->fifosel);
 
 			r8a66597_reg_wait(r8a66597, pipe->fifosel, CURPIPE,
 					  pipe->info.pipenum);
@@ -772,7 +791,6 @@ static void enable_r8a66597_pipe_dma(struct r8a66597 *r8a66597,
 			break;
 		}
 	}
-#endif	/* #if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) */
 }
 
 /* this function must be called with interrupt disabled */
@@ -1769,7 +1787,7 @@ static void r8a66597_timer(unsigned long _r8a66597)
 
 	spin_lock_irqsave(&r8a66597->lock, flags);
 
-	for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++)
+	for (port = 0; port < r8a66597->max_root_hub; port++)
 		r8a66597_root_hub_control(r8a66597, port);
 
 	spin_unlock_irqrestore(&r8a66597->lock, flags);
@@ -1807,7 +1825,7 @@ static void set_address_zero(struct r8a66597 *r8a66597, struct urb *urb)
 	u16 root_port, hub_port;
 
 	if (usb_address == 0) {
-		get_port_number(urb->dev->devpath,
+		get_port_number(r8a66597, urb->dev->devpath,
 				&root_port, &hub_port);
 		set_devadd_reg(r8a66597, 0,
 			       get_r8a66597_usb_speed(urb->dev->speed),
@@ -2082,7 +2100,7 @@ static int r8a66597_hub_status_data(struct usb_hcd *hcd, char *buf)
 
 	*buf = 0;	/* initialize (no change) */
 
-	for (i = 0; i < R8A66597_MAX_ROOT_HUB; i++) {
+	for (i = 0; i < r8a66597->max_root_hub; i++) {
 		if (r8a66597->root_hub[i].port & 0xffff0000)
 			*buf |= 1 << (i + 1);
 	}
@@ -2097,11 +2115,11 @@ static void r8a66597_hub_descriptor(struct r8a66597 *r8a66597,
 {
 	desc->bDescriptorType = 0x29;
 	desc->bHubContrCurrent = 0;
-	desc->bNbrPorts = R8A66597_MAX_ROOT_HUB;
+	desc->bNbrPorts = r8a66597->max_root_hub;
 	desc->bDescLength = 9;
 	desc->bPwrOn2PwrGood = 0;
 	desc->wHubCharacteristics = cpu_to_le16(0x0011);
-	desc->bitmap[0] = ((1 << R8A66597_MAX_ROOT_HUB) - 1) << 1;
+	desc->bitmap[0] = ((1 << r8a66597->max_root_hub) - 1) << 1;
 	desc->bitmap[1] = ~0;
 }
 
@@ -2129,7 +2147,7 @@ static int r8a66597_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
 		}
 		break;
 	case ClearPortFeature:
-		if (wIndex > R8A66597_MAX_ROOT_HUB)
+		if (wIndex > r8a66597->max_root_hub)
 			goto error;
 		if (wLength != 0)
 			goto error;
@@ -2162,12 +2180,12 @@ static int r8a66597_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
 		*buf = 0x00;
 		break;
 	case GetPortStatus:
-		if (wIndex > R8A66597_MAX_ROOT_HUB)
+		if (wIndex > r8a66597->max_root_hub)
 			goto error;
 		*(__le32 *)buf = cpu_to_le32(rh->port);
 		break;
 	case SetPortFeature:
-		if (wIndex > R8A66597_MAX_ROOT_HUB)
+		if (wIndex > r8a66597->max_root_hub)
 			goto error;
 		if (wLength != 0)
 			goto error;
@@ -2216,7 +2234,7 @@ static int r8a66597_bus_suspend(struct usb_hcd *hcd)
 
 	dbg("%s", __func__);
 
-	for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) {
+	for (port = 0; port < r8a66597->max_root_hub; port++) {
 		struct r8a66597_root_hub *rh = &r8a66597->root_hub[port];
 		unsigned long dvstctr_reg = get_dvstctr_reg(port);
 
@@ -2247,7 +2265,7 @@ static int r8a66597_bus_resume(struct usb_hcd *hcd)
 
 	dbg("%s", __func__);
 
-	for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) {
+	for (port = 0; port < r8a66597->max_root_hub; port++) {
 		struct r8a66597_root_hub *rh = &r8a66597->root_hub[port];
 		unsigned long dvstctr_reg = get_dvstctr_reg(port);
 
@@ -2314,7 +2332,7 @@ static int r8a66597_suspend(struct device *dev)
 
 	disable_controller(r8a66597);
 
-	for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) {
+	for (port = 0; port < r8a66597->max_root_hub; port++) {
 		struct r8a66597_root_hub *rh = &r8a66597->root_hub[port];
 
 		rh->port = 0x00000000;
@@ -2354,8 +2372,9 @@ static int __init_or_module r8a66597_remove(struct platform_device *pdev)
 	del_timer_sync(&r8a66597->rh_timer);
 	usb_remove_hcd(hcd);
 	iounmap((void *)r8a66597->reg);
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK)
-	clk_put(r8a66597->clk);
+#ifdef CONFIG_HAVE_CLK
+	if (r8a66597->pdata->on_chip)
+		clk_put(r8a66597->clk);
 #endif
 	usb_put_hcd(hcd);
 	return 0;
@@ -2363,7 +2382,7 @@ static int __init_or_module r8a66597_remove(struct platform_device *pdev)
 
 static int __devinit r8a66597_probe(struct platform_device *pdev)
 {
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK)
+#ifdef CONFIG_HAVE_CLK
 	char clk_name[8];
 #endif
 	struct resource *res = NULL, *ires;
@@ -2425,15 +2444,20 @@ static int __devinit r8a66597_probe(struct platform_device *pdev)
 	r8a66597->pdata = pdev->dev.platform_data;
 	r8a66597->irq_sense_low = irq_trigger == IRQF_TRIGGER_LOW;
 
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK)
-	snprintf(clk_name, sizeof(clk_name), "usb%d", pdev->id);
-	r8a66597->clk = clk_get(&pdev->dev, clk_name);
-	if (IS_ERR(r8a66597->clk)) {
-		dev_err(&pdev->dev, "cannot get clock \"%s\"\n", clk_name);
-		ret = PTR_ERR(r8a66597->clk);
-		goto clean_up2;
-	}
+	if (r8a66597->pdata->on_chip) {
+#ifdef CONFIG_HAVE_CLK
+		snprintf(clk_name, sizeof(clk_name), "usb%d", pdev->id);
+		r8a66597->clk = clk_get(&pdev->dev, clk_name);
+		if (IS_ERR(r8a66597->clk)) {
+			dev_err(&pdev->dev, "cannot get clock \"%s\"\n",
+				clk_name);
+			ret = PTR_ERR(r8a66597->clk);
+			goto clean_up2;
+		}
 #endif
+		r8a66597->max_root_hub = 1;
+	} else
+		r8a66597->max_root_hub = 2;
 
 	spin_lock_init(&r8a66597->lock);
 	init_timer(&r8a66597->rh_timer);
@@ -2463,8 +2487,9 @@ static int __devinit r8a66597_probe(struct platform_device *pdev)
 	return 0;
 
 clean_up3:
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK)
-	clk_put(r8a66597->clk);
+#ifdef CONFIG_HAVE_CLK
+	if (r8a66597->pdata->on_chip)
+		clk_put(r8a66597->clk);
 clean_up2:
 #endif
 	usb_put_hcd(hcd);
diff --git a/drivers/usb/host/r8a66597.h b/drivers/usb/host/r8a66597.h
index d72680b433f9..eecbd917bc81 100644
--- a/drivers/usb/host/r8a66597.h
+++ b/drivers/usb/host/r8a66597.h
@@ -26,7 +26,7 @@
 #ifndef __R8A66597_H__
 #define __R8A66597_H__
 
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK)
+#ifdef CONFIG_HAVE_CLK
 #include <linux/clk.h>
 #endif
 
@@ -193,13 +193,9 @@
 #define	REW		0x4000	/* b14: Buffer rewind */
 #define	DCLRM		0x2000	/* b13: DMA buffer clear mode */
 #define	DREQE		0x1000	/* b12: DREQ output enable */
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597)
-#define	MBW		0x0800
-#else
-#define	MBW		0x0400	/* b10: Maximum bit width for FIFO access */
-#endif
 #define	  MBW_8		 0x0000	  /*  8bit */
 #define	  MBW_16	 0x0400	  /* 16bit */
+#define	  MBW_32	 0x0800   /* 32bit */
 #define	BIGEND		0x0100	/* b8: Big endian mode */
 #define	  BYTE_LITTLE	 0x0000		/* little dendian */
 #define	  BYTE_BIG	 0x0100		/* big endifan */
@@ -405,11 +401,7 @@
 #define R8A66597_MAX_NUM_PIPE		10
 #define R8A66597_BUF_BSIZE		8
 #define R8A66597_MAX_DEVICE		10
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597)
-#define R8A66597_MAX_ROOT_HUB		1
-#else
 #define R8A66597_MAX_ROOT_HUB		2
-#endif
 #define R8A66597_MAX_SAMPLING		5
 #define R8A66597_RH_POLL_TIME		10
 #define R8A66597_MAX_DMA_CHANNEL	2
@@ -487,7 +479,7 @@ struct r8a66597_root_hub {
 struct r8a66597 {
 	spinlock_t lock;
 	unsigned long reg;
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK)
+#ifdef CONFIG_HAVE_CLK
 	struct clk *clk;
 #endif
 	struct r8a66597_platdata	*pdata;
@@ -504,6 +496,7 @@ struct r8a66597 {
 	unsigned short interval_map;
 	unsigned char pipe_cnt[R8A66597_MAX_NUM_PIPE];
 	unsigned char dma_map;
+	unsigned int max_root_hub;
 
 	struct list_head child_device;
 	unsigned long child_connect_map[4];
@@ -550,21 +543,22 @@ static inline void r8a66597_read_fifo(struct r8a66597 *r8a66597,
 				      unsigned long offset, u16 *buf,
 				      int len)
 {
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597)
 	unsigned long fifoaddr = r8a66597->reg + offset;
 	unsigned long count;
 
-	count = len / 4;
-	insl(fifoaddr, buf, count);
+	if (r8a66597->pdata->on_chip) {
+		count = len / 4;
+		insl(fifoaddr, buf, count);
 
-	if (len & 0x00000003) {
-		unsigned long tmp = inl(fifoaddr);
-		memcpy((unsigned char *)buf + count * 4, &tmp, len & 0x03);
+		if (len & 0x00000003) {
+			unsigned long tmp = inl(fifoaddr);
+			memcpy((unsigned char *)buf + count * 4, &tmp,
+			       len & 0x03);
+		}
+	} else {
+		len = (len + 1) / 2;
+		insw(fifoaddr, buf, len);
 	}
-#else
-	len = (len + 1) / 2;
-	insw(r8a66597->reg + offset, buf, len);
-#endif
 }
 
 static inline void r8a66597_write(struct r8a66597 *r8a66597, u16 val,
@@ -578,33 +572,33 @@ static inline void r8a66597_write_fifo(struct r8a66597 *r8a66597,
 				       int len)
 {
 	unsigned long fifoaddr = r8a66597->reg + offset;
-#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597)
 	unsigned long count;
 	unsigned char *pb;
 	int i;
 
-	count = len / 4;
-	outsl(fifoaddr, buf, count);
+	if (r8a66597->pdata->on_chip) {
+		count = len / 4;
+		outsl(fifoaddr, buf, count);
+
+		if (len & 0x00000003) {
+			pb = (unsigned char *)buf + count * 4;
+			for (i = 0; i < (len & 0x00000003); i++) {
+				if (r8a66597_read(r8a66597, CFIFOSEL) & BIGEND)
+					outb(pb[i], fifoaddr + i);
+				else
+					outb(pb[i], fifoaddr + 3 - i);
+			}
+		}
+	} else {
+		int odd = len & 0x0001;
 
-	if (len & 0x00000003) {
-		pb = (unsigned char *)buf + count * 4;
-		for (i = 0; i < (len & 0x00000003); i++) {
-			if (r8a66597_read(r8a66597, CFIFOSEL) & BIGEND)
-				outb(pb[i], fifoaddr + i);
-			else
-				outb(pb[i], fifoaddr + 3 - i);
+		len = len / 2;
+		outsw(fifoaddr, buf, len);
+		if (unlikely(odd)) {
+			buf = &buf[len];
+			outb((unsigned char)*buf, fifoaddr);
 		}
 	}
-#else
-	int odd = len & 0x0001;
-
-	len = len / 2;
-	outsw(fifoaddr, buf, len);
-	if (unlikely(odd)) {
-		buf = &buf[len];
-		outb((unsigned char)*buf, fifoaddr);
-	}
-#endif
 }
 
 static inline void r8a66597_mdfy(struct r8a66597 *r8a66597,
diff --git a/include/linux/usb/r8a66597.h b/include/linux/usb/r8a66597.h
index e9f0384fa20c..460ee3f6a2c6 100644
--- a/include/linux/usb/r8a66597.h
+++ b/include/linux/usb/r8a66597.h
@@ -31,6 +31,9 @@ struct r8a66597_platdata {
 	/* This ops can controll port power instead of DVSTCTR register. */
 	void (*port_power)(int port, int power);
 
+	/* set one = on chip controller, set zero = external controller */
+	unsigned	on_chip:1;
+
 	/* (external controller only) set R8A66597_PLATDATA_XTAL_nnMHZ */
 	unsigned	xtal:2;
 
-- 
cgit v1.2.3-71-gd317


From 99fde513f57db2c8e1b202ade4be7d47033ff09b Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut@gmail.com>
Date: Mon, 20 Jul 2009 22:28:50 -0700
Subject: Input: wm97xx - add possibility to control the GPIO_STATUS shift

This patch allows tweaking the behaviour of GPIO_STATUS register
shift quirk that's in wm97xx-core. The problem with GPIO_STATUS
register being shifted by one doesn't appear on all hardware it
seems and causes problems with accelerated touchscreen drivers on
Palm hardware. Therefore an accelerated touchscreen driver can select
if the shift is/isn't happening on the hardware.

Signed-off-by: Marek Vasut <marek.vasut@gmail.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/mainstone-wm97xx.c | 3 +++
 drivers/input/touchscreen/wm97xx-core.c      | 6 ++++--
 include/linux/wm97xx.h                       | 7 +++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/mainstone-wm97xx.c b/drivers/input/touchscreen/mainstone-wm97xx.c
index c797bc04ee83..8fc3b08deb3b 100644
--- a/drivers/input/touchscreen/mainstone-wm97xx.c
+++ b/drivers/input/touchscreen/mainstone-wm97xx.c
@@ -198,6 +198,9 @@ static int wm97xx_acc_startup(struct wm97xx *wm)
 	if (machine_is_palmt5() || machine_is_palmtx() || machine_is_palmld()) {
 		pen_int = 1;
 		irq = 27;
+		/* There is some obscure mutant of WM9712 interbred with WM9713
+		 * used on Palm HW */
+		wm->variant = WM97xx_WM1613;
 	} else if (machine_is_mainstone() && pen_int)
 		irq = 4;
 
diff --git a/drivers/input/touchscreen/wm97xx-core.c b/drivers/input/touchscreen/wm97xx-core.c
index 2957d48e0045..252eb11fe9db 100644
--- a/drivers/input/touchscreen/wm97xx-core.c
+++ b/drivers/input/touchscreen/wm97xx-core.c
@@ -204,7 +204,7 @@ void wm97xx_set_gpio(struct wm97xx *wm, u32 gpio,
 	else
 		reg &= ~gpio;
 
-	if (wm->id == WM9712_ID2)
+	if (wm->id == WM9712_ID2 && wm->variant != WM97xx_WM1613)
 		wm97xx_reg_write(wm, AC97_GPIO_STATUS, reg << 1);
 	else
 		wm97xx_reg_write(wm, AC97_GPIO_STATUS, reg);
@@ -307,7 +307,7 @@ static void wm97xx_pen_irq_worker(struct work_struct *work)
 					 WM97XX_GPIO_13);
 		}
 
-		if (wm->id == WM9712_ID2)
+		if (wm->id == WM9712_ID2 && wm->variant != WM97xx_WM1613)
 			wm97xx_reg_write(wm, AC97_GPIO_STATUS, (status &
 						~WM97XX_GPIO_13) << 1);
 		else
@@ -582,6 +582,8 @@ static int wm97xx_probe(struct device *dev)
 
 	wm->id = wm97xx_reg_read(wm, AC97_VENDOR_ID2);
 
+	wm->variant = WM97xx_GENERIC;
+
 	dev_info(wm->dev, "detected a wm97%02x codec\n", wm->id & 0xff);
 
 	switch (wm->id & 0xff) {
diff --git a/include/linux/wm97xx.h b/include/linux/wm97xx.h
index 6f69968eab24..0c9878123d5f 100644
--- a/include/linux/wm97xx.h
+++ b/include/linux/wm97xx.h
@@ -15,6 +15,12 @@
 #include <linux/input.h>	/* Input device layer */
 #include <linux/platform_device.h>
 
+/*
+ * WM97xx variants
+ */
+#define	WM97xx_GENERIC			0x0000
+#define	WM97xx_WM1613			0x1613
+
 /*
  * WM97xx AC97 Touchscreen registers
  */
@@ -283,6 +289,7 @@ struct wm97xx {
 	unsigned pen_is_down:1;		/* Pen is down */
 	unsigned aux_waiting:1;		/* aux measurement waiting */
 	unsigned pen_probably_down:1;	/* used in polling mode */
+	u16 variant;			/* WM97xx chip variant */
 	u16 suspend_mode;               /* PRP in suspend mode */
 };
 
-- 
cgit v1.2.3-71-gd317


From d7aacaddcac3971e33cf52d7e610c06696cb347f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 8 Jul 2009 13:21:31 +0200
Subject: Driver Core: Add platform device arch data V3

Allow architecture specific data in struct platform_device V3.

With this patch struct pdev_archdata is added to struct
platform_device, similar to struct dev_archdata in found in
struct device. Useful for architecture code that needs to
keep extra data associated with each platform device.

Struct pdev_archdata is different from dev.platform_data, the
convention is that dev.platform_data points to driver-specific
data. It may or may not be required by the driver. The format
of this depends on driver but is the same across architectures.

The structure pdev_archdata is a place for architecture specific
data. This data is handled by architecture specific code (for
example runtime PM), and since it is architecture specific it
should _never_ be touched by device driver code. Exactly like
struct dev_archdata but for platform devices.

[rjw: This change is for power management mostly and that's why it
 goes through the suspend tree.]

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: Kevin Hilman <khilman@deeprootsystems.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/arm/include/asm/device.h        | 3 +++
 arch/ia64/include/asm/device.h       | 3 +++
 arch/microblaze/include/asm/device.h | 3 +++
 arch/powerpc/include/asm/device.h    | 3 +++
 arch/sparc/include/asm/device.h      | 3 +++
 arch/x86/include/asm/device.h        | 3 +++
 include/asm-generic/device.h         | 3 +++
 include/linux/platform_device.h      | 3 +++
 8 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/device.h b/arch/arm/include/asm/device.h
index c61642b40603..9f390ce335cb 100644
--- a/arch/arm/include/asm/device.h
+++ b/arch/arm/include/asm/device.h
@@ -12,4 +12,7 @@ struct dev_archdata {
 #endif
 };
 
+struct pdev_archdata {
+};
+
 #endif
diff --git a/arch/ia64/include/asm/device.h b/arch/ia64/include/asm/device.h
index 41ab85d66f33..d66d446b127c 100644
--- a/arch/ia64/include/asm/device.h
+++ b/arch/ia64/include/asm/device.h
@@ -15,4 +15,7 @@ struct dev_archdata {
 #endif
 };
 
+struct pdev_archdata {
+};
+
 #endif /* _ASM_IA64_DEVICE_H */
diff --git a/arch/microblaze/include/asm/device.h b/arch/microblaze/include/asm/device.h
index c042830793ed..30286db27c1c 100644
--- a/arch/microblaze/include/asm/device.h
+++ b/arch/microblaze/include/asm/device.h
@@ -16,6 +16,9 @@ struct dev_archdata {
 	struct device_node	*of_node;
 };
 
+struct pdev_archdata {
+};
+
 #endif /* _ASM_MICROBLAZE_DEVICE_H */
 
 
diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 7d2277cef09a..e3e06e0f7fc0 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -30,4 +30,7 @@ dev_archdata_get_node(const struct dev_archdata *ad)
 	return ad->of_node;
 }
 
+struct pdev_archdata {
+};
+
 #endif /* _ASM_POWERPC_DEVICE_H */
diff --git a/arch/sparc/include/asm/device.h b/arch/sparc/include/asm/device.h
index 3702e087df2c..f3b85b6b0b76 100644
--- a/arch/sparc/include/asm/device.h
+++ b/arch/sparc/include/asm/device.h
@@ -32,4 +32,7 @@ dev_archdata_get_node(const struct dev_archdata *ad)
 	return ad->prom_node;
 }
 
+struct pdev_archdata {
+};
+
 #endif /* _ASM_SPARC_DEVICE_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 4994a20acbcb..cee34e9ca45b 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -13,4 +13,7 @@ struct dma_map_ops *dma_ops;
 #endif
 };
 
+struct pdev_archdata {
+};
+
 #endif /* _ASM_X86_DEVICE_H */
diff --git a/include/asm-generic/device.h b/include/asm-generic/device.h
index c17c9600f220..d7c76bba640d 100644
--- a/include/asm-generic/device.h
+++ b/include/asm-generic/device.h
@@ -9,4 +9,7 @@
 struct dev_archdata {
 };
 
+struct pdev_archdata {
+};
+
 #endif /* _ASM_GENERIC_DEVICE_H */
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 8dc5123b6305..672a69849735 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -22,6 +22,9 @@ struct platform_device {
 	struct resource	* resource;
 
 	struct platform_device_id	*id_entry;
+
+	/* arch specific additions */
+	struct pdev_archdata	archdata;
 };
 
 #define platform_get_device_id(pdev)	((pdev)->id_entry)
-- 
cgit v1.2.3-71-gd317


From 511647ff58fd0f1c1f415d2c757d841650edac91 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 8 Jul 2009 13:23:07 +0200
Subject: PM: Remove platform device suspend_late()/resume_early() V2

This is V2 of the platform driver power management late/early
callback removal patch. The callbacks ->suspend_late() and
->resume_early() are removed since all in-tree users now have
been migrated to dev_pm_ops.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/platform.c         | 36 ------------------------------------
 include/linux/platform_device.h |  2 --
 2 files changed, 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 81cb01bfc356..455e55971d0e 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -628,30 +628,6 @@ static int platform_legacy_suspend(struct device *dev, pm_message_t mesg)
 	return ret;
 }
 
-static int platform_legacy_suspend_late(struct device *dev, pm_message_t mesg)
-{
-	struct platform_driver *pdrv = to_platform_driver(dev->driver);
-	struct platform_device *pdev = to_platform_device(dev);
-	int ret = 0;
-
-	if (dev->driver && pdrv->suspend_late)
-		ret = pdrv->suspend_late(pdev, mesg);
-
-	return ret;
-}
-
-static int platform_legacy_resume_early(struct device *dev)
-{
-	struct platform_driver *pdrv = to_platform_driver(dev->driver);
-	struct platform_device *pdev = to_platform_device(dev);
-	int ret = 0;
-
-	if (dev->driver && pdrv->resume_early)
-		ret = pdrv->resume_early(pdev);
-
-	return ret;
-}
-
 static int platform_legacy_resume(struct device *dev)
 {
 	struct platform_driver *pdrv = to_platform_driver(dev->driver);
@@ -714,8 +690,6 @@ static int platform_pm_suspend_noirq(struct device *dev)
 	if (drv->pm) {
 		if (drv->pm->suspend_noirq)
 			ret = drv->pm->suspend_noirq(dev);
-	} else {
-		ret = platform_legacy_suspend_late(dev, PMSG_SUSPEND);
 	}
 
 	return ret;
@@ -750,8 +724,6 @@ static int platform_pm_resume_noirq(struct device *dev)
 	if (drv->pm) {
 		if (drv->pm->resume_noirq)
 			ret = drv->pm->resume_noirq(dev);
-	} else {
-		ret = platform_legacy_resume_early(dev);
 	}
 
 	return ret;
@@ -797,8 +769,6 @@ static int platform_pm_freeze_noirq(struct device *dev)
 	if (drv->pm) {
 		if (drv->pm->freeze_noirq)
 			ret = drv->pm->freeze_noirq(dev);
-	} else {
-		ret = platform_legacy_suspend_late(dev, PMSG_FREEZE);
 	}
 
 	return ret;
@@ -833,8 +803,6 @@ static int platform_pm_thaw_noirq(struct device *dev)
 	if (drv->pm) {
 		if (drv->pm->thaw_noirq)
 			ret = drv->pm->thaw_noirq(dev);
-	} else {
-		ret = platform_legacy_resume_early(dev);
 	}
 
 	return ret;
@@ -869,8 +837,6 @@ static int platform_pm_poweroff_noirq(struct device *dev)
 	if (drv->pm) {
 		if (drv->pm->poweroff_noirq)
 			ret = drv->pm->poweroff_noirq(dev);
-	} else {
-		ret = platform_legacy_suspend_late(dev, PMSG_HIBERNATE);
 	}
 
 	return ret;
@@ -905,8 +871,6 @@ static int platform_pm_restore_noirq(struct device *dev)
 	if (drv->pm) {
 		if (drv->pm->restore_noirq)
 			ret = drv->pm->restore_noirq(dev);
-	} else {
-		ret = platform_legacy_resume_early(dev);
 	}
 
 	return ret;
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 672a69849735..3c6675c2444b 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -60,8 +60,6 @@ struct platform_driver {
 	int (*remove)(struct platform_device *);
 	void (*shutdown)(struct platform_device *);
 	int (*suspend)(struct platform_device *, pm_message_t state);
-	int (*suspend_late)(struct platform_device *, pm_message_t state);
-	int (*resume_early)(struct platform_device *);
 	int (*resume)(struct platform_device *);
 	struct device_driver driver;
 	struct platform_device_id *id_table;
-- 
cgit v1.2.3-71-gd317


From fbd90375d7531927d312766b548376d909811b4d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Jul 2009 13:40:14 +0200
Subject: hrtimer: Remove cb_entry from struct hrtimer

It's unused, remove it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
---
 include/linux/hrtimer.h | 2 --
 kernel/hrtimer.c        | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 54648e625efd..40e7d54fc424 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -91,7 +91,6 @@ enum hrtimer_restart {
  * @function:	timer expiry callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  * @state:	state information (See bit values above)
- * @cb_entry:	list head to enqueue an expired timer into the callback list
  * @start_site:	timer statistics field to store the site where the timer
  *		was started
  * @start_comm: timer statistics field to store the name of the process which
@@ -108,7 +107,6 @@ struct hrtimer {
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
-	struct list_head		cb_entry;
 #ifdef CONFIG_TIMER_STATS
 	int				start_pid;
 	void				*start_site;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 43d151f185b6..052a0f53e4eb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1092,7 +1092,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		clock_id = CLOCK_MONOTONIC;
 
 	timer->base = &cpu_base->clock_base[clock_id];
-	INIT_LIST_HEAD(&timer->cb_entry);
 	hrtimer_init_timer_hres(timer);
 
 #ifdef CONFIG_TIMER_STATS
-- 
cgit v1.2.3-71-gd317


From cf4f1e76c49dacfde0680b170b9a9b6a42f296bb Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 22 Jul 2009 14:32:03 +0000
Subject: usb: move r8a66597 register defines

Move r8a66597 hardware register definitions from the host
controller header file to the platform data header file.

With this change in place we can easily share register
definitions between the host controller driver and a future
gadget driver.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/usb/host/r8a66597.h  | 366 ------------------------------------------
 include/linux/usb/r8a66597.h | 372 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 370 insertions(+), 368 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/r8a66597.h b/drivers/usb/host/r8a66597.h
index eecbd917bc81..228e3fb23854 100644
--- a/drivers/usb/host/r8a66597.h
+++ b/drivers/usb/host/r8a66597.h
@@ -32,372 +32,6 @@
 
 #include <linux/usb/r8a66597.h>
 
-#define SYSCFG0		0x00
-#define SYSCFG1		0x02
-#define SYSSTS0		0x04
-#define SYSSTS1		0x06
-#define DVSTCTR0	0x08
-#define DVSTCTR1	0x0A
-#define TESTMODE	0x0C
-#define PINCFG		0x0E
-#define DMA0CFG		0x10
-#define DMA1CFG		0x12
-#define CFIFO		0x14
-#define D0FIFO		0x18
-#define D1FIFO		0x1C
-#define CFIFOSEL	0x20
-#define CFIFOCTR	0x22
-#define CFIFOSIE	0x24
-#define D0FIFOSEL	0x28
-#define D0FIFOCTR	0x2A
-#define D1FIFOSEL	0x2C
-#define D1FIFOCTR	0x2E
-#define INTENB0		0x30
-#define INTENB1		0x32
-#define INTENB2		0x34
-#define BRDYENB		0x36
-#define NRDYENB		0x38
-#define BEMPENB		0x3A
-#define SOFCFG		0x3C
-#define INTSTS0		0x40
-#define INTSTS1		0x42
-#define INTSTS2		0x44
-#define BRDYSTS		0x46
-#define NRDYSTS		0x48
-#define BEMPSTS		0x4A
-#define FRMNUM		0x4C
-#define UFRMNUM		0x4E
-#define USBADDR		0x50
-#define USBREQ		0x54
-#define USBVAL		0x56
-#define USBINDX		0x58
-#define USBLENG		0x5A
-#define DCPCFG		0x5C
-#define DCPMAXP		0x5E
-#define DCPCTR		0x60
-#define PIPESEL		0x64
-#define PIPECFG		0x68
-#define PIPEBUF		0x6A
-#define PIPEMAXP	0x6C
-#define PIPEPERI	0x6E
-#define PIPE1CTR	0x70
-#define PIPE2CTR	0x72
-#define PIPE3CTR	0x74
-#define PIPE4CTR	0x76
-#define PIPE5CTR	0x78
-#define PIPE6CTR	0x7A
-#define PIPE7CTR	0x7C
-#define PIPE8CTR	0x7E
-#define PIPE9CTR	0x80
-#define PIPE1TRE	0x90
-#define PIPE1TRN	0x92
-#define PIPE2TRE	0x94
-#define PIPE2TRN	0x96
-#define PIPE3TRE	0x98
-#define PIPE3TRN	0x9A
-#define PIPE4TRE	0x9C
-#define	PIPE4TRN	0x9E
-#define	PIPE5TRE	0xA0
-#define	PIPE5TRN	0xA2
-#define DEVADD0		0xD0
-#define DEVADD1		0xD2
-#define DEVADD2		0xD4
-#define DEVADD3		0xD6
-#define DEVADD4		0xD8
-#define DEVADD5		0xDA
-#define DEVADD6		0xDC
-#define DEVADD7		0xDE
-#define DEVADD8		0xE0
-#define DEVADD9		0xE2
-#define DEVADDA		0xE4
-
-/* System Configuration Control Register */
-#define	XTAL		0xC000	/* b15-14: Crystal selection */
-#define	  XTAL48	 0x8000	  /* 48MHz */
-#define	  XTAL24	 0x4000	  /* 24MHz */
-#define	  XTAL12	 0x0000	  /* 12MHz */
-#define	XCKE		0x2000	/* b13: External clock enable */
-#define	PLLC		0x0800	/* b11: PLL control */
-#define	SCKE		0x0400	/* b10: USB clock enable */
-#define	PCSDIS		0x0200	/* b9: not CS wakeup */
-#define	LPSME		0x0100	/* b8: Low power sleep mode */
-#define	HSE		0x0080	/* b7: Hi-speed enable */
-#define	DCFM		0x0040	/* b6: Controller function select  */
-#define	DRPD		0x0020	/* b5: D+/- pull down control */
-#define	DPRPU		0x0010	/* b4: D+ pull up control */
-#define	USBE		0x0001	/* b0: USB module operation enable */
-
-/* System Configuration Status Register */
-#define	OVCBIT		0x8000	/* b15-14: Over-current bit */
-#define	OVCMON		0xC000	/* b15-14: Over-current monitor */
-#define	SOFEA		0x0020	/* b5: SOF monitor */
-#define	IDMON		0x0004	/* b3: ID-pin monitor */
-#define	LNST		0x0003	/* b1-0: D+, D- line status */
-#define	  SE1		 0x0003	  /* SE1 */
-#define	  FS_KSTS	 0x0002	  /* Full-Speed K State */
-#define	  FS_JSTS	 0x0001	  /* Full-Speed J State */
-#define	  LS_JSTS	 0x0002	  /* Low-Speed J State */
-#define	  LS_KSTS	 0x0001	  /* Low-Speed K State */
-#define	  SE0		 0x0000	  /* SE0 */
-
-/* Device State Control Register */
-#define	EXTLP0		0x0400	/* b10: External port */
-#define	VBOUT		0x0200	/* b9: VBUS output */
-#define	WKUP		0x0100	/* b8: Remote wakeup */
-#define	RWUPE		0x0080	/* b7: Remote wakeup sense */
-#define	USBRST		0x0040	/* b6: USB reset enable */
-#define	RESUME		0x0020	/* b5: Resume enable */
-#define	UACT		0x0010	/* b4: USB bus enable */
-#define	RHST		0x0007	/* b1-0: Reset handshake status */
-#define	  HSPROC	 0x0004	  /* HS handshake is processing */
-#define	  HSMODE	 0x0003	  /* Hi-Speed mode */
-#define	  FSMODE	 0x0002	  /* Full-Speed mode */
-#define	  LSMODE	 0x0001	  /* Low-Speed mode */
-#define	  UNDECID	 0x0000	  /* Undecided */
-
-/* Test Mode Register */
-#define	UTST			0x000F	/* b3-0: Test select */
-#define	  H_TST_PACKET		 0x000C	  /* HOST TEST Packet */
-#define	  H_TST_SE0_NAK		 0x000B	  /* HOST TEST SE0 NAK */
-#define	  H_TST_K		 0x000A	  /* HOST TEST K */
-#define	  H_TST_J		 0x0009	  /* HOST TEST J */
-#define	  H_TST_NORMAL		 0x0000	  /* HOST Normal Mode */
-#define	  P_TST_PACKET		 0x0004	  /* PERI TEST Packet */
-#define	  P_TST_SE0_NAK		 0x0003	  /* PERI TEST SE0 NAK */
-#define	  P_TST_K		 0x0002	  /* PERI TEST K */
-#define	  P_TST_J		 0x0001	  /* PERI TEST J */
-#define	  P_TST_NORMAL		 0x0000	  /* PERI Normal Mode */
-
-/* Data Pin Configuration Register */
-#define	LDRV			0x8000	/* b15: Drive Current Adjust */
-#define	  VIF1			  0x0000		/* VIF = 1.8V */
-#define	  VIF3			  0x8000		/* VIF = 3.3V */
-#define	INTA			0x0001	/* b1: USB INT-pin active */
-
-/* DMAx Pin Configuration Register */
-#define	DREQA			0x4000	/* b14: Dreq active select */
-#define	BURST			0x2000	/* b13: Burst mode */
-#define	DACKA			0x0400	/* b10: Dack active select */
-#define	DFORM			0x0380	/* b9-7: DMA mode select */
-#define	  CPU_ADR_RD_WR		 0x0000	  /* Address + RD/WR mode (CPU bus) */
-#define	  CPU_DACK_RD_WR	 0x0100	  /* DACK + RD/WR mode (CPU bus) */
-#define	  CPU_DACK_ONLY		 0x0180	  /* DACK only mode (CPU bus) */
-#define	  SPLIT_DACK_ONLY	 0x0200	  /* DACK only mode (SPLIT bus) */
-#define	DENDA			0x0040	/* b6: Dend active select */
-#define	PKTM			0x0020	/* b5: Packet mode */
-#define	DENDE			0x0010	/* b4: Dend enable */
-#define	OBUS			0x0004	/* b2: OUTbus mode */
-
-/* CFIFO/DxFIFO Port Select Register */
-#define	RCNT		0x8000	/* b15: Read count mode */
-#define	REW		0x4000	/* b14: Buffer rewind */
-#define	DCLRM		0x2000	/* b13: DMA buffer clear mode */
-#define	DREQE		0x1000	/* b12: DREQ output enable */
-#define	  MBW_8		 0x0000	  /*  8bit */
-#define	  MBW_16	 0x0400	  /* 16bit */
-#define	  MBW_32	 0x0800   /* 32bit */
-#define	BIGEND		0x0100	/* b8: Big endian mode */
-#define	  BYTE_LITTLE	 0x0000		/* little dendian */
-#define	  BYTE_BIG	 0x0100		/* big endifan */
-#define	ISEL		0x0020	/* b5: DCP FIFO port direction select */
-#define	CURPIPE		0x000F	/* b2-0: PIPE select */
-
-/* CFIFO/DxFIFO Port Control Register */
-#define	BVAL		0x8000	/* b15: Buffer valid flag */
-#define	BCLR		0x4000	/* b14: Buffer clear */
-#define	FRDY		0x2000	/* b13: FIFO ready */
-#define	DTLN		0x0FFF	/* b11-0: FIFO received data length */
-
-/* Interrupt Enable Register 0 */
-#define	VBSE	0x8000	/* b15: VBUS interrupt */
-#define	RSME	0x4000	/* b14: Resume interrupt */
-#define	SOFE	0x2000	/* b13: Frame update interrupt */
-#define	DVSE	0x1000	/* b12: Device state transition interrupt */
-#define	CTRE	0x0800	/* b11: Control transfer stage transition interrupt */
-#define	BEMPE	0x0400	/* b10: Buffer empty interrupt */
-#define	NRDYE	0x0200	/* b9: Buffer not ready interrupt */
-#define	BRDYE	0x0100	/* b8: Buffer ready interrupt */
-
-/* Interrupt Enable Register 1 */
-#define	OVRCRE		0x8000	/* b15: Over-current interrupt */
-#define	BCHGE		0x4000	/* b14: USB us chenge interrupt */
-#define	DTCHE		0x1000	/* b12: Detach sense interrupt */
-#define	ATTCHE		0x0800	/* b11: Attach sense interrupt */
-#define	EOFERRE		0x0040	/* b6: EOF error interrupt */
-#define	SIGNE		0x0020	/* b5: SETUP IGNORE interrupt */
-#define	SACKE		0x0010	/* b4: SETUP ACK interrupt */
-
-/* BRDY Interrupt Enable/Status Register */
-#define	BRDY9		0x0200	/* b9: PIPE9 */
-#define	BRDY8		0x0100	/* b8: PIPE8 */
-#define	BRDY7		0x0080	/* b7: PIPE7 */
-#define	BRDY6		0x0040	/* b6: PIPE6 */
-#define	BRDY5		0x0020	/* b5: PIPE5 */
-#define	BRDY4		0x0010	/* b4: PIPE4 */
-#define	BRDY3		0x0008	/* b3: PIPE3 */
-#define	BRDY2		0x0004	/* b2: PIPE2 */
-#define	BRDY1		0x0002	/* b1: PIPE1 */
-#define	BRDY0		0x0001	/* b1: PIPE0 */
-
-/* NRDY Interrupt Enable/Status Register */
-#define	NRDY9		0x0200	/* b9: PIPE9 */
-#define	NRDY8		0x0100	/* b8: PIPE8 */
-#define	NRDY7		0x0080	/* b7: PIPE7 */
-#define	NRDY6		0x0040	/* b6: PIPE6 */
-#define	NRDY5		0x0020	/* b5: PIPE5 */
-#define	NRDY4		0x0010	/* b4: PIPE4 */
-#define	NRDY3		0x0008	/* b3: PIPE3 */
-#define	NRDY2		0x0004	/* b2: PIPE2 */
-#define	NRDY1		0x0002	/* b1: PIPE1 */
-#define	NRDY0		0x0001	/* b1: PIPE0 */
-
-/* BEMP Interrupt Enable/Status Register */
-#define	BEMP9		0x0200	/* b9: PIPE9 */
-#define	BEMP8		0x0100	/* b8: PIPE8 */
-#define	BEMP7		0x0080	/* b7: PIPE7 */
-#define	BEMP6		0x0040	/* b6: PIPE6 */
-#define	BEMP5		0x0020	/* b5: PIPE5 */
-#define	BEMP4		0x0010	/* b4: PIPE4 */
-#define	BEMP3		0x0008	/* b3: PIPE3 */
-#define	BEMP2		0x0004	/* b2: PIPE2 */
-#define	BEMP1		0x0002	/* b1: PIPE1 */
-#define	BEMP0		0x0001	/* b0: PIPE0 */
-
-/* SOF Pin Configuration Register */
-#define	TRNENSEL	0x0100	/* b8: Select transaction enable period */
-#define	BRDYM		0x0040	/* b6: BRDY clear timing */
-#define	INTL		0x0020	/* b5: Interrupt sense select */
-#define	EDGESTS		0x0010	/* b4:  */
-#define	SOFMODE		0x000C	/* b3-2: SOF pin select */
-#define	  SOF_125US	 0x0008	  /* SOF OUT 125us Frame Signal */
-#define	  SOF_1MS	 0x0004	  /* SOF OUT 1ms Frame Signal */
-#define	  SOF_DISABLE	 0x0000	  /* SOF OUT Disable */
-
-/* Interrupt Status Register 0 */
-#define	VBINT	0x8000	/* b15: VBUS interrupt */
-#define	RESM	0x4000	/* b14: Resume interrupt */
-#define	SOFR	0x2000	/* b13: SOF frame update interrupt */
-#define	DVST	0x1000	/* b12: Device state transition interrupt */
-#define	CTRT	0x0800	/* b11: Control transfer stage transition interrupt */
-#define	BEMP	0x0400	/* b10: Buffer empty interrupt */
-#define	NRDY	0x0200	/* b9: Buffer not ready interrupt */
-#define	BRDY	0x0100	/* b8: Buffer ready interrupt */
-#define	VBSTS	0x0080	/* b7: VBUS input port */
-#define	DVSQ	0x0070	/* b6-4: Device state */
-#define	  DS_SPD_CNFG	 0x0070	  /* Suspend Configured */
-#define	  DS_SPD_ADDR	 0x0060	  /* Suspend Address */
-#define	  DS_SPD_DFLT	 0x0050	  /* Suspend Default */
-#define	  DS_SPD_POWR	 0x0040	  /* Suspend Powered */
-#define	  DS_SUSP	 0x0040	  /* Suspend */
-#define	  DS_CNFG	 0x0030	  /* Configured */
-#define	  DS_ADDS	 0x0020	  /* Address */
-#define	  DS_DFLT	 0x0010	  /* Default */
-#define	  DS_POWR	 0x0000	  /* Powered */
-#define	DVSQS		0x0030	/* b5-4: Device state */
-#define	VALID		0x0008	/* b3: Setup packet detected flag */
-#define	CTSQ		0x0007	/* b2-0: Control transfer stage */
-#define	  CS_SQER	 0x0006	  /* Sequence error */
-#define	  CS_WRND	 0x0005	  /* Control write nodata status stage */
-#define	  CS_WRSS	 0x0004	  /* Control write status stage */
-#define	  CS_WRDS	 0x0003	  /* Control write data stage */
-#define	  CS_RDSS	 0x0002	  /* Control read status stage */
-#define	  CS_RDDS	 0x0001	  /* Control read data stage */
-#define	  CS_IDST	 0x0000	  /* Idle or setup stage */
-
-/* Interrupt Status Register 1 */
-#define	OVRCR		0x8000	/* b15: Over-current interrupt */
-#define	BCHG		0x4000	/* b14: USB bus chenge interrupt */
-#define	DTCH		0x1000	/* b12: Detach sense interrupt */
-#define	ATTCH		0x0800	/* b11: Attach sense interrupt */
-#define	EOFERR		0x0040	/* b6: EOF-error interrupt */
-#define	SIGN		0x0020	/* b5: Setup ignore interrupt */
-#define	SACK		0x0010	/* b4: Setup acknowledge interrupt */
-
-/* Frame Number Register */
-#define	OVRN		0x8000	/* b15: Overrun error */
-#define	CRCE		0x4000	/* b14: Received data error */
-#define	FRNM		0x07FF	/* b10-0: Frame number */
-
-/* Micro Frame Number Register */
-#define	UFRNM		0x0007	/* b2-0: Micro frame number */
-
-/* Default Control Pipe Maxpacket Size Register */
-/* Pipe Maxpacket Size Register */
-#define	DEVSEL	0xF000	/* b15-14: Device address select */
-#define	MAXP	0x007F	/* b6-0: Maxpacket size of default control pipe */
-
-/* Default Control Pipe Control Register */
-#define	BSTS		0x8000	/* b15: Buffer status */
-#define	SUREQ		0x4000	/* b14: Send USB request  */
-#define	CSCLR		0x2000	/* b13: complete-split status clear */
-#define	CSSTS		0x1000	/* b12: complete-split status */
-#define	SUREQCLR	0x0800	/* b11: stop setup request */
-#define	SQCLR		0x0100	/* b8: Sequence toggle bit clear */
-#define	SQSET		0x0080	/* b7: Sequence toggle bit set */
-#define	SQMON		0x0040	/* b6: Sequence toggle bit monitor */
-#define	PBUSY		0x0020	/* b5: pipe busy */
-#define	PINGE		0x0010	/* b4: ping enable */
-#define	CCPL		0x0004	/* b2: Enable control transfer complete */
-#define	PID		0x0003	/* b1-0: Response PID */
-#define	  PID_STALL11	 0x0003	  /* STALL */
-#define	  PID_STALL	 0x0002	  /* STALL */
-#define	  PID_BUF	 0x0001	  /* BUF */
-#define	  PID_NAK	 0x0000	  /* NAK */
-
-/* Pipe Window Select Register */
-#define	PIPENM		0x0007	/* b2-0: Pipe select */
-
-/* Pipe Configuration Register */
-#define	R8A66597_TYP	0xC000	/* b15-14: Transfer type */
-#define	  R8A66597_ISO	 0xC000		  /* Isochronous */
-#define	  R8A66597_INT	 0x8000		  /* Interrupt */
-#define	  R8A66597_BULK	 0x4000		  /* Bulk */
-#define	R8A66597_BFRE	0x0400	/* b10: Buffer ready interrupt mode select */
-#define	R8A66597_DBLB	0x0200	/* b9: Double buffer mode select */
-#define	R8A66597_CNTMD	0x0100	/* b8: Continuous transfer mode select */
-#define	R8A66597_SHTNAK	0x0080	/* b7: Transfer end NAK */
-#define	R8A66597_DIR	0x0010	/* b4: Transfer direction select */
-#define	R8A66597_EPNUM	0x000F	/* b3-0: Eendpoint number select */
-
-/* Pipe Buffer Configuration Register */
-#define	BUFSIZE		0x7C00	/* b14-10: Pipe buffer size */
-#define	BUFNMB		0x007F	/* b6-0: Pipe buffer number */
-#define	PIPE0BUF	256
-#define	PIPExBUF	64
-
-/* Pipe Maxpacket Size Register */
-#define	MXPS		0x07FF	/* b10-0: Maxpacket size */
-
-/* Pipe Cycle Configuration Register */
-#define	IFIS	0x1000	/* b12: Isochronous in-buffer flush mode select */
-#define	IITV	0x0007	/* b2-0: Isochronous interval */
-
-/* Pipex Control Register */
-#define	BSTS	0x8000	/* b15: Buffer status */
-#define	INBUFM	0x4000	/* b14: IN buffer monitor (Only for PIPE1 to 5) */
-#define	CSCLR	0x2000	/* b13: complete-split status clear */
-#define	CSSTS	0x1000	/* b12: complete-split status */
-#define	ATREPM	0x0400	/* b10: Auto repeat mode */
-#define	ACLRM	0x0200	/* b9: Out buffer auto clear mode */
-#define	SQCLR	0x0100	/* b8: Sequence toggle bit clear */
-#define	SQSET	0x0080	/* b7: Sequence toggle bit set */
-#define	SQMON	0x0040	/* b6: Sequence toggle bit monitor */
-#define	PBUSY	0x0020	/* b5: pipe busy */
-#define	PID	0x0003	/* b1-0: Response PID */
-
-/* PIPExTRE */
-#define	TRENB		0x0200	/* b9: Transaction counter enable */
-#define	TRCLR		0x0100	/* b8: Transaction counter clear */
-
-/* PIPExTRN */
-#define	TRNCNT		0xFFFF	/* b15-0: Transaction counter */
-
-/* DEVADDx */
-#define	UPPHUB		0x7800
-#define	HUBPORT		0x0700
-#define	USBSPD		0x00C0
-#define	RTPORT		0x0001
-
 #define R8A66597_MAX_NUM_PIPE		10
 #define R8A66597_BUF_BSIZE		8
 #define R8A66597_MAX_DEVICE		10
diff --git a/include/linux/usb/r8a66597.h b/include/linux/usb/r8a66597.h
index 460ee3f6a2c6..26d216734057 100644
--- a/include/linux/usb/r8a66597.h
+++ b/include/linux/usb/r8a66597.h
@@ -28,7 +28,7 @@
 #define R8A66597_PLATDATA_XTAL_48MHZ	0x03
 
 struct r8a66597_platdata {
-	/* This ops can controll port power instead of DVSTCTR register. */
+	/* This callback can control port power instead of DVSTCTR register. */
 	void (*port_power)(int port, int power);
 
 	/* set one = on chip controller, set zero = external controller */
@@ -43,5 +43,373 @@ struct r8a66597_platdata {
 	/* set one = big endian, set zero = little endian */
 	unsigned	endian:1;
 };
-#endif
+
+/* Register definitions */
+#define SYSCFG0		0x00
+#define SYSCFG1		0x02
+#define SYSSTS0		0x04
+#define SYSSTS1		0x06
+#define DVSTCTR0	0x08
+#define DVSTCTR1	0x0A
+#define TESTMODE	0x0C
+#define PINCFG		0x0E
+#define DMA0CFG		0x10
+#define DMA1CFG		0x12
+#define CFIFO		0x14
+#define D0FIFO		0x18
+#define D1FIFO		0x1C
+#define CFIFOSEL	0x20
+#define CFIFOCTR	0x22
+#define CFIFOSIE	0x24
+#define D0FIFOSEL	0x28
+#define D0FIFOCTR	0x2A
+#define D1FIFOSEL	0x2C
+#define D1FIFOCTR	0x2E
+#define INTENB0		0x30
+#define INTENB1		0x32
+#define INTENB2		0x34
+#define BRDYENB		0x36
+#define NRDYENB		0x38
+#define BEMPENB		0x3A
+#define SOFCFG		0x3C
+#define INTSTS0		0x40
+#define INTSTS1		0x42
+#define INTSTS2		0x44
+#define BRDYSTS		0x46
+#define NRDYSTS		0x48
+#define BEMPSTS		0x4A
+#define FRMNUM		0x4C
+#define UFRMNUM		0x4E
+#define USBADDR		0x50
+#define USBREQ		0x54
+#define USBVAL		0x56
+#define USBINDX		0x58
+#define USBLENG		0x5A
+#define DCPCFG		0x5C
+#define DCPMAXP		0x5E
+#define DCPCTR		0x60
+#define PIPESEL		0x64
+#define PIPECFG		0x68
+#define PIPEBUF		0x6A
+#define PIPEMAXP	0x6C
+#define PIPEPERI	0x6E
+#define PIPE1CTR	0x70
+#define PIPE2CTR	0x72
+#define PIPE3CTR	0x74
+#define PIPE4CTR	0x76
+#define PIPE5CTR	0x78
+#define PIPE6CTR	0x7A
+#define PIPE7CTR	0x7C
+#define PIPE8CTR	0x7E
+#define PIPE9CTR	0x80
+#define PIPE1TRE	0x90
+#define PIPE1TRN	0x92
+#define PIPE2TRE	0x94
+#define PIPE2TRN	0x96
+#define PIPE3TRE	0x98
+#define PIPE3TRN	0x9A
+#define PIPE4TRE	0x9C
+#define	PIPE4TRN	0x9E
+#define	PIPE5TRE	0xA0
+#define	PIPE5TRN	0xA2
+#define DEVADD0		0xD0
+#define DEVADD1		0xD2
+#define DEVADD2		0xD4
+#define DEVADD3		0xD6
+#define DEVADD4		0xD8
+#define DEVADD5		0xDA
+#define DEVADD6		0xDC
+#define DEVADD7		0xDE
+#define DEVADD8		0xE0
+#define DEVADD9		0xE2
+#define DEVADDA		0xE4
+
+/* System Configuration Control Register */
+#define	XTAL		0xC000	/* b15-14: Crystal selection */
+#define	  XTAL48	 0x8000	  /* 48MHz */
+#define	  XTAL24	 0x4000	  /* 24MHz */
+#define	  XTAL12	 0x0000	  /* 12MHz */
+#define	XCKE		0x2000	/* b13: External clock enable */
+#define	PLLC		0x0800	/* b11: PLL control */
+#define	SCKE		0x0400	/* b10: USB clock enable */
+#define	PCSDIS		0x0200	/* b9: not CS wakeup */
+#define	LPSME		0x0100	/* b8: Low power sleep mode */
+#define	HSE		0x0080	/* b7: Hi-speed enable */
+#define	DCFM		0x0040	/* b6: Controller function select  */
+#define	DRPD		0x0020	/* b5: D+/- pull down control */
+#define	DPRPU		0x0010	/* b4: D+ pull up control */
+#define	USBE		0x0001	/* b0: USB module operation enable */
+
+/* System Configuration Status Register */
+#define	OVCBIT		0x8000	/* b15-14: Over-current bit */
+#define	OVCMON		0xC000	/* b15-14: Over-current monitor */
+#define	SOFEA		0x0020	/* b5: SOF monitor */
+#define	IDMON		0x0004	/* b3: ID-pin monitor */
+#define	LNST		0x0003	/* b1-0: D+, D- line status */
+#define	  SE1		 0x0003	  /* SE1 */
+#define	  FS_KSTS	 0x0002	  /* Full-Speed K State */
+#define	  FS_JSTS	 0x0001	  /* Full-Speed J State */
+#define	  LS_JSTS	 0x0002	  /* Low-Speed J State */
+#define	  LS_KSTS	 0x0001	  /* Low-Speed K State */
+#define	  SE0		 0x0000	  /* SE0 */
+
+/* Device State Control Register */
+#define	EXTLP0		0x0400	/* b10: External port */
+#define	VBOUT		0x0200	/* b9: VBUS output */
+#define	WKUP		0x0100	/* b8: Remote wakeup */
+#define	RWUPE		0x0080	/* b7: Remote wakeup sense */
+#define	USBRST		0x0040	/* b6: USB reset enable */
+#define	RESUME		0x0020	/* b5: Resume enable */
+#define	UACT		0x0010	/* b4: USB bus enable */
+#define	RHST		0x0007	/* b1-0: Reset handshake status */
+#define	  HSPROC	 0x0004	  /* HS handshake is processing */
+#define	  HSMODE	 0x0003	  /* Hi-Speed mode */
+#define	  FSMODE	 0x0002	  /* Full-Speed mode */
+#define	  LSMODE	 0x0001	  /* Low-Speed mode */
+#define	  UNDECID	 0x0000	  /* Undecided */
+
+/* Test Mode Register */
+#define	UTST			0x000F	/* b3-0: Test select */
+#define	  H_TST_PACKET		 0x000C	  /* HOST TEST Packet */
+#define	  H_TST_SE0_NAK		 0x000B	  /* HOST TEST SE0 NAK */
+#define	  H_TST_K		 0x000A	  /* HOST TEST K */
+#define	  H_TST_J		 0x0009	  /* HOST TEST J */
+#define	  H_TST_NORMAL		 0x0000	  /* HOST Normal Mode */
+#define	  P_TST_PACKET		 0x0004	  /* PERI TEST Packet */
+#define	  P_TST_SE0_NAK		 0x0003	  /* PERI TEST SE0 NAK */
+#define	  P_TST_K		 0x0002	  /* PERI TEST K */
+#define	  P_TST_J		 0x0001	  /* PERI TEST J */
+#define	  P_TST_NORMAL		 0x0000	  /* PERI Normal Mode */
+
+/* Data Pin Configuration Register */
+#define	LDRV			0x8000	/* b15: Drive Current Adjust */
+#define	  VIF1			  0x0000		/* VIF = 1.8V */
+#define	  VIF3			  0x8000		/* VIF = 3.3V */
+#define	INTA			0x0001	/* b1: USB INT-pin active */
+
+/* DMAx Pin Configuration Register */
+#define	DREQA			0x4000	/* b14: Dreq active select */
+#define	BURST			0x2000	/* b13: Burst mode */
+#define	DACKA			0x0400	/* b10: Dack active select */
+#define	DFORM			0x0380	/* b9-7: DMA mode select */
+#define	  CPU_ADR_RD_WR		 0x0000	  /* Address + RD/WR mode (CPU bus) */
+#define	  CPU_DACK_RD_WR	 0x0100	  /* DACK + RD/WR mode (CPU bus) */
+#define	  CPU_DACK_ONLY		 0x0180	  /* DACK only mode (CPU bus) */
+#define	  SPLIT_DACK_ONLY	 0x0200	  /* DACK only mode (SPLIT bus) */
+#define	DENDA			0x0040	/* b6: Dend active select */
+#define	PKTM			0x0020	/* b5: Packet mode */
+#define	DENDE			0x0010	/* b4: Dend enable */
+#define	OBUS			0x0004	/* b2: OUTbus mode */
+
+/* CFIFO/DxFIFO Port Select Register */
+#define	RCNT		0x8000	/* b15: Read count mode */
+#define	REW		0x4000	/* b14: Buffer rewind */
+#define	DCLRM		0x2000	/* b13: DMA buffer clear mode */
+#define	DREQE		0x1000	/* b12: DREQ output enable */
+#define	  MBW_8		 0x0000	  /*  8bit */
+#define	  MBW_16	 0x0400	  /* 16bit */
+#define	  MBW_32	 0x0800   /* 32bit */
+#define	BIGEND		0x0100	/* b8: Big endian mode */
+#define	  BYTE_LITTLE	 0x0000		/* little dendian */
+#define	  BYTE_BIG	 0x0100		/* big endifan */
+#define	ISEL		0x0020	/* b5: DCP FIFO port direction select */
+#define	CURPIPE		0x000F	/* b2-0: PIPE select */
+
+/* CFIFO/DxFIFO Port Control Register */
+#define	BVAL		0x8000	/* b15: Buffer valid flag */
+#define	BCLR		0x4000	/* b14: Buffer clear */
+#define	FRDY		0x2000	/* b13: FIFO ready */
+#define	DTLN		0x0FFF	/* b11-0: FIFO received data length */
+
+/* Interrupt Enable Register 0 */
+#define	VBSE	0x8000	/* b15: VBUS interrupt */
+#define	RSME	0x4000	/* b14: Resume interrupt */
+#define	SOFE	0x2000	/* b13: Frame update interrupt */
+#define	DVSE	0x1000	/* b12: Device state transition interrupt */
+#define	CTRE	0x0800	/* b11: Control transfer stage transition interrupt */
+#define	BEMPE	0x0400	/* b10: Buffer empty interrupt */
+#define	NRDYE	0x0200	/* b9: Buffer not ready interrupt */
+#define	BRDYE	0x0100	/* b8: Buffer ready interrupt */
+
+/* Interrupt Enable Register 1 */
+#define	OVRCRE		0x8000	/* b15: Over-current interrupt */
+#define	BCHGE		0x4000	/* b14: USB us chenge interrupt */
+#define	DTCHE		0x1000	/* b12: Detach sense interrupt */
+#define	ATTCHE		0x0800	/* b11: Attach sense interrupt */
+#define	EOFERRE		0x0040	/* b6: EOF error interrupt */
+#define	SIGNE		0x0020	/* b5: SETUP IGNORE interrupt */
+#define	SACKE		0x0010	/* b4: SETUP ACK interrupt */
+
+/* BRDY Interrupt Enable/Status Register */
+#define	BRDY9		0x0200	/* b9: PIPE9 */
+#define	BRDY8		0x0100	/* b8: PIPE8 */
+#define	BRDY7		0x0080	/* b7: PIPE7 */
+#define	BRDY6		0x0040	/* b6: PIPE6 */
+#define	BRDY5		0x0020	/* b5: PIPE5 */
+#define	BRDY4		0x0010	/* b4: PIPE4 */
+#define	BRDY3		0x0008	/* b3: PIPE3 */
+#define	BRDY2		0x0004	/* b2: PIPE2 */
+#define	BRDY1		0x0002	/* b1: PIPE1 */
+#define	BRDY0		0x0001	/* b1: PIPE0 */
+
+/* NRDY Interrupt Enable/Status Register */
+#define	NRDY9		0x0200	/* b9: PIPE9 */
+#define	NRDY8		0x0100	/* b8: PIPE8 */
+#define	NRDY7		0x0080	/* b7: PIPE7 */
+#define	NRDY6		0x0040	/* b6: PIPE6 */
+#define	NRDY5		0x0020	/* b5: PIPE5 */
+#define	NRDY4		0x0010	/* b4: PIPE4 */
+#define	NRDY3		0x0008	/* b3: PIPE3 */
+#define	NRDY2		0x0004	/* b2: PIPE2 */
+#define	NRDY1		0x0002	/* b1: PIPE1 */
+#define	NRDY0		0x0001	/* b1: PIPE0 */
+
+/* BEMP Interrupt Enable/Status Register */
+#define	BEMP9		0x0200	/* b9: PIPE9 */
+#define	BEMP8		0x0100	/* b8: PIPE8 */
+#define	BEMP7		0x0080	/* b7: PIPE7 */
+#define	BEMP6		0x0040	/* b6: PIPE6 */
+#define	BEMP5		0x0020	/* b5: PIPE5 */
+#define	BEMP4		0x0010	/* b4: PIPE4 */
+#define	BEMP3		0x0008	/* b3: PIPE3 */
+#define	BEMP2		0x0004	/* b2: PIPE2 */
+#define	BEMP1		0x0002	/* b1: PIPE1 */
+#define	BEMP0		0x0001	/* b0: PIPE0 */
+
+/* SOF Pin Configuration Register */
+#define	TRNENSEL	0x0100	/* b8: Select transaction enable period */
+#define	BRDYM		0x0040	/* b6: BRDY clear timing */
+#define	INTL		0x0020	/* b5: Interrupt sense select */
+#define	EDGESTS		0x0010	/* b4:  */
+#define	SOFMODE		0x000C	/* b3-2: SOF pin select */
+#define	  SOF_125US	 0x0008	  /* SOF OUT 125us Frame Signal */
+#define	  SOF_1MS	 0x0004	  /* SOF OUT 1ms Frame Signal */
+#define	  SOF_DISABLE	 0x0000	  /* SOF OUT Disable */
+
+/* Interrupt Status Register 0 */
+#define	VBINT	0x8000	/* b15: VBUS interrupt */
+#define	RESM	0x4000	/* b14: Resume interrupt */
+#define	SOFR	0x2000	/* b13: SOF frame update interrupt */
+#define	DVST	0x1000	/* b12: Device state transition interrupt */
+#define	CTRT	0x0800	/* b11: Control transfer stage transition interrupt */
+#define	BEMP	0x0400	/* b10: Buffer empty interrupt */
+#define	NRDY	0x0200	/* b9: Buffer not ready interrupt */
+#define	BRDY	0x0100	/* b8: Buffer ready interrupt */
+#define	VBSTS	0x0080	/* b7: VBUS input port */
+#define	DVSQ	0x0070	/* b6-4: Device state */
+#define	  DS_SPD_CNFG	 0x0070	  /* Suspend Configured */
+#define	  DS_SPD_ADDR	 0x0060	  /* Suspend Address */
+#define	  DS_SPD_DFLT	 0x0050	  /* Suspend Default */
+#define	  DS_SPD_POWR	 0x0040	  /* Suspend Powered */
+#define	  DS_SUSP	 0x0040	  /* Suspend */
+#define	  DS_CNFG	 0x0030	  /* Configured */
+#define	  DS_ADDS	 0x0020	  /* Address */
+#define	  DS_DFLT	 0x0010	  /* Default */
+#define	  DS_POWR	 0x0000	  /* Powered */
+#define	DVSQS		0x0030	/* b5-4: Device state */
+#define	VALID		0x0008	/* b3: Setup packet detected flag */
+#define	CTSQ		0x0007	/* b2-0: Control transfer stage */
+#define	  CS_SQER	 0x0006	  /* Sequence error */
+#define	  CS_WRND	 0x0005	  /* Control write nodata status stage */
+#define	  CS_WRSS	 0x0004	  /* Control write status stage */
+#define	  CS_WRDS	 0x0003	  /* Control write data stage */
+#define	  CS_RDSS	 0x0002	  /* Control read status stage */
+#define	  CS_RDDS	 0x0001	  /* Control read data stage */
+#define	  CS_IDST	 0x0000	  /* Idle or setup stage */
+
+/* Interrupt Status Register 1 */
+#define	OVRCR		0x8000	/* b15: Over-current interrupt */
+#define	BCHG		0x4000	/* b14: USB bus chenge interrupt */
+#define	DTCH		0x1000	/* b12: Detach sense interrupt */
+#define	ATTCH		0x0800	/* b11: Attach sense interrupt */
+#define	EOFERR		0x0040	/* b6: EOF-error interrupt */
+#define	SIGN		0x0020	/* b5: Setup ignore interrupt */
+#define	SACK		0x0010	/* b4: Setup acknowledge interrupt */
+
+/* Frame Number Register */
+#define	OVRN		0x8000	/* b15: Overrun error */
+#define	CRCE		0x4000	/* b14: Received data error */
+#define	FRNM		0x07FF	/* b10-0: Frame number */
+
+/* Micro Frame Number Register */
+#define	UFRNM		0x0007	/* b2-0: Micro frame number */
+
+/* Default Control Pipe Maxpacket Size Register */
+/* Pipe Maxpacket Size Register */
+#define	DEVSEL	0xF000	/* b15-14: Device address select */
+#define	MAXP	0x007F	/* b6-0: Maxpacket size of default control pipe */
+
+/* Default Control Pipe Control Register */
+#define	BSTS		0x8000	/* b15: Buffer status */
+#define	SUREQ		0x4000	/* b14: Send USB request  */
+#define	CSCLR		0x2000	/* b13: complete-split status clear */
+#define	CSSTS		0x1000	/* b12: complete-split status */
+#define	SUREQCLR	0x0800	/* b11: stop setup request */
+#define	SQCLR		0x0100	/* b8: Sequence toggle bit clear */
+#define	SQSET		0x0080	/* b7: Sequence toggle bit set */
+#define	SQMON		0x0040	/* b6: Sequence toggle bit monitor */
+#define	PBUSY		0x0020	/* b5: pipe busy */
+#define	PINGE		0x0010	/* b4: ping enable */
+#define	CCPL		0x0004	/* b2: Enable control transfer complete */
+#define	PID		0x0003	/* b1-0: Response PID */
+#define	  PID_STALL11	 0x0003	  /* STALL */
+#define	  PID_STALL	 0x0002	  /* STALL */
+#define	  PID_BUF	 0x0001	  /* BUF */
+#define	  PID_NAK	 0x0000	  /* NAK */
+
+/* Pipe Window Select Register */
+#define	PIPENM		0x0007	/* b2-0: Pipe select */
+
+/* Pipe Configuration Register */
+#define	R8A66597_TYP	0xC000	/* b15-14: Transfer type */
+#define	  R8A66597_ISO	 0xC000		  /* Isochronous */
+#define	  R8A66597_INT	 0x8000		  /* Interrupt */
+#define	  R8A66597_BULK	 0x4000		  /* Bulk */
+#define	R8A66597_BFRE	0x0400	/* b10: Buffer ready interrupt mode select */
+#define	R8A66597_DBLB	0x0200	/* b9: Double buffer mode select */
+#define	R8A66597_CNTMD	0x0100	/* b8: Continuous transfer mode select */
+#define	R8A66597_SHTNAK	0x0080	/* b7: Transfer end NAK */
+#define	R8A66597_DIR	0x0010	/* b4: Transfer direction select */
+#define	R8A66597_EPNUM	0x000F	/* b3-0: Eendpoint number select */
+
+/* Pipe Buffer Configuration Register */
+#define	BUFSIZE		0x7C00	/* b14-10: Pipe buffer size */
+#define	BUFNMB		0x007F	/* b6-0: Pipe buffer number */
+#define	PIPE0BUF	256
+#define	PIPExBUF	64
+
+/* Pipe Maxpacket Size Register */
+#define	MXPS		0x07FF	/* b10-0: Maxpacket size */
+
+/* Pipe Cycle Configuration Register */
+#define	IFIS	0x1000	/* b12: Isochronous in-buffer flush mode select */
+#define	IITV	0x0007	/* b2-0: Isochronous interval */
+
+/* Pipex Control Register */
+#define	BSTS	0x8000	/* b15: Buffer status */
+#define	INBUFM	0x4000	/* b14: IN buffer monitor (Only for PIPE1 to 5) */
+#define	CSCLR	0x2000	/* b13: complete-split status clear */
+#define	CSSTS	0x1000	/* b12: complete-split status */
+#define	ATREPM	0x0400	/* b10: Auto repeat mode */
+#define	ACLRM	0x0200	/* b9: Out buffer auto clear mode */
+#define	SQCLR	0x0100	/* b8: Sequence toggle bit clear */
+#define	SQSET	0x0080	/* b7: Sequence toggle bit set */
+#define	SQMON	0x0040	/* b6: Sequence toggle bit monitor */
+#define	PBUSY	0x0020	/* b5: pipe busy */
+#define	PID	0x0003	/* b1-0: Response PID */
+
+/* PIPExTRE */
+#define	TRENB		0x0200	/* b9: Transaction counter enable */
+#define	TRCLR		0x0100	/* b8: Transaction counter clear */
+
+/* PIPExTRN */
+#define	TRNCNT		0xFFFF	/* b15-0: Transaction counter */
+
+/* DEVADDx */
+#define	UPPHUB		0x7800
+#define	HUBPORT		0x0700
+#define	USBSPD		0x00C0
+#define	RTPORT		0x0001
+
+#endif /* __LINUX_USB_R8A66597_H */
 
-- 
cgit v1.2.3-71-gd317


From 2c59b0b70b9d5d61c726f179724660c4c2423f31 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 22 Jul 2009 14:41:35 +0000
Subject: usb: m66592-udc platform data on_chip support

Convert the m66592-udc driver to use the on_chip flag
from platform data to enable on chip behaviour instead
of relying on CONFIG_SUPERH_BUILT_IN_M66592 ugliness.

This makes the code cleaner and also allows us to support
both external and internal m66592 with the same kernel.

It also makes the Kconfig part more future proof since
we with this patch can add support for new processors
with on-chip m66592 without modifying the Kconfig.

The patch adds a m66592 header file for platform data
and ties in platform data to the existing m66592 devices.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-highlander/setup.c |   7 +
 arch/sh/boards/mach-x3proto/setup.c    |   7 +
 arch/sh/kernel/cpu/sh4a/setup-sh7722.c |   8 +-
 drivers/usb/gadget/Kconfig             |  10 --
 drivers/usb/gadget/m66592-udc.c        | 252 +++++++++++++++++++--------------
 drivers/usb/gadget/m66592-udc.h        |  89 ++++++------
 include/linux/usb/m66592.h             |  44 ++++++
 7 files changed, 257 insertions(+), 160 deletions(-)
 create mode 100644 include/linux/usb/m66592.h

(limited to 'include/linux')

diff --git a/arch/sh/boards/mach-highlander/setup.c b/arch/sh/boards/mach-highlander/setup.c
index 1639f8915000..566e69d8d729 100644
--- a/arch/sh/boards/mach-highlander/setup.c
+++ b/arch/sh/boards/mach-highlander/setup.c
@@ -22,6 +22,7 @@
 #include <linux/irq.h>
 #include <linux/interrupt.h>
 #include <linux/usb/r8a66597.h>
+#include <linux/usb/m66592.h>
 #include <net/ax88796.h>
 #include <asm/machvec.h>
 #include <mach/highlander.h>
@@ -60,6 +61,11 @@ static struct platform_device r8a66597_usb_host_device = {
 	.resource	= r8a66597_usb_host_resources,
 };
 
+static struct m66592_platdata usbf_platdata = {
+	.xtal = M66592_PLATDATA_XTAL_24MHZ,
+	.vif = 1,
+};
+
 static struct resource m66592_usb_peripheral_resources[] = {
 	[0] = {
 		.name	= "m66592_udc",
@@ -81,6 +87,7 @@ static struct platform_device m66592_usb_peripheral_device = {
 	.dev = {
 		.dma_mask		= NULL,		/* don't use dma */
 		.coherent_dma_mask	= 0xffffffff,
+		.platform_data		= &usbf_platdata,
 	},
 	.num_resources	= ARRAY_SIZE(m66592_usb_peripheral_resources),
 	.resource	= m66592_usb_peripheral_resources,
diff --git a/arch/sh/boards/mach-x3proto/setup.c b/arch/sh/boards/mach-x3proto/setup.c
index 8913ae39a802..efe4cb9f8a77 100644
--- a/arch/sh/boards/mach-x3proto/setup.c
+++ b/arch/sh/boards/mach-x3proto/setup.c
@@ -17,6 +17,7 @@
 #include <linux/irq.h>
 #include <linux/interrupt.h>
 #include <linux/usb/r8a66597.h>
+#include <linux/usb/m66592.h>
 #include <asm/ilsel.h>
 
 static struct resource heartbeat_resources[] = {
@@ -89,6 +90,11 @@ static struct platform_device r8a66597_usb_host_device = {
 	.resource	= r8a66597_usb_host_resources,
 };
 
+static struct m66592_platdata usbf_platdata = {
+	.xtal = M66592_PLATDATA_XTAL_24MHZ,
+	.vif = 1,
+};
+
 static struct resource m66592_usb_peripheral_resources[] = {
 	[0] = {
 		.name	= "m66592_udc",
@@ -109,6 +115,7 @@ static struct platform_device m66592_usb_peripheral_device = {
 	.dev = {
 		.dma_mask		= NULL,		/* don't use dma */
 		.coherent_dma_mask	= 0xffffffff,
+		.platform_data		= &usbf_platdata,
 	},
 	.num_resources	= ARRAY_SIZE(m66592_usb_peripheral_resources),
 	.resource	= m66592_usb_peripheral_resources,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index ea524a2da3e4..0bad14a44238 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -13,6 +13,7 @@
 #include <linux/serial_sci.h>
 #include <linux/mm.h>
 #include <linux/uio_driver.h>
+#include <linux/usb/m66592.h>
 #include <linux/sh_timer.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
@@ -47,9 +48,13 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
+static struct m66592_platdata usbf_platdata = {
+	.on_chip = 1,
+};
+
 static struct resource usbf_resources[] = {
 	[0] = {
-		.name	= "m66592_udc",
+		.name	= "USBF",
 		.start	= 0x04480000,
 		.end	= 0x044800FF,
 		.flags	= IORESOURCE_MEM,
@@ -67,6 +72,7 @@ static struct platform_device usbf_device = {
 	.dev = {
 		.dma_mask		= NULL,
 		.coherent_dma_mask	= 0xffffffff,
+		.platform_data		= &usbf_platdata,
 	},
 	.num_resources	= ARRAY_SIZE(usbf_resources),
 	.resource	= usbf_resources,
diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index 7f8e83a954ac..b7f10bc25c2c 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig
@@ -360,16 +360,6 @@ config USB_M66592
 	default USB_GADGET
 	select USB_GADGET_SELECTED
 
-config SUPERH_BUILT_IN_M66592
-	boolean "Enable SuperH built-in USB like the M66592"
-	depends on USB_GADGET_M66592 && CPU_SUBTYPE_SH7722
-	help
-	   SH7722 has USB like the M66592.
-
-	   The transfer rate is very slow when use "Ethernet Gadget".
-	   However, this problem is improved if change a value of
-	   NET_IP_ALIGN to 4.
-
 #
 # Controllers available only in discrete form (and all PCI controllers)
 #
diff --git a/drivers/usb/gadget/m66592-udc.c b/drivers/usb/gadget/m66592-udc.c
index 0dddd2f8ff35..a61c70caff12 100644
--- a/drivers/usb/gadget/m66592-udc.c
+++ b/drivers/usb/gadget/m66592-udc.c
@@ -31,38 +31,12 @@
 
 #include "m66592-udc.h"
 
-
 MODULE_DESCRIPTION("M66592 USB gadget driver");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Yoshihiro Shimoda");
 MODULE_ALIAS("platform:m66592_udc");
 
-#define DRIVER_VERSION	"26 Jun 2009"
-
-/* module parameters */
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592)
-static unsigned short endian = M66592_LITTLE;
-module_param(endian, ushort, 0644);
-MODULE_PARM_DESC(endian, "data endian: big=0, little=0 (default=0)");
-#else
-static unsigned short clock = M66592_XTAL24;
-module_param(clock, ushort, 0644);
-MODULE_PARM_DESC(clock, "input clock: 48MHz=32768, 24MHz=16384, 12MHz=0 "
-		"(default=16384)");
-
-static unsigned short vif = M66592_LDRV;
-module_param(vif, ushort, 0644);
-MODULE_PARM_DESC(vif, "input VIF: 3.3V=32768, 1.5V=0 (default=32768)");
-
-static unsigned short endian;
-module_param(endian, ushort, 0644);
-MODULE_PARM_DESC(endian, "data endian: big=256, little=0 (default=0)");
-
-static unsigned short irq_sense = M66592_INTL;
-module_param(irq_sense, ushort, 0644);
-MODULE_PARM_DESC(irq_sense, "IRQ sense: low level=2, falling edge=0 "
-		"(default=2)");
-#endif
+#define DRIVER_VERSION	"21 July 2009"
 
 static const char udc_name[] = "m66592_udc";
 static const char *m66592_ep_name[] = {
@@ -244,6 +218,7 @@ static inline int get_buffer_size(struct m66592 *m66592, u16 pipenum)
 static inline void pipe_change(struct m66592 *m66592, u16 pipenum)
 {
 	struct m66592_ep *ep = m66592->pipenum2ep[pipenum];
+	unsigned short mbw;
 
 	if (ep->use_dma)
 		return;
@@ -252,7 +227,12 @@ static inline void pipe_change(struct m66592 *m66592, u16 pipenum)
 
 	ndelay(450);
 
-	m66592_bset(m66592, M66592_MBW, ep->fifosel);
+	if (m66592->pdata->on_chip)
+		mbw = M66592_MBW_32;
+	else
+		mbw = M66592_MBW_16;
+
+	m66592_bset(m66592, mbw, ep->fifosel);
 }
 
 static int pipe_buffer_setting(struct m66592 *m66592,
@@ -332,6 +312,7 @@ static void pipe_buffer_release(struct m66592 *m66592,
 static void pipe_initialize(struct m66592_ep *ep)
 {
 	struct m66592 *m66592 = ep->m66592;
+	unsigned short mbw;
 
 	m66592_mdfy(m66592, 0, M66592_CURPIPE, ep->fifosel);
 
@@ -343,7 +324,12 @@ static void pipe_initialize(struct m66592_ep *ep)
 
 		ndelay(450);
 
-		m66592_bset(m66592, M66592_MBW, ep->fifosel);
+		if (m66592->pdata->on_chip)
+			mbw = M66592_MBW_32;
+		else
+			mbw = M66592_MBW_16;
+
+		m66592_bset(m66592, mbw, ep->fifosel);
 	}
 }
 
@@ -359,15 +345,13 @@ static void m66592_ep_setting(struct m66592 *m66592, struct m66592_ep *ep,
 			ep->fifosel = M66592_D0FIFOSEL;
 			ep->fifoctr = M66592_D0FIFOCTR;
 			ep->fifotrn = M66592_D0FIFOTRN;
-#if !defined(CONFIG_SUPERH_BUILT_IN_M66592)
-		} else if (m66592->num_dma == 1) {
+		} else if (!m66592->pdata->on_chip && m66592->num_dma == 1) {
 			m66592->num_dma++;
 			ep->use_dma = 1;
 			ep->fifoaddr = M66592_D1FIFO;
 			ep->fifosel = M66592_D1FIFOSEL;
 			ep->fifoctr = M66592_D1FIFOCTR;
 			ep->fifotrn = M66592_D1FIFOTRN;
-#endif
 		} else {
 			ep->use_dma = 0;
 			ep->fifoaddr = M66592_CFIFO;
@@ -612,76 +596,120 @@ static void start_ep0(struct m66592_ep *ep, struct m66592_request *req)
 	}
 }
 
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592)
 static void init_controller(struct m66592 *m66592)
 {
-	m66592_bset(m66592, M66592_HSE, M66592_SYSCFG);		/* High spd */
-	m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG);
-	m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG);
-	m66592_bset(m66592, M66592_USBE, M66592_SYSCFG);
+	unsigned int endian;
 
-	/* This is a workaound for SH7722 2nd cut */
-	m66592_bset(m66592, 0x8000, M66592_DVSTCTR);
-	m66592_bset(m66592, 0x1000, M66592_TESTMODE);
-	m66592_bclr(m66592, 0x8000, M66592_DVSTCTR);
+	if (m66592->pdata->on_chip) {
+		if (m66592->pdata->endian)
+			endian = 0; /* big endian */
+		else
+			endian = M66592_LITTLE; /* little endian */
 
-	m66592_bset(m66592, M66592_INTL, M66592_INTENB1);
+		m66592_bset(m66592, M66592_HSE, M66592_SYSCFG);	/* High spd */
+		m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG);
+		m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG);
+		m66592_bset(m66592, M66592_USBE, M66592_SYSCFG);
 
-	m66592_write(m66592, 0, M66592_CFBCFG);
-	m66592_write(m66592, 0, M66592_D0FBCFG);
-	m66592_bset(m66592, endian, M66592_CFBCFG);
-	m66592_bset(m66592, endian, M66592_D0FBCFG);
-}
-#else	/* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */
-static void init_controller(struct m66592 *m66592)
-{
-	m66592_bset(m66592, (vif & M66592_LDRV) | (endian & M66592_BIGEND),
-			M66592_PINCFG);
-	m66592_bset(m66592, M66592_HSE, M66592_SYSCFG);		/* High spd */
-	m66592_mdfy(m66592, clock & M66592_XTAL, M66592_XTAL, M66592_SYSCFG);
+		/* This is a workaound for SH7722 2nd cut */
+		m66592_bset(m66592, 0x8000, M66592_DVSTCTR);
+		m66592_bset(m66592, 0x1000, M66592_TESTMODE);
+		m66592_bclr(m66592, 0x8000, M66592_DVSTCTR);
 
-	m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG);
-	m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG);
-	m66592_bset(m66592, M66592_USBE, M66592_SYSCFG);
+		m66592_bset(m66592, M66592_INTL, M66592_INTENB1);
+
+		m66592_write(m66592, 0, M66592_CFBCFG);
+		m66592_write(m66592, 0, M66592_D0FBCFG);
+		m66592_bset(m66592, endian, M66592_CFBCFG);
+		m66592_bset(m66592, endian, M66592_D0FBCFG);
+	} else {
+		unsigned int clock, vif, irq_sense;
+
+		if (m66592->pdata->endian)
+			endian = M66592_BIGEND; /* big endian */
+		else
+			endian = 0; /* little endian */
+
+		if (m66592->pdata->vif)
+			vif = M66592_LDRV; /* 3.3v */
+		else
+			vif = 0; /* 1.5v */
+
+		switch (m66592->pdata->xtal) {
+		case M66592_PLATDATA_XTAL_12MHZ:
+			clock = M66592_XTAL12;
+			break;
+		case M66592_PLATDATA_XTAL_24MHZ:
+			clock = M66592_XTAL24;
+			break;
+		case M66592_PLATDATA_XTAL_48MHZ:
+			clock = M66592_XTAL48;
+			break;
+		default:
+			pr_warning("m66592-udc: xtal configuration error\n");
+			clock = 0;
+		}
 
-	m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG);
+		switch (m66592->irq_trigger) {
+		case IRQF_TRIGGER_LOW:
+			irq_sense = M66592_INTL;
+			break;
+		case IRQF_TRIGGER_FALLING:
+			irq_sense = 0;
+			break;
+		default:
+			pr_warning("m66592-udc: irq trigger config error\n");
+			irq_sense = 0;
+		}
 
-	msleep(3);
+		m66592_bset(m66592,
+			    (vif & M66592_LDRV) | (endian & M66592_BIGEND),
+			    M66592_PINCFG);
+		m66592_bset(m66592, M66592_HSE, M66592_SYSCFG);	/* High spd */
+		m66592_mdfy(m66592, clock & M66592_XTAL, M66592_XTAL,
+			    M66592_SYSCFG);
+		m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG);
+		m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG);
+		m66592_bset(m66592, M66592_USBE, M66592_SYSCFG);
 
-	m66592_bset(m66592, M66592_RCKE | M66592_PLLC, M66592_SYSCFG);
+		m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG);
+
+		msleep(3);
 
-	msleep(1);
+		m66592_bset(m66592, M66592_RCKE | M66592_PLLC, M66592_SYSCFG);
 
-	m66592_bset(m66592, M66592_SCKE, M66592_SYSCFG);
+		msleep(1);
 
-	m66592_bset(m66592, irq_sense & M66592_INTL, M66592_INTENB1);
-	m66592_write(m66592, M66592_BURST | M66592_CPU_ADR_RD_WR,
-			M66592_DMA0CFG);
+		m66592_bset(m66592, M66592_SCKE, M66592_SYSCFG);
+
+		m66592_bset(m66592, irq_sense & M66592_INTL, M66592_INTENB1);
+		m66592_write(m66592, M66592_BURST | M66592_CPU_ADR_RD_WR,
+			     M66592_DMA0CFG);
+	}
 }
-#endif	/* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */
 
 static void disable_controller(struct m66592 *m66592)
 {
-#if !defined(CONFIG_SUPERH_BUILT_IN_M66592)
-	m66592_bclr(m66592, M66592_SCKE, M66592_SYSCFG);
-	udelay(1);
-	m66592_bclr(m66592, M66592_PLLC, M66592_SYSCFG);
-	udelay(1);
-	m66592_bclr(m66592, M66592_RCKE, M66592_SYSCFG);
-	udelay(1);
-	m66592_bclr(m66592, M66592_XCKE, M66592_SYSCFG);
-#endif
+	if (!m66592->pdata->on_chip) {
+		m66592_bclr(m66592, M66592_SCKE, M66592_SYSCFG);
+		udelay(1);
+		m66592_bclr(m66592, M66592_PLLC, M66592_SYSCFG);
+		udelay(1);
+		m66592_bclr(m66592, M66592_RCKE, M66592_SYSCFG);
+		udelay(1);
+		m66592_bclr(m66592, M66592_XCKE, M66592_SYSCFG);
+	}
 }
 
 static void m66592_start_xclock(struct m66592 *m66592)
 {
-#if !defined(CONFIG_SUPERH_BUILT_IN_M66592)
 	u16 tmp;
 
-	tmp = m66592_read(m66592, M66592_SYSCFG);
-	if (!(tmp & M66592_XCKE))
-		m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG);
-#endif
+	if (!m66592->pdata->on_chip) {
+		tmp = m66592_read(m66592, M66592_SYSCFG);
+		if (!(tmp & M66592_XCKE))
+			m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG);
+	}
 }
 
 /*-------------------------------------------------------------------------*/
@@ -1169,8 +1197,7 @@ static irqreturn_t m66592_irq(int irq, void *_m66592)
 	intsts0 = m66592_read(m66592, M66592_INTSTS0);
 	intenb0 = m66592_read(m66592, M66592_INTENB0);
 
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592)
-	if (!intsts0 && !intenb0) {
+	if (m66592->pdata->on_chip && !intsts0 && !intenb0) {
 		/*
 		 * When USB clock stops, it cannot read register. Even if a
 		 * clock stops, the interrupt occurs. So this driver turn on
@@ -1180,7 +1207,6 @@ static irqreturn_t m66592_irq(int irq, void *_m66592)
 		intsts0 = m66592_read(m66592, M66592_INTSTS0);
 		intenb0 = m66592_read(m66592, M66592_INTENB0);
 	}
-#endif
 
 	savepipe = m66592_read(m66592, M66592_CFIFOSEL);
 
@@ -1526,9 +1552,11 @@ static int __exit m66592_remove(struct platform_device *pdev)
 	iounmap(m66592->reg);
 	free_irq(platform_get_irq(pdev, 0), m66592);
 	m66592_free_request(&m66592->ep[0].ep, m66592->ep0_req);
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK)
-	clk_disable(m66592->clk);
-	clk_put(m66592->clk);
+#ifdef CONFIG_HAVE_CLK
+	if (m66592->pdata->on_chip) {
+		clk_disable(m66592->clk);
+		clk_put(m66592->clk);
+	}
 #endif
 	kfree(m66592);
 	return 0;
@@ -1540,11 +1568,10 @@ static void nop_completion(struct usb_ep *ep, struct usb_request *r)
 
 static int __init m66592_probe(struct platform_device *pdev)
 {
-	struct resource *res;
-	int irq;
+	struct resource *res, *ires;
 	void __iomem *reg = NULL;
 	struct m66592 *m66592 = NULL;
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK)
+#ifdef CONFIG_HAVE_CLK
 	char clk_name[8];
 #endif
 	int ret = 0;
@@ -1557,10 +1584,11 @@ static int __init m66592_probe(struct platform_device *pdev)
 		goto clean_up;
 	}
 
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0) {
+	ires = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!ires) {
 		ret = -ENODEV;
-		pr_err("platform_get_irq error.\n");
+		dev_err(&pdev->dev,
+			"platform_get_resource IORESOURCE_IRQ error.\n");
 		goto clean_up;
 	}
 
@@ -1571,6 +1599,12 @@ static int __init m66592_probe(struct platform_device *pdev)
 		goto clean_up;
 	}
 
+	if (pdev->dev.platform_data == NULL) {
+		dev_err(&pdev->dev, "no platform data\n");
+		ret = -ENODEV;
+		goto clean_up;
+	}
+
 	/* initialize ucd */
 	m66592 = kzalloc(sizeof(struct m66592), GFP_KERNEL);
 	if (m66592 == NULL) {
@@ -1578,6 +1612,9 @@ static int __init m66592_probe(struct platform_device *pdev)
 		goto clean_up;
 	}
 
+	m66592->pdata = pdev->dev.platform_data;
+	m66592->irq_trigger = ires->flags & IRQF_TRIGGER_MASK;
+
 	spin_lock_init(&m66592->lock);
 	dev_set_drvdata(&pdev->dev, m66592);
 
@@ -1595,22 +1632,25 @@ static int __init m66592_probe(struct platform_device *pdev)
 	m66592->timer.data = (unsigned long)m66592;
 	m66592->reg = reg;
 
-	ret = request_irq(irq, m66592_irq, IRQF_DISABLED | IRQF_SHARED,
+	ret = request_irq(ires->start, m66592_irq, IRQF_DISABLED | IRQF_SHARED,
 			udc_name, m66592);
 	if (ret < 0) {
 		pr_err("request_irq error (%d)\n", ret);
 		goto clean_up;
 	}
 
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK)
-	snprintf(clk_name, sizeof(clk_name), "usbf%d", pdev->id);
-	m66592->clk = clk_get(&pdev->dev, clk_name);
-	if (IS_ERR(m66592->clk)) {
-		dev_err(&pdev->dev, "cannot get clock \"%s\"\n", clk_name);
-		ret = PTR_ERR(m66592->clk);
-		goto clean_up2;
+#ifdef CONFIG_HAVE_CLK
+	if (m66592->pdata->on_chip) {
+		snprintf(clk_name, sizeof(clk_name), "usbf%d", pdev->id);
+		m66592->clk = clk_get(&pdev->dev, clk_name);
+		if (IS_ERR(m66592->clk)) {
+			dev_err(&pdev->dev, "cannot get clock \"%s\"\n",
+				clk_name);
+			ret = PTR_ERR(m66592->clk);
+			goto clean_up2;
+		}
+		clk_enable(m66592->clk);
 	}
-	clk_enable(m66592->clk);
 #endif
 	INIT_LIST_HEAD(&m66592->gadget.ep_list);
 	m66592->gadget.ep0 = &m66592->ep[0].ep;
@@ -1652,12 +1692,14 @@ static int __init m66592_probe(struct platform_device *pdev)
 	return 0;
 
 clean_up3:
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK)
-	clk_disable(m66592->clk);
-	clk_put(m66592->clk);
+#ifdef CONFIG_HAVE_CLK
+	if (m66592->pdata->on_chip) {
+		clk_disable(m66592->clk);
+		clk_put(m66592->clk);
+	}
 clean_up2:
 #endif
-	free_irq(irq, m66592);
+	free_irq(ires->start, m66592);
 clean_up:
 	if (m66592) {
 		if (m66592->ep0_req)
diff --git a/drivers/usb/gadget/m66592-udc.h b/drivers/usb/gadget/m66592-udc.h
index 9a9c2bf9fbd5..8b960deed680 100644
--- a/drivers/usb/gadget/m66592-udc.h
+++ b/drivers/usb/gadget/m66592-udc.h
@@ -23,10 +23,12 @@
 #ifndef __M66592_UDC_H__
 #define __M66592_UDC_H__
 
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK)
+#ifdef CONFIG_HAVE_CLK
 #include <linux/clk.h>
 #endif
 
+#include <linux/usb/m66592.h>
+
 #define M66592_SYSCFG		0x00
 #define M66592_XTAL		0xC000	/* b15-14: Crystal selection */
 #define   M66592_XTAL48		 0x8000		/* 48MHz */
@@ -76,11 +78,11 @@
 #define   M66592_P_TST_J	 0x0001		/* PERI TEST J */
 #define   M66592_P_TST_NORMAL	 0x0000		/* PERI Normal Mode */
 
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592)
+/* built-in registers */
 #define M66592_CFBCFG		0x0A
 #define M66592_D0FBCFG		0x0C
 #define M66592_LITTLE		0x0100	/* b8: Little endian mode */
-#else
+/* external chip case */
 #define M66592_PINCFG		0x0A
 #define M66592_LDRV		0x8000	/* b15: Drive Current Adjust */
 #define M66592_BIGEND		0x0100	/* b8: Big endian mode */
@@ -100,8 +102,8 @@
 #define M66592_PKTM		0x0020	/* b5: Packet mode */
 #define M66592_DENDE		0x0010	/* b4: Dend enable */
 #define M66592_OBUS		0x0004	/* b2: OUTbus mode */
-#endif	/* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */
 
+/* common case */
 #define M66592_CFIFO		0x10
 #define M66592_D0FIFO		0x14
 #define M66592_D1FIFO		0x18
@@ -113,13 +115,9 @@
 #define M66592_REW		0x4000	/* b14: Buffer rewind */
 #define M66592_DCLRM		0x2000	/* b13: DMA buffer clear mode */
 #define M66592_DREQE		0x1000	/* b12: DREQ output enable */
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592)
-#define M66592_MBW		0x0800	/* b11: Maximum bit width for FIFO */
-#else
-#define M66592_MBW		0x0400	/* b10: Maximum bit width for FIFO */
-#define   M66592_MBW_8		 0x0000   /*  8bit */
-#define   M66592_MBW_16		 0x0400   /* 16bit */
-#endif	/* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */
+#define M66592_MBW_8		0x0000   /*  8bit */
+#define M66592_MBW_16		0x0400   /* 16bit */
+#define M66592_MBW_32		0x0800   /* 32bit */
 #define M66592_TRENB		0x0200	/* b9: Transaction counter enable */
 #define M66592_TRCLR		0x0100	/* b8: Transaction counter clear */
 #define M66592_DEZPM		0x0080	/* b7: Zero-length packet mode */
@@ -480,9 +478,11 @@ struct m66592_ep {
 struct m66592 {
 	spinlock_t		lock;
 	void __iomem		*reg;
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK)
+#ifdef CONFIG_HAVE_CLK
 	struct clk *clk;
 #endif
+	struct m66592_platdata	*pdata;
+	unsigned long		irq_trigger;
 
 	struct usb_gadget		gadget;
 	struct usb_gadget_driver	*driver;
@@ -546,13 +546,13 @@ static inline void m66592_read_fifo(struct m66592 *m66592,
 {
 	unsigned long fifoaddr = (unsigned long)m66592->reg + offset;
 
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592)
-	len = (len + 3) / 4;
-	insl(fifoaddr, buf, len);
-#else
-	len = (len + 1) / 2;
-	insw(fifoaddr, buf, len);
-#endif
+	if (m66592->pdata->on_chip) {
+		len = (len + 3) / 4;
+		insl(fifoaddr, buf, len);
+	} else {
+		len = (len + 1) / 2;
+		insw(fifoaddr, buf, len);
+	}
 }
 
 static inline void m66592_write(struct m66592 *m66592, u16 val,
@@ -566,33 +566,34 @@ static inline void m66592_write_fifo(struct m66592 *m66592,
 		void *buf, unsigned long len)
 {
 	unsigned long fifoaddr = (unsigned long)m66592->reg + offset;
-#if defined(CONFIG_SUPERH_BUILT_IN_M66592)
-	unsigned long count;
-	unsigned char *pb;
-	int i;
-
-	count = len / 4;
-	outsl(fifoaddr, buf, count);
-
-	if (len & 0x00000003) {
-		pb = buf + count * 4;
-		for (i = 0; i < (len & 0x00000003); i++) {
-			if (m66592_read(m66592, M66592_CFBCFG))	/* little */
-				outb(pb[i], fifoaddr + (3 - i));
-			else
-				outb(pb[i], fifoaddr + i);
+
+	if (m66592->pdata->on_chip) {
+		unsigned long count;
+		unsigned char *pb;
+		int i;
+
+		count = len / 4;
+		outsl(fifoaddr, buf, count);
+
+		if (len & 0x00000003) {
+			pb = buf + count * 4;
+			for (i = 0; i < (len & 0x00000003); i++) {
+				if (m66592_read(m66592, M66592_CFBCFG))	/* le */
+					outb(pb[i], fifoaddr + (3 - i));
+				else
+					outb(pb[i], fifoaddr + i);
+			}
+		}
+	} else {
+		unsigned long odd = len & 0x0001;
+
+		len = len / 2;
+		outsw(fifoaddr, buf, len);
+		if (odd) {
+			unsigned char *p = buf + len*2;
+			outb(*p, fifoaddr);
 		}
 	}
-#else
-	unsigned long odd = len & 0x0001;
-
-	len = len / 2;
-	outsw(fifoaddr, buf, len);
-	if (odd) {
-		unsigned char *p = buf + len*2;
-		outb(*p, fifoaddr);
-	}
-#endif	/* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */
 }
 
 static inline void m66592_mdfy(struct m66592 *m66592, u16 val, u16 pat,
diff --git a/include/linux/usb/m66592.h b/include/linux/usb/m66592.h
new file mode 100644
index 000000000000..cda9625e7df0
--- /dev/null
+++ b/include/linux/usb/m66592.h
@@ -0,0 +1,44 @@
+/*
+ * M66592 driver platform data
+ *
+ * Copyright (C) 2009  Renesas Solutions Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __LINUX_USB_M66592_H
+#define __LINUX_USB_M66592_H
+
+#define M66592_PLATDATA_XTAL_12MHZ	0x01
+#define M66592_PLATDATA_XTAL_24MHZ	0x02
+#define M66592_PLATDATA_XTAL_48MHZ	0x03
+
+struct m66592_platdata {
+	/* one = on chip controller, zero = external controller */
+	unsigned	on_chip:1;
+
+	/* one = big endian, zero = little endian */
+	unsigned	endian:1;
+
+	/* (external controller only) M66592_PLATDATA_XTAL_nnMHZ */
+	unsigned	xtal:2;
+
+	/* (external controller only) one = 3.3V, zero = 1.5V */
+	unsigned	vif:1;
+
+};
+
+#endif /* __LINUX_USB_M66592_H */
+
-- 
cgit v1.2.3-71-gd317


From d9ab77161d811ffb0bccf396f7155cc905c1b9e1 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 22 Jul 2009 00:37:25 +0200
Subject: Driver Core: Make PM operations a const pointer

They are not supposed to be modified by drivers, so make them const.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/platform.c   |  2 +-
 drivers/base/power/main.c |  8 +++++---
 drivers/pci/pci-driver.c  | 16 ++++++++--------
 include/linux/device.h    |  9 +++++----
 4 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 455e55971d0e..ae5c4aa6d269 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -889,7 +889,7 @@ static int platform_pm_restore_noirq(struct device *dev)
 
 #endif /* !CONFIG_HIBERNATION */
 
-static struct dev_pm_ops platform_dev_pm_ops = {
+static const struct dev_pm_ops platform_dev_pm_ops = {
 	.prepare = platform_pm_prepare,
 	.complete = platform_pm_complete,
 	.suspend = platform_pm_suspend,
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 58a3e572f2c9..1b1a786b7dec 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -157,8 +157,9 @@ void device_pm_move_last(struct device *dev)
  *	@ops:	PM operations to choose from.
  *	@state:	PM transition of the system being carried out.
  */
-static int pm_op(struct device *dev, struct dev_pm_ops *ops,
-			pm_message_t state)
+static int pm_op(struct device *dev,
+		 const struct dev_pm_ops *ops,
+		 pm_message_t state)
 {
 	int error = 0;
 
@@ -220,7 +221,8 @@ static int pm_op(struct device *dev, struct dev_pm_ops *ops,
  *	The operation is executed with interrupts disabled by the only remaining
  *	functional CPU in the system.
  */
-static int pm_noirq_op(struct device *dev, struct dev_pm_ops *ops,
+static int pm_noirq_op(struct device *dev,
+			const struct dev_pm_ops *ops,
 			pm_message_t state)
 {
 	int error = 0;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index d76c4c85367e..0c2ea44ae5e1 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -575,7 +575,7 @@ static void pci_pm_complete(struct device *dev)
 static int pci_pm_suspend(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend(dev, PMSG_SUSPEND);
@@ -613,7 +613,7 @@ static int pci_pm_suspend(struct device *dev)
 static int pci_pm_suspend_noirq(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend_late(dev, PMSG_SUSPEND);
@@ -672,7 +672,7 @@ static int pci_pm_resume_noirq(struct device *dev)
 static int pci_pm_resume(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int error = 0;
 
 	/*
@@ -711,7 +711,7 @@ static int pci_pm_resume(struct device *dev)
 static int pci_pm_freeze(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend(dev, PMSG_FREEZE);
@@ -780,7 +780,7 @@ static int pci_pm_thaw_noirq(struct device *dev)
 static int pci_pm_thaw(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int error = 0;
 
 	if (pci_has_legacy_pm_support(pci_dev))
@@ -799,7 +799,7 @@ static int pci_pm_thaw(struct device *dev)
 static int pci_pm_poweroff(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend(dev, PMSG_HIBERNATE);
@@ -872,7 +872,7 @@ static int pci_pm_restore_noirq(struct device *dev)
 static int pci_pm_restore(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int error = 0;
 
 	/*
@@ -910,7 +910,7 @@ static int pci_pm_restore(struct device *dev)
 
 #endif /* !CONFIG_HIBERNATION */
 
-struct dev_pm_ops pci_dev_pm_ops = {
+const struct dev_pm_ops pci_dev_pm_ops = {
 	.prepare = pci_pm_prepare,
 	.complete = pci_pm_complete,
 	.suspend = pci_pm_suspend,
diff --git a/include/linux/device.h b/include/linux/device.h
index aebb81036db2..a28642975053 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -62,7 +62,7 @@ struct bus_type {
 	int (*suspend)(struct device *dev, pm_message_t state);
 	int (*resume)(struct device *dev);
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
 
 	struct bus_type_private *p;
 };
@@ -132,7 +132,7 @@ struct device_driver {
 	int (*resume) (struct device *dev);
 	struct attribute_group **groups;
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
 
 	struct driver_private *p;
 };
@@ -200,7 +200,8 @@ struct class {
 	int (*suspend)(struct device *dev, pm_message_t state);
 	int (*resume)(struct device *dev);
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
+
 	struct class_private *p;
 };
 
@@ -291,7 +292,7 @@ struct device_type {
 	char *(*nodename)(struct device *dev);
 	void (*release)(struct device *dev);
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
 };
 
 /* interface for exporting device attributes */
-- 
cgit v1.2.3-71-gd317


From 8150f32b90f630ad3e460f026ce338cb81685bc9 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 24 Jul 2009 22:11:32 -0700
Subject: Driver Core: Make PM operations a const pointer

They are not supposed to be modified by drivers, so make them const.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/platform.c   |  2 +-
 drivers/base/power/main.c |  8 +++++---
 drivers/pci/pci-driver.c  | 16 ++++++++--------
 include/linux/device.h    |  9 +++++----
 4 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 81cb01bfc356..c4b3fcee17b2 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -925,7 +925,7 @@ static int platform_pm_restore_noirq(struct device *dev)
 
 #endif /* !CONFIG_HIBERNATION */
 
-static struct dev_pm_ops platform_dev_pm_ops = {
+static const struct dev_pm_ops platform_dev_pm_ops = {
 	.prepare = platform_pm_prepare,
 	.complete = platform_pm_complete,
 	.suspend = platform_pm_suspend,
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 58a3e572f2c9..1b1a786b7dec 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -157,8 +157,9 @@ void device_pm_move_last(struct device *dev)
  *	@ops:	PM operations to choose from.
  *	@state:	PM transition of the system being carried out.
  */
-static int pm_op(struct device *dev, struct dev_pm_ops *ops,
-			pm_message_t state)
+static int pm_op(struct device *dev,
+		 const struct dev_pm_ops *ops,
+		 pm_message_t state)
 {
 	int error = 0;
 
@@ -220,7 +221,8 @@ static int pm_op(struct device *dev, struct dev_pm_ops *ops,
  *	The operation is executed with interrupts disabled by the only remaining
  *	functional CPU in the system.
  */
-static int pm_noirq_op(struct device *dev, struct dev_pm_ops *ops,
+static int pm_noirq_op(struct device *dev,
+			const struct dev_pm_ops *ops,
 			pm_message_t state)
 {
 	int error = 0;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index d76c4c85367e..0c2ea44ae5e1 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -575,7 +575,7 @@ static void pci_pm_complete(struct device *dev)
 static int pci_pm_suspend(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend(dev, PMSG_SUSPEND);
@@ -613,7 +613,7 @@ static int pci_pm_suspend(struct device *dev)
 static int pci_pm_suspend_noirq(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend_late(dev, PMSG_SUSPEND);
@@ -672,7 +672,7 @@ static int pci_pm_resume_noirq(struct device *dev)
 static int pci_pm_resume(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int error = 0;
 
 	/*
@@ -711,7 +711,7 @@ static int pci_pm_resume(struct device *dev)
 static int pci_pm_freeze(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend(dev, PMSG_FREEZE);
@@ -780,7 +780,7 @@ static int pci_pm_thaw_noirq(struct device *dev)
 static int pci_pm_thaw(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int error = 0;
 
 	if (pci_has_legacy_pm_support(pci_dev))
@@ -799,7 +799,7 @@ static int pci_pm_thaw(struct device *dev)
 static int pci_pm_poweroff(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend(dev, PMSG_HIBERNATE);
@@ -872,7 +872,7 @@ static int pci_pm_restore_noirq(struct device *dev)
 static int pci_pm_restore(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 	int error = 0;
 
 	/*
@@ -910,7 +910,7 @@ static int pci_pm_restore(struct device *dev)
 
 #endif /* !CONFIG_HIBERNATION */
 
-struct dev_pm_ops pci_dev_pm_ops = {
+const struct dev_pm_ops pci_dev_pm_ops = {
 	.prepare = pci_pm_prepare,
 	.complete = pci_pm_complete,
 	.suspend = pci_pm_suspend,
diff --git a/include/linux/device.h b/include/linux/device.h
index aebb81036db2..a28642975053 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -62,7 +62,7 @@ struct bus_type {
 	int (*suspend)(struct device *dev, pm_message_t state);
 	int (*resume)(struct device *dev);
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
 
 	struct bus_type_private *p;
 };
@@ -132,7 +132,7 @@ struct device_driver {
 	int (*resume) (struct device *dev);
 	struct attribute_group **groups;
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
 
 	struct driver_private *p;
 };
@@ -200,7 +200,8 @@ struct class {
 	int (*suspend)(struct device *dev, pm_message_t state);
 	int (*resume)(struct device *dev);
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
+
 	struct class_private *p;
 };
 
@@ -291,7 +292,7 @@ struct device_type {
 	char *(*nodename)(struct device *dev);
 	void (*release)(struct device *dev);
 
-	struct dev_pm_ops *pm;
+	const struct dev_pm_ops *pm;
 };
 
 /* interface for exporting device attributes */
-- 
cgit v1.2.3-71-gd317


From 633aae23ff31bef692a70772652e753a0ae59b81 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 22 Jul 2009 21:51:36 -0700
Subject: Input: serio - switch to using dev_pm_ops

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/serio/serio.c | 23 +++++++++++------------
 include/linux/serio.h       |  2 --
 2 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index d66f4944f2a0..0236f0d5fd91 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -931,15 +931,11 @@ static int serio_uevent(struct device *dev, struct kobj_uevent_env *env)
 #endif /* CONFIG_HOTPLUG */
 
 #ifdef CONFIG_PM
-static int serio_suspend(struct device *dev, pm_message_t state)
+static int serio_suspend(struct device *dev)
 {
 	struct serio *serio = to_serio_port(dev);
 
-	if (!serio->suspended && state.event == PM_EVENT_SUSPEND)
-		serio_cleanup(serio);
-
-	serio->suspended = state.event == PM_EVENT_SUSPEND ||
-			   state.event == PM_EVENT_FREEZE;
+	serio_cleanup(serio);
 
 	return 0;
 }
@@ -952,13 +948,17 @@ static int serio_resume(struct device *dev)
 	 * Driver reconnect can take a while, so better let kseriod
 	 * deal with it.
 	 */
-	if (serio->suspended) {
-		serio->suspended = false;
-		serio_queue_event(serio, NULL, SERIO_RECONNECT_PORT);
-	}
+	serio_queue_event(serio, NULL, SERIO_RECONNECT_PORT);
 
 	return 0;
 }
+
+static const struct dev_pm_ops serio_pm_ops = {
+	.suspend	= serio_suspend,
+	.resume		= serio_resume,
+	.poweroff	= serio_suspend,
+	.restore	= serio_resume,
+};
 #endif /* CONFIG_PM */
 
 /* called from serio_driver->connect/disconnect methods under serio_mutex */
@@ -1015,8 +1015,7 @@ static struct bus_type serio_bus = {
 	.remove		= serio_driver_remove,
 	.shutdown	= serio_shutdown,
 #ifdef CONFIG_PM
-	.suspend	= serio_suspend,
-	.resume		= serio_resume,
+	.pm		= &serio_pm_ops,
 #endif
 };
 
diff --git a/include/linux/serio.h b/include/linux/serio.h
index 126d24c9eaa8..a640bc2afe76 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -31,8 +31,6 @@ struct serio {
 
 	bool manual_bind;
 	bool registered;	/* port has been fully registered with driver core */
-	bool suspended;		/* port is suspended */
-
 
 	struct serio_device_id id;
 
-- 
cgit v1.2.3-71-gd317


From 0c193054a4c1cf190d2f23e5e91bd14402e43912 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Mon, 27 Jul 2009 19:09:19 -0400
Subject: nfsd41: hange from page to memory based drc limits

NFSD_SLOT_CACHE_SIZE is the size of all encoded operation responses
(excluding the sequence operation) that we want to cache.

For now, keep NFSD_SLOT_CACHE_SIZE at PAGE_SIZE. It will be reduced
when the DRC is changed from page based to memory based.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c        | 28 +++++++++++++---------------
 fs/nfsd/nfssvc.c           | 13 ++++++-------
 include/linux/nfsd/nfsd.h  |  4 ++--
 include/linux/nfsd/state.h |  1 +
 4 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 70cba3fbfa6d..e2b11b1b515c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -414,31 +414,31 @@ gen_sessionid(struct nfsd4_session *ses)
 
 /*
  * Give the client the number of slots it requests bound by
- * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
+ * NFSD_MAX_SLOTS_PER_SESSION and by nfsd_drc_max_mem.
  *
- * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
- * should (up to a point) re-negotiate active sessions and reduce their
- * slot usage to make rooom for new connections. For now we just fail the
- * create session.
+ * If we run out of reserved DRC memory we should (up to a point) re-negotiate
+ * active sessions and reduce their slot usage to make rooom for new
+ * connections. For now we just fail the create session.
  */
 static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
 {
-	int np;
+	int mem;
 
 	if (fchan->maxreqs < 1)
 		return nfserr_inval;
 	else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
 		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
 
-	np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+	mem = fchan->maxreqs * NFSD_SLOT_CACHE_SIZE;
 
 	spin_lock(&nfsd_drc_lock);
-	if (np + nfsd_drc_pages_used > nfsd_drc_max_pages)
-		np = nfsd_drc_max_pages - nfsd_drc_pages_used;
-	nfsd_drc_pages_used += np;
+	if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem)
+		mem = ((nfsd_drc_max_mem - nfsd_drc_mem_used) /
+				NFSD_SLOT_CACHE_SIZE) * NFSD_SLOT_CACHE_SIZE;
+	nfsd_drc_mem_used += mem;
 	spin_unlock(&nfsd_drc_lock);
 
-	fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
+	fchan->maxreqs = mem / NFSD_SLOT_CACHE_SIZE;
 	if (fchan->maxreqs == 0)
 		return nfserr_resource;
 	return 0;
@@ -465,9 +465,7 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
 		fchan->maxresp_sz = maxcount;
 	session_fchan->maxresp_sz = fchan->maxresp_sz;
 
-	/* Set the max response cached size our default which is
-	 * a multiple of PAGE_SIZE and small */
-	session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+	session_fchan->maxresp_cached = NFSD_SLOT_CACHE_SIZE;
 	fchan->maxresp_cached = session_fchan->maxresp_cached;
 
 	/* Use the client's maxops if possible */
@@ -585,7 +583,7 @@ free_session(struct kref *kref)
 		nfsd4_release_respages(e->ce_respages, e->ce_resused);
 	}
 	spin_lock(&nfsd_drc_lock);
-	nfsd_drc_pages_used -= ses->se_fchannel.maxreqs * NFSD_PAGES_PER_SLOT;
+	nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE;
 	spin_unlock(&nfsd_drc_lock);
 	kfree(ses);
 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9be2a1932f8a..5a280a9cb540 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -74,8 +74,8 @@ struct svc_serv 		*nfsd_serv;
  * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
  */
 spinlock_t	nfsd_drc_lock;
-unsigned int	nfsd_drc_max_pages;
-unsigned int	nfsd_drc_pages_used;
+unsigned int	nfsd_drc_max_mem;
+unsigned int	nfsd_drc_mem_used;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat	nfsd_acl_svcstats;
@@ -247,12 +247,11 @@ void nfsd_reset_versions(void)
 static void set_max_drc(void)
 {
 	#define NFSD_DRC_SIZE_SHIFT	10
-	nfsd_drc_max_pages = nr_free_buffer_pages()
-						>> NFSD_DRC_SIZE_SHIFT;
-	nfsd_drc_pages_used = 0;
+	nfsd_drc_max_mem = (nr_free_buffer_pages()
+					>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
+	nfsd_drc_mem_used = 0;
 	spin_lock_init(&nfsd_drc_lock);
-	dprintk("%s nfsd_drc_max_pages %u\n", __func__,
-		nfsd_drc_max_pages);
+	dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
 }
 
 int nfsd_create_serv(void)
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 2571f856908f..2812ed52669d 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -57,8 +57,8 @@ extern u32			nfsd_supported_minorversion;
 extern struct mutex		nfsd_mutex;
 extern struct svc_serv		*nfsd_serv;
 extern spinlock_t		nfsd_drc_lock;
-extern unsigned int		nfsd_drc_max_pages;
-extern unsigned int		nfsd_drc_pages_used;
+extern unsigned int		nfsd_drc_max_mem;
+extern unsigned int		nfsd_drc_mem_used;
 
 extern struct seq_operations nfs_exports_op;
 
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 57ab2ed08459..a6c87d623891 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -96,6 +96,7 @@ struct nfs4_cb_conn {
 #define NFSD_MAX_SLOTS_PER_SESSION	128
 /* Maximum number of pages per slot cache entry */
 #define NFSD_PAGES_PER_SLOT	1
+#define NFSD_SLOT_CACHE_SIZE		PAGE_SIZE
 /* Maximum number of operations per session compound */
 #define NFSD_MAX_OPS_PER_COMPOUND	16
 
-- 
cgit v1.2.3-71-gd317


From 49557cc74c7bdf6a984be227ead9a84b3a26f053 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 23 Jul 2009 19:02:16 -0400
Subject: nfsd41: Use separate DRC for setclientid

Instead of trying to share the generic 4.1 reply cache code for the
CREATE_SESSION reply cache, it's simpler to handle CREATE_SESSION
separately.

The nfs41 single slot clientid DRC holds the results of create session
processing.  CREATE_SESSION can be preceeded by a SEQUENCE operation
(an embedded CREATE_SESSION) and the create session single slot cache must be
maintained.  nfsd4_replay_cache_entry() and nfsd4_store_cache_entry() do not
implement the replay of an embedded CREATE_SESSION.

The clientid DRC slot does not need the inuse, cachethis or other fields that
the multiple slot session cache uses.  Replace the clientid DRC cache struct
nfs4_slot cache with a new nfsd4_clid_slot cache.  Save the xdr struct
nfsd4_create_session into the cache at the end of processing, and on a replay,
replace the struct for the replay request with the cached version all while
under the state lock.

nfsd4_proc_compound will handle both the solo and embedded CREATE_SESSION case
via the normal use of encode_operation.

Errors that do not change the create session cache:
A create session NFS4ERR_STALE_CLIENTID error means that a client record
(and associated create session slot) could not be found and therefore can't
be changed.  NFSERR_SEQ_MISORDERED errors do not change the slot cache.

All other errors get cached.

Remove the clientid DRC specific check in nfs4svc_encode_compoundres to
put the session only if cstate.session is set which will now always be true.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4proc.c         |  2 +-
 fs/nfsd/nfs4state.c        | 64 +++++++++++++++++++++++++++-------------------
 fs/nfsd/nfs4xdr.c          |  3 +--
 include/linux/nfsd/state.h | 21 ++++++++++++++-
 include/linux/nfsd/xdr4.h  | 12 ---------
 5 files changed, 60 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d781658e8084..d606c6a427de 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1120,7 +1120,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 			BUG_ON(op->status == nfs_ok);
 
 encode_op:
-		/* Only from SEQUENCE or CREATE_SESSION */
+		/* Only from SEQUENCE */
 		if (resp->cstate.status == nfserr_replay_cache) {
 			dprintk("%s NFS4.1 replay from cache\n", __func__);
 			if (nfsd4_not_cached(resp))
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 99df8e7a687b..7729d092c8a5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -653,8 +653,6 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
-	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
-			     clp->cl_slot.sl_cache_entry.ce_resused);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -1293,12 +1291,11 @@ out_copy:
 	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
 	exid->clientid.cl_id = new->cl_clientid.cl_id;
 
-	new->cl_slot.sl_seqid = 0;
 	exid->seqid = 1;
 	nfsd4_set_ex_flags(new, exid);
 
 	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
-		new->cl_slot.sl_seqid, new->cl_exchange_flags);
+		new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
 	status = nfs_ok;
 
 out:
@@ -1334,15 +1331,35 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
 	return nfserr_seq_misordered;
 }
 
+/*
+ * Cache the create session result into the create session single DRC
+ * slot cache by saving the xdr structure. sl_seqid has been set.
+ * Do this for solo or embedded create session operations.
+ */
+static void
+nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
+			   struct nfsd4_clid_slot *slot, int nfserr)
+{
+	slot->sl_status = nfserr;
+	memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
+}
+
+static __be32
+nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
+			    struct nfsd4_clid_slot *slot)
+{
+	memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
+	return slot->sl_status;
+}
+
 __be32
 nfsd4_create_session(struct svc_rqst *rqstp,
 		     struct nfsd4_compound_state *cstate,
 		     struct nfsd4_create_session *cr_ses)
 {
 	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
-	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfs4_client *conf, *unconf;
-	struct nfsd4_slot *slot = NULL;
+	struct nfsd4_clid_slot *cs_slot = NULL;
 	int status = 0;
 
 	nfs4_lock_state();
@@ -1350,25 +1367,22 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	conf = find_confirmed_client(&cr_ses->clientid);
 
 	if (conf) {
-		slot = &conf->cl_slot;
-		status = check_slot_seqid(cr_ses->seqid, slot->sl_seqid,
-					  slot->sl_inuse);
+		cs_slot = &conf->cl_cs_slot;
+		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
 		if (status == nfserr_replay_cache) {
 			dprintk("Got a create_session replay! seqid= %d\n",
-				slot->sl_seqid);
-			cstate->slot = slot;
-			cstate->status = status;
+				cs_slot->sl_seqid);
 			/* Return the cached reply status */
-			status = nfsd4_replay_cache_entry(resp, NULL);
+			status = nfsd4_replay_create_session(cr_ses, cs_slot);
 			goto out;
-		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+		} else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
 			status = nfserr_seq_misordered;
 			dprintk("Sequence misordered!\n");
 			dprintk("Expected seqid= %d but got seqid= %d\n",
-				slot->sl_seqid, cr_ses->seqid);
+				cs_slot->sl_seqid, cr_ses->seqid);
 			goto out;
 		}
-		conf->cl_slot.sl_seqid++;
+		cs_slot->sl_seqid++;
 	} else if (unconf) {
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 		    (ip_addr != unconf->cl_addr)) {
@@ -1376,16 +1390,15 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			goto out;
 		}
 
-		slot = &unconf->cl_slot;
-		status = check_slot_seqid(cr_ses->seqid, slot->sl_seqid,
-					  slot->sl_inuse);
+		cs_slot = &unconf->cl_cs_slot;
+		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
 		if (status) {
 			/* an unconfirmed replay returns misordered */
 			status = nfserr_seq_misordered;
-			goto out;
+			goto out_cache;
 		}
 
-		slot->sl_seqid++; /* from 0 to 1 */
+		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
 
 		/*
@@ -1406,12 +1419,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
-	cr_ses->seqid = slot->sl_seqid;
+	cr_ses->seqid = cs_slot->sl_seqid;
 
-	slot->sl_inuse = true;
-	cstate->slot = slot;
-	/* Ensure a page is used for the cache */
-	slot->sl_cache_entry.ce_cachethis = 1;
+out_cache:
+	/* cache solo and embedded create sessions under the state lock */
+	nfsd4_cache_create_session(cr_ses, cs_slot, status);
 out:
 	nfs4_unlock_state();
 	dprintk("%s returns %d\n", __func__, ntohl(status));
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2dcc7feaa6ff..fdf632bf1cfe 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3313,8 +3313,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
 			resp->cstate.slot->sl_inuse = 0;
 		}
-		if (resp->cstate.session)
-			nfsd4_put_session(resp->cstate.session);
+		nfsd4_put_session(resp->cstate.session);
 	}
 	return 1;
 }
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index a6c87d623891..58bb19784e12 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -127,6 +127,25 @@ struct nfsd4_channel_attrs {
 	u32		rdma_attrs;
 };
 
+struct nfsd4_create_session {
+	clientid_t			clientid;
+	struct nfs4_sessionid		sessionid;
+	u32				seqid;
+	u32				flags;
+	struct nfsd4_channel_attrs	fore_channel;
+	struct nfsd4_channel_attrs	back_channel;
+	u32				callback_prog;
+	u32				uid;
+	u32				gid;
+};
+
+/* The single slot clientid cache structure */
+struct nfsd4_clid_slot {
+	u32				sl_seqid;
+	__be32				sl_status;
+	struct nfsd4_create_session	sl_cr_ses;
+};
+
 struct nfsd4_session {
 	struct kref		se_ref;
 	struct list_head	se_hash;	/* hash by sessionid */
@@ -193,7 +212,7 @@ struct nfs4_client {
 
 	/* for nfs41 */
 	struct list_head	cl_sessions;
-	struct nfsd4_slot	cl_slot;	/* create_session slot */
+	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
 	struct nfs4_sessionid	cl_sessionid;
 };
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index 2bacf7535069..5e4beb0deb80 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -366,18 +366,6 @@ struct nfsd4_exchange_id {
 	int		spa_how;
 };
 
-struct nfsd4_create_session {
-	clientid_t		clientid;
-	struct nfs4_sessionid	sessionid;
-	u32			seqid;
-	u32			flags;
-	struct nfsd4_channel_attrs fore_channel;
-	struct nfsd4_channel_attrs back_channel;
-	u32			callback_prog;
-	u32			uid;
-	u32			gid;
-};
-
 struct nfsd4_sequence {
 	struct nfs4_sessionid	sessionid;		/* request/response */
 	u32			seqid;			/* request/response */
-- 
cgit v1.2.3-71-gd317


From e5f5ccb646bc6009572b5c23201b5e81638ff150 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Thu, 23 Jul 2009 20:35:53 +0200
Subject: power_supply: get_by_name and set_charged functionality

This adds a function that indicates that a battery is fully charged.
It also includes functions to get a power_supply device from the class
of registered devices by name reference. These can be used to find a
specific battery to call power_supply_set_battery_charged() on.

Some battery drivers might need this information to calibrate
themselves.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Cc: Ian Molton <spyro@f2s.com>
Cc: Anton Vorontsov <cbou@mail.ru>
Cc: Matt Reimer <mreimer@vpop.net>
Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
---
 drivers/power/power_supply_core.c | 28 ++++++++++++++++++++++++++++
 include/linux/power_supply.h      |  3 +++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c
index 12cd6e36ff1d..cce75b40b435 100644
--- a/drivers/power/power_supply_core.c
+++ b/drivers/power/power_supply_core.c
@@ -116,6 +116,34 @@ int power_supply_is_system_supplied(void)
 }
 EXPORT_SYMBOL_GPL(power_supply_is_system_supplied);
 
+int power_supply_set_battery_charged(struct power_supply *psy)
+{
+	if (psy->type == POWER_SUPPLY_TYPE_BATTERY && psy->set_charged) {
+		psy->set_charged(psy);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(power_supply_set_battery_charged);
+
+static int power_supply_match_device_by_name(struct device *dev, void *data)
+{
+	const char *name = data;
+	struct power_supply *psy = dev_get_drvdata(dev);
+
+	return strcmp(psy->name, name) == 0;
+}
+
+struct power_supply *power_supply_get_by_name(char *name)
+{
+	struct device *dev = class_find_device(power_supply_class, NULL, name,
+					power_supply_match_device_by_name);
+
+	return dev ? dev_get_drvdata(dev) : NULL;
+}
+EXPORT_SYMBOL_GPL(power_supply_get_by_name);
+
 int power_supply_register(struct device *parent, struct power_supply *psy)
 {
 	int rc = 0;
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 4c7c6fc35487..b5d096d3a9be 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -144,6 +144,7 @@ struct power_supply {
 			    enum power_supply_property psp,
 			    union power_supply_propval *val);
 	void (*external_power_changed)(struct power_supply *psy);
+	void (*set_charged)(struct power_supply *psy);
 
 	/* For APM emulation, think legacy userspace. */
 	int use_for_apm;
@@ -183,8 +184,10 @@ struct power_supply_info {
 	int use_for_apm;
 };
 
+extern struct power_supply *power_supply_get_by_name(char *name);
 extern void power_supply_changed(struct power_supply *psy);
 extern int power_supply_am_i_supplied(struct power_supply *psy);
+extern int power_supply_set_battery_charged(struct power_supply *psy);
 
 #if defined(CONFIG_POWER_SUPPLY) || defined(CONFIG_POWER_SUPPLY_MODULE)
 extern int power_supply_is_system_supplied(void);
-- 
cgit v1.2.3-71-gd317


From 78ddb2785980fc01b129c3547463266cae9c6ca9 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben@simtec.co.uk>
Date: Thu, 30 Jul 2009 23:23:34 +0100
Subject: ARM: PL093: Header file for PL093 SSMC PrimeCell

Header to define the standard registers for an
PL093 SSMC memory controller.

Signed-off-by: Ben Dooks <ben@simtec.co.uk>
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 include/linux/amba/pl093.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 include/linux/amba/pl093.h

(limited to 'include/linux')

diff --git a/include/linux/amba/pl093.h b/include/linux/amba/pl093.h
new file mode 100644
index 000000000000..2983e3671adb
--- /dev/null
+++ b/include/linux/amba/pl093.h
@@ -0,0 +1,80 @@
+/* linux/amba/pl093.h
+ *
+ * Copyright (c) 2008 Simtec Electronics
+ *	http://armlinux.simtec.co.uk/
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+ * AMBA PL093 SSMC (synchronous static memory controller)
+ *  See DDI0236.pdf (r0p4) for more details
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#define SMB_BANK(x)	((x) * 0x20) /* each bank control set is 0x20 apart */
+
+/* Offsets for SMBxxxxRy registers */
+
+#define SMBIDCYR	(0x00)
+#define SMBWSTRDR	(0x04)
+#define SMBWSTWRR	(0x08)
+#define SMBWSTOENR	(0x0C)
+#define SMBWSTWENR	(0x10)
+#define SMBCR		(0x14)
+#define SMBSR		(0x18)
+#define SMBWSTBRDR	(0x1C)
+
+/* Masks for SMB registers */
+#define IDCY_MASK	(0xf)
+#define WSTRD_MASK	(0xf)
+#define WSTWR_MASK	(0xf)
+#define WSTOEN_MASK	(0xf)
+#define WSTWEN_MASK	(0xf)
+
+/* Notes from datasheet:
+ *	WSTOEN <= WSTRD
+ *	WSTWEN <= WSTWR
+ *
+ * WSTOEN is not used with nWAIT
+ */
+
+/* SMBCR bit definitions */
+#define SMBCR_BIWRITEEN		(1 << 21)
+#define SMBCR_ADDRVALIDWRITEEN	(1 << 20)
+#define SMBCR_SYNCWRITE		(1 << 17)
+#define SMBCR_BMWRITE		(1 << 16)
+#define SMBCR_WRAPREAD		(1 << 14)
+#define SMBCR_BIREADEN		(1 << 13)
+#define SMBCR_ADDRVALIDREADEN	(1 << 12)
+#define SMBCR_SYNCREAD		(1 << 9)
+#define SMBCR_BMREAD		(1 << 8)
+#define SMBCR_SMBLSPOL		(1 << 6)
+#define SMBCR_WP		(1 << 3)
+#define SMBCR_WAITEN		(1 << 2)
+#define SMBCR_WAITPOL		(1 << 1)
+#define SMBCR_RBLE		(1 << 0)
+
+#define SMBCR_BURSTLENWRITE_MASK	(3 << 18)
+#define SMBCR_BURSTLENWRITE_4		(0 << 18)
+#define SMBCR_BURSTLENWRITE_8		(1 << 18)
+#define SMBCR_BURSTLENWRITE_RESERVED	(2 << 18)
+#define SMBCR_BURSTLENWRITE_CONTINUOUS	(3 << 18)
+
+#define SMBCR_BURSTLENREAD_MASK		(3 << 10)
+#define SMBCR_BURSTLENREAD_4		(0 << 10)
+#define SMBCR_BURSTLENREAD_8		(1 << 10)
+#define SMBCR_BURSTLENREAD_16		(2 << 10)
+#define SMBCR_BURSTLENREAD_CONTINUOUS	(3 << 10)
+
+#define SMBCR_MW_MASK			(3 << 4)
+#define SMBCR_MW_8BIT			(0 << 4)
+#define SMBCR_MW_16BIT			(1 << 4)
+#define SMBCR_MW_M32BIT			(2 << 4)
+
+/* SSMC status registers */
+#define SSMCCSR		(0x200)
+#define SSMCCR		(0x204)
+#define SSMCITCR	(0x208)
+#define SSMCITIP	(0x20C)
+#define SSMCITIOP	(0x210)
-- 
cgit v1.2.3-71-gd317


From ff663cf8705bea101d5f73cf471855c85242575e Mon Sep 17 00:00:00 2001
From: Zhenyu Wang <zhenyu.z.wang@intel.com>
Date: Thu, 23 Jul 2009 17:25:49 +0100
Subject: agp: Add generic support for graphics dma remapping

New driver hooks for support graphics memory dma remapping
are introduced in this patch. It makes generic code can
tell if current device needs dma remapping, then call driver
provided interfaces for mapping and unmapping. Change has
also been made to handle scratch_page in remapping case.

Signed-off-by: Zhenyu Wang <zhenyu.z.wang@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/char/agp/agp.h      |  6 ++++++
 drivers/char/agp/backend.c  | 20 ++++++++++++++++++++
 drivers/char/agp/generic.c  |  9 +++++++++
 include/linux/agp_backend.h |  6 +++++-
 4 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h
index ce110a3bf298..17e6d0d3ba36 100644
--- a/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@ -121,6 +121,11 @@ struct agp_bridge_driver {
 	void (*agp_destroy_pages)(struct agp_memory *);
 	int (*agp_type_to_mask_type) (struct agp_bridge_data *, int);
 	void (*chipset_flush)(struct agp_bridge_data *);
+
+	int (*agp_map_page)(void *addr, dma_addr_t *ret);
+	void (*agp_unmap_page)(void *addr, dma_addr_t dma);
+	int (*agp_map_memory)(struct agp_memory *mem);
+	void (*agp_unmap_memory)(struct agp_memory *mem);
 };
 
 struct agp_bridge_data {
@@ -135,6 +140,7 @@ struct agp_bridge_data {
 	u32 *gatt_table_real;
 	unsigned long scratch_page;
 	unsigned long scratch_page_real;
+	dma_addr_t scratch_page_dma;
 	unsigned long gart_bus_addr;
 	unsigned long gatt_bus_addr;
 	u32 mode;
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index 3bd7e503de41..19ac3663acdc 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -152,6 +152,15 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge)
 		bridge->scratch_page_real = phys_to_gart(page_to_phys(page));
 		bridge->scratch_page = bridge->driver->mask_memory(bridge,
 					   phys_to_gart(page_to_phys(page)), 0);
+
+		if (bridge->driver->agp_map_page &&
+		    bridge->driver->agp_map_page(phys_to_virt(page_to_phys(page)),
+						&bridge->scratch_page_dma)) {
+			dev_err(&bridge->dev->dev,
+				"unable to dma-map scratch page\n");
+			rc = -ENOMEM;
+			goto err_out_nounmap;
+		}
 	}
 
 	size_value = bridge->driver->fetch_size();
@@ -191,6 +200,13 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge)
 	return 0;
 
 err_out:
+	if (bridge->driver->needs_scratch_page &&
+	    bridge->driver->agp_unmap_page) {
+		void *va = gart_to_virt(bridge->scratch_page_real);
+
+		bridge->driver->agp_unmap_page(va, bridge->scratch_page_dma);
+	}
+err_out_nounmap:
 	if (bridge->driver->needs_scratch_page) {
 		void *va = gart_to_virt(bridge->scratch_page_real);
 
@@ -221,6 +237,10 @@ static void agp_backend_cleanup(struct agp_bridge_data *bridge)
 	    bridge->driver->needs_scratch_page) {
 		void *va = gart_to_virt(bridge->scratch_page_real);
 
+		if (bridge->driver->agp_unmap_page)
+			bridge->driver->agp_unmap_page(va,
+					       bridge->scratch_page_dma);
+
 		bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_UNMAP);
 		bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_FREE);
 	}
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index a3bcc7ef42f9..28f0208c66a6 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -437,6 +437,12 @@ int agp_bind_memory(struct agp_memory *curr, off_t pg_start)
 		curr->bridge->driver->cache_flush();
 		curr->is_flushed = true;
 	}
+
+	if (curr->bridge->driver->agp_map_memory) {
+		ret_val = curr->bridge->driver->agp_map_memory(curr);
+		if (ret_val)
+			return ret_val;
+	}
 	ret_val = curr->bridge->driver->insert_memory(curr, pg_start, curr->type);
 
 	if (ret_val != 0)
@@ -478,6 +484,9 @@ int agp_unbind_memory(struct agp_memory *curr)
 	if (ret_val != 0)
 		return ret_val;
 
+	if (curr->bridge->driver->agp_unmap_memory)
+		curr->bridge->driver->agp_unmap_memory(curr);
+
 	curr->is_bound = false;
 	curr->pg_start = 0;
 	spin_lock(&curr->bridge->mapped_lock);
diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h
index 76fa794fdac0..8a294d65b9b1 100644
--- a/include/linux/agp_backend.h
+++ b/include/linux/agp_backend.h
@@ -79,9 +79,13 @@ struct agp_memory {
 	u32 physical;
 	bool is_bound;
 	bool is_flushed;
-        bool vmalloc_flag;
+	bool vmalloc_flag;
+	bool sg_vmalloc_flag;
 	/* list of agp_memory mapped to the aperture */
 	struct list_head mapped_list;
+	/* DMA-mapped addresses */
+	struct scatterlist *sg_list;
+	int num_sg;
 };
 
 #define AGP_NORMAL_MEMORY 0
-- 
cgit v1.2.3-71-gd317


From f692775d7e0a22477143cd884e45c955448ac7d2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 29 Jul 2009 09:28:45 +0100
Subject: intel-agp: fix sglist allocation to avoid vmalloc()

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/char/agp/intel-agp.c | 29 ++++++++++-------------------
 include/linux/agp_backend.h  |  1 -
 2 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index b9d9886ff3c3..d8c80d8be5e2 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -198,39 +198,30 @@ static void intel_agp_unmap_page(struct page *page, dma_addr_t dma)
 
 static void intel_agp_free_sglist(struct agp_memory *mem)
 {
+	struct sg_table st;
+
+	st.sgl = mem->sg_list;
+	st.orig_nents = st.nents = mem->page_count;
+
+	sg_free_table(&st);
 
-	if (mem->sg_vmalloc_flag)
-		vfree(mem->sg_list);
-	else
-		kfree(mem->sg_list);
-	mem->sg_vmalloc_flag = 0;
 	mem->sg_list = NULL;
 	mem->num_sg = 0;
 }
 
 static int intel_agp_map_memory(struct agp_memory *mem)
 {
+	struct sg_table st;
 	struct scatterlist *sg;
 	int i;
 
 	DBG("try mapping %lu pages\n", (unsigned long)mem->page_count);
 
-	if ((mem->page_count * sizeof(*mem->sg_list)) < 2*PAGE_SIZE)
-		mem->sg_list = kcalloc(mem->page_count, sizeof(*mem->sg_list),
-				       GFP_KERNEL);
-
-	if (mem->sg_list == NULL) {
-		mem->sg_list = vmalloc(mem->page_count * sizeof(*mem->sg_list));
-		mem->sg_vmalloc_flag = 1;
-	}
-
-	if (!mem->sg_list) {
-		mem->sg_vmalloc_flag = 0;
+	if (sg_alloc_table(&st, mem->page_count, GFP_KERNEL))
 		return -ENOMEM;
-	}
-	sg_init_table(mem->sg_list, mem->page_count);
 
-	sg = mem->sg_list;
+	mem->sg_list = sg = st.sgl;
+
 	for (i = 0 ; i < mem->page_count; i++, sg = sg_next(sg))
 		sg_set_page(sg, mem->pages[i], PAGE_SIZE, 0);
 
diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h
index 8a294d65b9b1..880130f7311f 100644
--- a/include/linux/agp_backend.h
+++ b/include/linux/agp_backend.h
@@ -80,7 +80,6 @@ struct agp_memory {
 	bool is_bound;
 	bool is_flushed;
 	bool vmalloc_flag;
-	bool sg_vmalloc_flag;
 	/* list of agp_memory mapped to the aperture */
 	struct list_head mapped_list;
 	/* DMA-mapped addresses */
-- 
cgit v1.2.3-71-gd317


From 42c4ab41a176ee784c0f28c0b29025a8fc34f05a Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 29 Jul 2009 12:15:26 +0200
Subject: itimers: Merge ITIMER_VIRT and ITIMER_PROF

Both cpu itimers have same data flow in the few places, this
patch make unification of code related with VIRT and PROF
itimers.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
LKML-Reference: <1248862529-6063-2-git-send-email-sgruszka@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     |  14 ++++-
 kernel/fork.c             |   9 +--
 kernel/itimer.c           | 146 +++++++++++++++++++++-------------------------
 kernel/posix-cpu-timers.c |  98 +++++++++++++++----------------
 4 files changed, 130 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ab08e4bb6b8..3b3efaddd953 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -470,6 +470,11 @@ struct pacct_struct {
 	unsigned long		ac_minflt, ac_majflt;
 };
 
+struct cpu_itimer {
+	cputime_t expires;
+	cputime_t incr;
+};
+
 /**
  * struct task_cputime - collected CPU time counts
  * @utime:		time spent in user mode, in &cputime_t units
@@ -564,9 +569,12 @@ struct signal_struct {
 	struct pid *leader_pid;
 	ktime_t it_real_incr;
 
-	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
-	cputime_t it_prof_expires, it_virt_expires;
-	cputime_t it_prof_incr, it_virt_incr;
+	/*
+	 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
+	 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
+	 * values are defined to 0 and 1 respectively
+	 */
+	struct cpu_itimer it[2];
 
 	/*
 	 * Thread group totals for process CPU timers.
diff --git a/kernel/fork.c b/kernel/fork.c
index 29b532e718f7..893ab0bf5e39 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -62,6 +62,7 @@
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
 #include <linux/perf_counter.h>
+#include <linux/posix-timers.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -790,10 +791,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	thread_group_cputime_init(sig);
 
 	/* Expiration times and increments. */
-	sig->it_virt_expires = cputime_zero;
-	sig->it_virt_incr = cputime_zero;
-	sig->it_prof_expires = cputime_zero;
-	sig->it_prof_incr = cputime_zero;
+	sig->it[CPUCLOCK_PROF].expires = cputime_zero;
+	sig->it[CPUCLOCK_PROF].incr = cputime_zero;
+	sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
+	sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
 
 	/* Cached expiration times. */
 	sig->cputime_expires.prof_exp = cputime_zero;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..852c88ddd1f0 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -41,10 +41,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
 	return ktime_to_timeval(rem);
 }
 
+static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
+			   struct itimerval *value)
+{
+	cputime_t cval, cinterval;
+	struct cpu_itimer *it = &tsk->signal->it[clock_id];
+
+	spin_lock_irq(&tsk->sighand->siglock);
+
+	cval = it->expires;
+	cinterval = it->incr;
+	if (!cputime_eq(cval, cputime_zero)) {
+		struct task_cputime cputime;
+		cputime_t t;
+
+		thread_group_cputimer(tsk, &cputime);
+		if (clock_id == CPUCLOCK_PROF)
+			t = cputime_add(cputime.utime, cputime.stime);
+		else
+			/* CPUCLOCK_VIRT */
+			t = cputime.utime;
+
+		if (cputime_le(cval, t))
+			/* about to fire */
+			cval = jiffies_to_cputime(1);
+		else
+			cval = cputime_sub(cval, t);
+	}
+
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	cputime_to_timeval(cval, &value->it_value);
+	cputime_to_timeval(cinterval, &value->it_interval);
+}
+
 int do_getitimer(int which, struct itimerval *value)
 {
 	struct task_struct *tsk = current;
-	cputime_t cinterval, cval;
 
 	switch (which) {
 	case ITIMER_REAL:
@@ -55,44 +88,10 @@ int do_getitimer(int which, struct itimerval *value)
 		spin_unlock_irq(&tsk->sighand->siglock);
 		break;
 	case ITIMER_VIRTUAL:
-		spin_lock_irq(&tsk->sighand->siglock);
-		cval = tsk->signal->it_virt_expires;
-		cinterval = tsk->signal->it_virt_incr;
-		if (!cputime_eq(cval, cputime_zero)) {
-			struct task_cputime cputime;
-			cputime_t utime;
-
-			thread_group_cputimer(tsk, &cputime);
-			utime = cputime.utime;
-			if (cputime_le(cval, utime)) { /* about to fire */
-				cval = jiffies_to_cputime(1);
-			} else {
-				cval = cputime_sub(cval, utime);
-			}
-		}
-		spin_unlock_irq(&tsk->sighand->siglock);
-		cputime_to_timeval(cval, &value->it_value);
-		cputime_to_timeval(cinterval, &value->it_interval);
+		get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
 		break;
 	case ITIMER_PROF:
-		spin_lock_irq(&tsk->sighand->siglock);
-		cval = tsk->signal->it_prof_expires;
-		cinterval = tsk->signal->it_prof_incr;
-		if (!cputime_eq(cval, cputime_zero)) {
-			struct task_cputime times;
-			cputime_t ptime;
-
-			thread_group_cputimer(tsk, &times);
-			ptime = cputime_add(times.utime, times.stime);
-			if (cputime_le(cval, ptime)) { /* about to fire */
-				cval = jiffies_to_cputime(1);
-			} else {
-				cval = cputime_sub(cval, ptime);
-			}
-		}
-		spin_unlock_irq(&tsk->sighand->siglock);
-		cputime_to_timeval(cval, &value->it_value);
-		cputime_to_timeval(cinterval, &value->it_interval);
+		get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
 		break;
 	default:
 		return(-EINVAL);
@@ -128,6 +127,36 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
+			   struct itimerval *value, struct itimerval *ovalue)
+{
+	cputime_t cval, cinterval, nval, ninterval;
+	struct cpu_itimer *it = &tsk->signal->it[clock_id];
+
+	nval = timeval_to_cputime(&value->it_value);
+	ninterval = timeval_to_cputime(&value->it_interval);
+
+	spin_lock_irq(&tsk->sighand->siglock);
+
+	cval = it->expires;
+	cinterval = it->incr;
+	if (!cputime_eq(cval, cputime_zero) ||
+	    !cputime_eq(nval, cputime_zero)) {
+		if (cputime_gt(nval, cputime_zero))
+			nval = cputime_add(nval, jiffies_to_cputime(1));
+		set_process_cpu_timer(tsk, clock_id, &nval, &cval);
+	}
+	it->expires = nval;
+	it->incr = ninterval;
+
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	if (ovalue) {
+		cputime_to_timeval(cval, &ovalue->it_value);
+		cputime_to_timeval(cinterval, &ovalue->it_interval);
+	}
+}
+
 /*
  * Returns true if the timeval is in canonical form
  */
@@ -139,7 +168,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 	struct task_struct *tsk = current;
 	struct hrtimer *timer;
 	ktime_t expires;
-	cputime_t cval, cinterval, nval, ninterval;
 
 	/*
 	 * Validate the timevals in value.
@@ -174,48 +202,10 @@ again:
 		spin_unlock_irq(&tsk->sighand->siglock);
 		break;
 	case ITIMER_VIRTUAL:
-		nval = timeval_to_cputime(&value->it_value);
-		ninterval = timeval_to_cputime(&value->it_interval);
-		spin_lock_irq(&tsk->sighand->siglock);
-		cval = tsk->signal->it_virt_expires;
-		cinterval = tsk->signal->it_virt_incr;
-		if (!cputime_eq(cval, cputime_zero) ||
-		    !cputime_eq(nval, cputime_zero)) {
-			if (cputime_gt(nval, cputime_zero))
-				nval = cputime_add(nval,
-						   jiffies_to_cputime(1));
-			set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
-					      &nval, &cval);
-		}
-		tsk->signal->it_virt_expires = nval;
-		tsk->signal->it_virt_incr = ninterval;
-		spin_unlock_irq(&tsk->sighand->siglock);
-		if (ovalue) {
-			cputime_to_timeval(cval, &ovalue->it_value);
-			cputime_to_timeval(cinterval, &ovalue->it_interval);
-		}
+		set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
 		break;
 	case ITIMER_PROF:
-		nval = timeval_to_cputime(&value->it_value);
-		ninterval = timeval_to_cputime(&value->it_interval);
-		spin_lock_irq(&tsk->sighand->siglock);
-		cval = tsk->signal->it_prof_expires;
-		cinterval = tsk->signal->it_prof_incr;
-		if (!cputime_eq(cval, cputime_zero) ||
-		    !cputime_eq(nval, cputime_zero)) {
-			if (cputime_gt(nval, cputime_zero))
-				nval = cputime_add(nval,
-						   jiffies_to_cputime(1));
-			set_process_cpu_timer(tsk, CPUCLOCK_PROF,
-					      &nval, &cval);
-		}
-		tsk->signal->it_prof_expires = nval;
-		tsk->signal->it_prof_incr = ninterval;
-		spin_unlock_irq(&tsk->sighand->siglock);
-		if (ovalue) {
-			cputime_to_timeval(cval, &ovalue->it_value);
-			cputime_to_timeval(cinterval, &ovalue->it_interval);
-		}
+		set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
 		break;
 	default:
 		return -EINVAL;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..9b2d5e4dc8c4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -14,11 +14,11 @@
  */
 void update_rlimit_cpu(unsigned long rlim_new)
 {
-	cputime_t cputime;
+	cputime_t cputime = secs_to_cputime(rlim_new);
+	struct signal_struct *const sig = current->signal;
 
-	cputime = secs_to_cputime(rlim_new);
-	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
-	    cputime_gt(current->signal->it_prof_expires, cputime)) {
+	if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
+	    cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
 		spin_lock_irq(&current->sighand->siglock);
 		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 		spin_unlock_irq(&current->sighand->siglock);
@@ -613,6 +613,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 				break;
 			}
 		} else {
+			struct signal_struct *const sig = p->signal;
+			union cpu_time_count *exp = &timer->it.cpu.expires;
+
 			/*
 			 * For a process timer, set the cached expiration time.
 			 */
@@ -620,30 +623,27 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 			default:
 				BUG();
 			case CPUCLOCK_VIRT:
-				if (!cputime_eq(p->signal->it_virt_expires,
+				if (!cputime_eq(sig->it[CPUCLOCK_VIRT].expires,
 						cputime_zero) &&
-				    cputime_lt(p->signal->it_virt_expires,
-					       timer->it.cpu.expires.cpu))
+				    cputime_lt(sig->it[CPUCLOCK_VIRT].expires,
+					       exp->cpu))
 					break;
-				p->signal->cputime_expires.virt_exp =
-					timer->it.cpu.expires.cpu;
+				sig->cputime_expires.virt_exp = exp->cpu;
 				break;
 			case CPUCLOCK_PROF:
-				if (!cputime_eq(p->signal->it_prof_expires,
+				if (!cputime_eq(sig->it[CPUCLOCK_PROF].expires,
 						cputime_zero) &&
-				    cputime_lt(p->signal->it_prof_expires,
-					       timer->it.cpu.expires.cpu))
+				    cputime_lt(sig->it[CPUCLOCK_PROF].expires,
+					       exp->cpu))
 					break;
-				i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
+				i = sig->rlim[RLIMIT_CPU].rlim_cur;
 				if (i != RLIM_INFINITY &&
-				    i <= cputime_to_secs(timer->it.cpu.expires.cpu))
+				    i <= cputime_to_secs(exp->cpu))
 					break;
-				p->signal->cputime_expires.prof_exp =
-					timer->it.cpu.expires.cpu;
+				sig->cputime_expires.prof_exp = exp->cpu;
 				break;
 			case CPUCLOCK_SCHED:
-				p->signal->cputime_expires.sched_exp =
-					timer->it.cpu.expires.sched;
+				sig->cputime_expires.sched_exp = exp->sched;
 				break;
 			}
 		}
@@ -1070,6 +1070,27 @@ static void stop_process_timers(struct task_struct *tsk)
 	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
+static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
+			     cputime_t *expires, cputime_t cur_time, int signo)
+{
+	if (cputime_eq(it->expires, cputime_zero))
+		return;
+
+	if (cputime_ge(cur_time, it->expires)) {
+		it->expires = it->incr;
+		if (!cputime_eq(it->expires, cputime_zero))
+			it->expires = cputime_add(it->expires, cur_time);
+
+		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
+	}
+
+	if (!cputime_eq(it->expires, cputime_zero) &&
+	    (cputime_eq(*expires, cputime_zero) ||
+	     cputime_lt(it->expires, *expires))) {
+		*expires = it->expires;
+	}
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them
  * off the tsk->*_timers list onto the firing list.  Per-thread timers
@@ -1089,10 +1110,10 @@ static void check_process_timers(struct task_struct *tsk,
 	 * Don't sample the current process CPU clocks if there are no timers.
 	 */
 	if (list_empty(&timers[CPUCLOCK_PROF]) &&
-	    cputime_eq(sig->it_prof_expires, cputime_zero) &&
+	    cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
 	    sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
 	    list_empty(&timers[CPUCLOCK_VIRT]) &&
-	    cputime_eq(sig->it_virt_expires, cputime_zero) &&
+	    cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
 	    list_empty(&timers[CPUCLOCK_SCHED])) {
 		stop_process_timers(tsk);
 		return;
@@ -1152,38 +1173,11 @@ static void check_process_timers(struct task_struct *tsk,
 	/*
 	 * Check for the special case process timers.
 	 */
-	if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
-		if (cputime_ge(ptime, sig->it_prof_expires)) {
-			/* ITIMER_PROF fires and reloads.  */
-			sig->it_prof_expires = sig->it_prof_incr;
-			if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
-				sig->it_prof_expires = cputime_add(
-					sig->it_prof_expires, ptime);
-			}
-			__group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
-		}
-		if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
-		    (cputime_eq(prof_expires, cputime_zero) ||
-		     cputime_lt(sig->it_prof_expires, prof_expires))) {
-			prof_expires = sig->it_prof_expires;
-		}
-	}
-	if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
-		if (cputime_ge(utime, sig->it_virt_expires)) {
-			/* ITIMER_VIRTUAL fires and reloads.  */
-			sig->it_virt_expires = sig->it_virt_incr;
-			if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
-				sig->it_virt_expires = cputime_add(
-					sig->it_virt_expires, utime);
-			}
-			__group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
-		}
-		if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
-		    (cputime_eq(virt_expires, cputime_zero) ||
-		     cputime_lt(sig->it_virt_expires, virt_expires))) {
-			virt_expires = sig->it_virt_expires;
-		}
-	}
+	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
+			 SIGPROF);
+	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
+			 SIGVTALRM);
+
 	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
 		unsigned long psecs = cputime_to_secs(ptime);
 		cputime_t x;
-- 
cgit v1.2.3-71-gd317


From 8356b5f9c424e5831715abbce747197c30d1fd71 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 29 Jul 2009 12:15:27 +0200
Subject: itimers: Fix periodic tics precision

Measure ITIMER_PROF and ITIMER_VIRT timers interval error
between real ticks and requested by user. Take it into account
when scheduling next tick.

This patch introduce possibility where time between two
consecutive tics is smaller then requested interval, it
preserve however dependency that n tick is generated not
earlier than n*interval time - counting from the beginning of
periodic signal generation.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
LKML-Reference: <1248862529-6063-3-git-send-email-sgruszka@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     |  2 ++
 kernel/itimer.c           | 24 +++++++++++++++++++++---
 kernel/posix-cpu-timers.c | 20 +++++++++++++++++---
 3 files changed, 40 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b3efaddd953..a069e65e8bb7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -473,6 +473,8 @@ struct pacct_struct {
 struct cpu_itimer {
 	cputime_t expires;
 	cputime_t incr;
+	u32 error;
+	u32 incr_error;
 };
 
 /**
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 852c88ddd1f0..21adff7b2a17 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -42,7 +42,7 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
 }
 
 static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
-			   struct itimerval *value)
+			   struct itimerval *const value)
 {
 	cputime_t cval, cinterval;
 	struct cpu_itimer *it = &tsk->signal->it[clock_id];
@@ -127,14 +127,32 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
+{
+	struct timespec ts;
+	s64 cpu_ns;
+
+	cputime_to_timespec(ct, &ts);
+	cpu_ns = timespec_to_ns(&ts);
+
+	return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
+}
+
 static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
-			   struct itimerval *value, struct itimerval *ovalue)
+			   const struct itimerval *const value,
+			   struct itimerval *const ovalue)
 {
-	cputime_t cval, cinterval, nval, ninterval;
+	cputime_t cval, nval, cinterval, ninterval;
+	s64 ns_ninterval, ns_nval;
 	struct cpu_itimer *it = &tsk->signal->it[clock_id];
 
 	nval = timeval_to_cputime(&value->it_value);
+	ns_nval = timeval_to_ns(&value->it_value);
 	ninterval = timeval_to_cputime(&value->it_interval);
+	ns_ninterval = timeval_to_ns(&value->it_interval);
+
+	it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+	it->error = cputime_sub_ns(nval, ns_nval);
 
 	spin_lock_irq(&tsk->sighand->siglock);
 
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9b2d5e4dc8c4..b60d644ea4b3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1070,6 +1070,8 @@ static void stop_process_timers(struct task_struct *tsk)
 	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
+static u32 onecputick;
+
 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 			     cputime_t *expires, cputime_t cur_time, int signo)
 {
@@ -1077,9 +1079,16 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 		return;
 
 	if (cputime_ge(cur_time, it->expires)) {
-		it->expires = it->incr;
-		if (!cputime_eq(it->expires, cputime_zero))
-			it->expires = cputime_add(it->expires, cur_time);
+		if (!cputime_eq(it->incr, cputime_zero)) {
+			it->expires = cputime_add(it->expires, it->incr);
+			it->error += it->incr_error;
+			if (it->error >= onecputick) {
+				it->expires = cputime_sub(it->expires,
+							jiffies_to_cputime(1));
+				it->error -= onecputick;
+			}
+		} else
+			it->expires = cputime_zero;
 
 		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
 	}
@@ -1696,10 +1705,15 @@ static __init int init_posix_cpu_timers(void)
 		.nsleep = thread_cpu_nsleep,
 		.nsleep_restart = thread_cpu_nsleep_restart,
 	};
+	struct timespec ts;
 
 	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
 	register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
 
+	cputime_to_timespec(jiffies_to_cputime(1), &ts);
+	onecputick = ts.tv_nsec;
+	WARN_ON(ts.tv_sec != 0);
+
 	return 0;
 }
 __initcall(init_posix_cpu_timers);
-- 
cgit v1.2.3-71-gd317


From bbff2e433e80fae72c8d00d482927d52ec19ba33 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 6 Aug 2009 11:36:25 +0300
Subject: slab: remove duplicate kmem_cache_init_late() declarations

kmem_cache_init_late() has been declared in slab.h

CC: Nick Piggin <npiggin@suse.de>
CC: Matt Mackall <mpm@selenic.com>
CC: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/slob_def.h | 5 -----
 include/linux/slub_def.h | 2 --
 mm/slob.c                | 5 +++++
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index bb5368df4be8..0ec00b39d006 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -34,9 +34,4 @@ static __always_inline void *__kmalloc(size_t size, gfp_t flags)
 	return kmalloc(size, flags);
 }
 
-static inline void kmem_cache_init_late(void)
-{
-	/* Nothing to do */
-}
-
 #endif /* __LINUX_SLOB_DEF_H */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index c1c862b1d01a..76c831693616 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -304,6 +304,4 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 }
 #endif
 
-void __init kmem_cache_init_late(void);
-
 #endif /* _LINUX_SLUB_DEF_H */
diff --git a/mm/slob.c b/mm/slob.c
index 9641da3d5e58..837ebd64cc34 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -692,3 +692,8 @@ void __init kmem_cache_init(void)
 {
 	slob_ready = 1;
 }
+
+void __init kmem_cache_init_late(void)
+{
+	/* Nothing to do */
+}
-- 
cgit v1.2.3-71-gd317


From fa56d4cb4022c8b313c3b99236e1b87effc3655b Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 23 Jun 2009 11:29:11 +0000
Subject: ide: allow ide_dev_read_id() to be called from the IRQ context

* Un-static __ide_wait_stat().

* Allow ide_dev_read_id() helper to be called from the IRQ context by
  adding irq_ctx flag and using mdelay()/__ide_wait_stat() when needed.

* Switch ide_driveid_update() to set irq_ctx flag.

This change is needed for the consecutive patch which fixes races in
handling of user-space SET XFER commands but for improved bisectability
and clarity it is better to do it in a separate patch.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ide/ide-iops.c  |  6 +++---
 drivers/ide/ide-probe.c | 31 +++++++++++++++++++++----------
 include/linux/ide.h     |  3 ++-
 3 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 2892b242bbe1..b99873845d21 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -102,8 +102,8 @@ EXPORT_SYMBOL(ide_fixstring);
  * setting a timer to wake up at half second intervals thereafter,
  * until timeout is achieved, before timing out.
  */
-static int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
-			   unsigned long timeout, u8 *rstat)
+int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
+		    unsigned long timeout, u8 *rstat)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
@@ -316,7 +316,7 @@ int ide_driveid_update(ide_drive_t *drive)
 		return 0;
 
 	SELECT_MASK(drive, 1);
-	rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id);
+	rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id, 1);
 	SELECT_MASK(drive, 0);
 
 	if (rc)
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 1bb106f6221a..8de442cbee94 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -238,6 +238,7 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
  *	@drive: drive to identify
  *	@cmd: command to use
  *	@id: buffer for IDENTIFY data
+ *	@irq_ctx: flag set when called from the IRQ context
  *
  *	Sends an ATA(PI) IDENTIFY request to a drive and waits for a response.
  *
@@ -246,7 +247,7 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
  *			2  device aborted the command (refused to identify itself)
  */
 
-int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id)
+int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id, int irq_ctx)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct ide_io_ports *io_ports = &hwif->io_ports;
@@ -263,7 +264,10 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id)
 		tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS);
 
 	/* take a deep breath */
-	msleep(50);
+	if (irq_ctx)
+		mdelay(50);
+	else
+		msleep(50);
 
 	if (io_ports->ctl_addr &&
 	    (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) {
@@ -295,12 +299,19 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id)
 
 	timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2;
 
-	if (ide_busy_sleep(drive, timeout, use_altstatus))
-		return 1;
-
 	/* wait for IRQ and ATA_DRQ */
-	msleep(50);
-	s = tp_ops->read_status(hwif);
+	if (irq_ctx) {
+		rc = __ide_wait_stat(drive, ATA_DRQ, BAD_R_STAT, timeout, &s);
+		if (rc)
+			return 1;
+	} else {
+		rc = ide_busy_sleep(drive, timeout, use_altstatus);
+		if (rc)
+			return 1;
+
+		msleep(50);
+		s = tp_ops->read_status(hwif);
+	}
 
 	if (OK_STAT(s, ATA_DRQ, BAD_R_STAT)) {
 		/* drive returned ID */
@@ -406,10 +417,10 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 
 	if (OK_STAT(stat, ATA_DRDY, ATA_BUSY) ||
 	    present || cmd == ATA_CMD_ID_ATAPI) {
-		rc = ide_dev_read_id(drive, cmd, id);
+		rc = ide_dev_read_id(drive, cmd, id, 0);
 		if (rc)
 			/* failed: try again */
-			rc = ide_dev_read_id(drive, cmd, id);
+			rc = ide_dev_read_id(drive, cmd, id, 0);
 
 		stat = tp_ops->read_status(hwif);
 
@@ -424,7 +435,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 			msleep(50);
 			tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
 			(void)ide_busy_sleep(drive, WAIT_WORSTCASE, 0);
-			rc = ide_dev_read_id(drive, cmd, id);
+			rc = ide_dev_read_id(drive, cmd, id, 0);
 		}
 
 		/* ensure drive IRQ is clear */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index edc93a6d931d..cb6cd0459a5e 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1081,6 +1081,7 @@ extern void ide_fixstring(u8 *, const int, const int);
 
 int ide_busy_sleep(ide_drive_t *, unsigned long, int);
 
+int __ide_wait_stat(ide_drive_t *, u8, u8, unsigned long, u8 *);
 int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long);
 
 ide_startstop_t ide_do_park_unpark(ide_drive_t *, struct request *);
@@ -1169,7 +1170,7 @@ int ide_no_data_taskfile(ide_drive_t *, struct ide_cmd *);
 
 int ide_taskfile_ioctl(ide_drive_t *, unsigned long);
 
-int ide_dev_read_id(ide_drive_t *, u8, u16 *);
+int ide_dev_read_id(ide_drive_t *, u8, u16 *, int);
 
 extern int ide_driveid_update(ide_drive_t *);
 extern int ide_config_drive_speed(ide_drive_t *, u8);
-- 
cgit v1.2.3-71-gd317


From 665d66e8fad60a5a162c4615f27f916ad1a6d567 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 23 Jun 2009 11:35:51 +0000
Subject: ide: fix races in handling of user-space SET XFER commands

* Make cmd->tf_flags field 'u16' and add IDE_TFLAG_SET_XFER taskfile flag.

* Update ide_finish_cmd() to set xfer / re-read id if the new flag is set.

* Convert set_xfer_rate() (write handler for /proc/ide/hd?/current_speed)
  and ide_cmd_ioctl() (HDIO_DRIVE_CMD ioctl handler) to use the new flag.

* Remove no longer needed disable_irq_nosync() + enable_irq() from
  ide_config_drive_speed().

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ide/ide-ioctls.c   |  8 ++------
 drivers/ide/ide-iops.c     | 10 ----------
 drivers/ide/ide-proc.c     | 10 ++--------
 drivers/ide/ide-taskfile.c |  9 ++++++++-
 include/linux/ide.h        |  3 ++-
 5 files changed, 14 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index e246d3d3fbcc..d3440b5010a5 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -167,6 +167,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 			err = -EINVAL;
 			goto abort;
 		}
+
+		cmd.tf_flags |= IDE_TFLAG_SET_XFER;
 	}
 
 	err = ide_raw_taskfile(drive, &cmd, buf, args[3]);
@@ -174,12 +176,6 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 	args[0] = tf->status;
 	args[1] = tf->error;
 	args[2] = tf->nsect;
-
-	if (!err && xfer_rate) {
-		/* active-retuning-calls future */
-		ide_set_xfer_rate(drive, xfer_rate);
-		ide_driveid_update(drive);
-	}
 abort:
 	if (copy_to_user((void __user *)arg, &args, 4))
 		err = -EFAULT;
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index b99873845d21..b14fa9a87c49 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -363,14 +363,6 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 	 * this point (lost interrupt).
 	 */
 
-	/*
-	 *	FIXME: we race against the running IRQ here if
-	 *	this is called from non IRQ context. If we use
-	 *	disable_irq() we hang on the error path. Work
-	 *	is needed.
-	 */
-	disable_irq_nosync(hwif->irq);
-
 	udelay(1);
 	tp_ops->dev_select(drive);
 	SELECT_MASK(drive, 1);
@@ -394,8 +386,6 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 
 	SELECT_MASK(drive, 0);
 
-	enable_irq(hwif->irq);
-
 	if (error) {
 		(void) ide_dump_status(drive, "set_drive_speed_status", stat);
 		return error;
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index 3242698832a4..021de41655e6 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -195,7 +195,6 @@ ide_devset_get(xfer_rate, current_speed);
 static int set_xfer_rate (ide_drive_t *drive, int arg)
 {
 	struct ide_cmd cmd;
-	int err;
 
 	if (arg < XFER_PIO_0 || arg > XFER_UDMA_6)
 		return -EINVAL;
@@ -206,14 +205,9 @@ static int set_xfer_rate (ide_drive_t *drive, int arg)
 	cmd.tf.nsect   = (u8)arg;
 	cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT;
 	cmd.valid.in.tf  = IDE_VALID_NSECT;
+	cmd.tf_flags   = IDE_TFLAG_SET_XFER;
 
-	err = ide_no_data_taskfile(drive, &cmd);
-
-	if (!err) {
-		ide_set_xfer_rate(drive, (u8) arg);
-		ide_driveid_update(drive);
-	}
-	return err;
+	return ide_no_data_taskfile(drive, &cmd);
 }
 
 ide_devset_rw(current_speed, xfer_rate);
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 50336d51eebc..cc8633cbe133 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -324,10 +324,17 @@ static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
 void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat)
 {
 	struct request *rq = drive->hwif->rq;
-	u8 err = ide_read_error(drive);
+	u8 err = ide_read_error(drive), nsect = cmd->tf.nsect;
+	u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER);
 
 	ide_complete_cmd(drive, cmd, stat, err);
 	rq->errors = err;
+
+	if (err == 0 && set_xfer) {
+		ide_set_xfer_rate(drive, nsect);
+		ide_driveid_update(drive);
+	}
+
 	ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq));
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index cb6cd0459a5e..803c1ae31237 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -258,6 +258,7 @@ enum {
 	IDE_TFLAG_DYN			= (1 << 5),
 	IDE_TFLAG_FS			= (1 << 6),
 	IDE_TFLAG_MULTI_PIO		= (1 << 7),
+	IDE_TFLAG_SET_XFER		= (1 << 8),
 };
 
 enum {
@@ -294,7 +295,7 @@ struct ide_cmd {
 		} out, in;
 	} valid;
 
-	u8			tf_flags;
+	u16			tf_flags;
 	u8			ftf_flags;	/* for TASKFILE ioctl */
 	int			protocol;
 
-- 
cgit v1.2.3-71-gd317


From 25a70e38cd57952b09a013bf070e03705ee4f2ff Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Wed, 12 Aug 2009 00:50:09 -0700
Subject: Input: eeti_ts - allow active high irq lines

This adds a struct eeti_ts_platform_data which currently holds only one
value to specify the interrupt polarity.

The driver has a fallback if no platform data is passed in via the
i2c_board_info, so no regression is caused.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/eeti_ts.c | 22 +++++++++++++++++++---
 include/linux/input/eeti_ts.h       |  9 +++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/input/eeti_ts.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/eeti_ts.c b/drivers/input/touchscreen/eeti_ts.c
index 3ab92222a525..9029bd3f34e5 100644
--- a/drivers/input/touchscreen/eeti_ts.c
+++ b/drivers/input/touchscreen/eeti_ts.c
@@ -32,6 +32,7 @@
 #include <linux/i2c.h>
 #include <linux/timer.h>
 #include <linux/gpio.h>
+#include <linux/input/eeti_ts.h>
 
 static int flip_x;
 module_param(flip_x, bool, 0644);
@@ -46,7 +47,7 @@ struct eeti_ts_priv {
 	struct input_dev *input;
 	struct work_struct work;
 	struct mutex mutex;
-	int irq;
+	int irq, irq_active_high;
 };
 
 #define EETI_TS_BITDEPTH	(11)
@@ -58,6 +59,11 @@ struct eeti_ts_priv {
 #define REPORT_BIT_HAS_PRESSURE	(1 << 6)
 #define REPORT_RES_BITS(v)	(((v) >> 1) + EETI_TS_BITDEPTH)
 
+static inline int eeti_ts_irq_active(struct eeti_ts_priv *priv)
+{
+	return gpio_get_value(irq_to_gpio(priv->irq)) == priv->irq_active_high;
+}
+
 static void eeti_ts_read(struct work_struct *work)
 {
 	char buf[6];
@@ -67,7 +73,7 @@ static void eeti_ts_read(struct work_struct *work)
 
 	mutex_lock(&priv->mutex);
 
-	while (!gpio_get_value(irq_to_gpio(priv->irq)) && --to)
+	while (eeti_ts_irq_active(priv) && --to)
 		i2c_master_recv(priv->client, buf, sizeof(buf));
 
 	if (!to) {
@@ -140,8 +146,10 @@ static void eeti_ts_close(struct input_dev *dev)
 static int __devinit eeti_ts_probe(struct i2c_client *client,
 				   const struct i2c_device_id *idp)
 {
+	struct eeti_ts_platform_data *pdata;
 	struct eeti_ts_priv *priv;
 	struct input_dev *input;
+	unsigned int irq_flags;
 	int err = -ENOMEM;
 
 	/* In contrast to what's described in the datasheet, there seems
@@ -180,6 +188,14 @@ static int __devinit eeti_ts_probe(struct i2c_client *client,
 	priv->input = input;
 	priv->irq = client->irq;
 
+	pdata = client->dev.platform_data;
+
+	if (pdata)
+		priv->irq_active_high = pdata->irq_active_high;
+
+	irq_flags = priv->irq_active_high ?
+		IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING;
+
 	INIT_WORK(&priv->work, eeti_ts_read);
 	i2c_set_clientdata(client, priv);
 	input_set_drvdata(input, priv);
@@ -188,7 +204,7 @@ static int __devinit eeti_ts_probe(struct i2c_client *client,
 	if (err)
 		goto err1;
 
-	err = request_irq(priv->irq, eeti_ts_isr, IRQF_TRIGGER_FALLING,
+	err = request_irq(priv->irq, eeti_ts_isr, irq_flags,
 			  client->name, priv);
 	if (err) {
 		dev_err(&client->dev, "Unable to request touchscreen IRQ.\n");
diff --git a/include/linux/input/eeti_ts.h b/include/linux/input/eeti_ts.h
new file mode 100644
index 000000000000..f875b316249d
--- /dev/null
+++ b/include/linux/input/eeti_ts.h
@@ -0,0 +1,9 @@
+#ifndef LINUX_INPUT_EETI_TS_H
+#define LINUX_INPUT_EETI_TS_H
+
+struct eeti_ts_platform_data {
+	unsigned int irq_active_high;
+};
+
+#endif /* LINUX_INPUT_EETI_TS_H */
+
-- 
cgit v1.2.3-71-gd317


From 00ae4064b1445524752575dd84df227c0687c99d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:49 +0900
Subject: percpu: rename 4k first chunk allocator to page

Page size isn't always 4k depending on arch and configuration.  Rename
4k first chunk allocator to page.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Howells <dhowells@redhat.com>
---
 Documentation/kernel-parameters.txt |  2 +-
 arch/x86/kernel/setup_percpu.c      | 23 ++++++++++++-----------
 include/linux/percpu.h              |  2 +-
 mm/percpu.c                         | 25 ++++++++++++++-----------
 4 files changed, 28 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 7936b801fe6a..12e9eb77ee0d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1920,7 +1920,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			See arch/parisc/kernel/pdc_chassis.c
 
 	percpu_alloc=	[X86] Select which percpu first chunk allocator to use.
-			Allowed values are one of "lpage", "embed" and "4k".
+			Allowed values are one of "lpage", "embed" and "page".
 			See comments in arch/x86/kernel/setup_percpu.c for
 			details on each allocator.  This parameter is primarily
 			for debugging and performance comparison.
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a26ff61e2fb0..1e17711c29d6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -249,21 +249,22 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
 }
 
 /*
- * 4k allocator
+ * Page allocator
  *
- * Boring fallback 4k allocator.  This allocator puts more pressure on
- * PTE TLBs but other than that behaves nicely on both UMA and NUMA.
+ * Boring fallback 4k page allocator.  This allocator puts more
+ * pressure on PTE TLBs but other than that behaves nicely on both UMA
+ * and NUMA.
  */
-static void __init pcpu4k_populate_pte(unsigned long addr)
+static void __init pcpup_populate_pte(unsigned long addr)
 {
 	populate_extra_pte(addr);
 }
 
-static ssize_t __init setup_pcpu_4k(size_t static_size)
+static ssize_t __init setup_pcpu_page(size_t static_size)
 {
-	return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
-				   pcpu_fc_alloc, pcpu_fc_free,
-				   pcpu4k_populate_pte);
+	return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				     pcpu_fc_alloc, pcpu_fc_free,
+				     pcpup_populate_pte);
 }
 
 /* for explicit first chunk allocator selection */
@@ -307,7 +308,7 @@ void __init setup_per_cpu_areas(void)
 	 */
 	ret = -EINVAL;
 	if (strlen(pcpu_chosen_alloc)) {
-		if (strcmp(pcpu_chosen_alloc, "4k")) {
+		if (strcmp(pcpu_chosen_alloc, "page")) {
 			if (!strcmp(pcpu_chosen_alloc, "lpage"))
 				ret = setup_pcpu_lpage(static_size, true);
 			else if (!strcmp(pcpu_chosen_alloc, "embed"))
@@ -317,7 +318,7 @@ void __init setup_per_cpu_areas(void)
 					   "specified\n", pcpu_chosen_alloc);
 			if (ret < 0)
 				pr_warning("PERCPU: %s allocator failed (%zd), "
-					   "falling back to 4k\n",
+					   "falling back to page size\n",
 					   pcpu_chosen_alloc, ret);
 		}
 	} else {
@@ -326,7 +327,7 @@ void __init setup_per_cpu_areas(void)
 			ret = setup_pcpu_embed(static_size, false);
 	}
 	if (ret < 0)
-		ret = setup_pcpu_4k(static_size);
+		ret = setup_pcpu_page(static_size);
 	if (ret < 0)
 		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
 		      static_size, ret);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index e134c8229631..7989f61b03f3 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -74,7 +74,7 @@ extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size);
 
-extern ssize_t __init pcpu_4k_first_chunk(
+extern ssize_t __init pcpu_page_first_chunk(
 				size_t static_size, size_t reserved_size,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
diff --git a/mm/percpu.c b/mm/percpu.c
index cbddcbdab681..6feac7934904 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1497,15 +1497,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 }
 
 /**
- * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
+ * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
  * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
  * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
  * @populate_pte_fn: function to populate pte
  *
- * This is a helper to ease setting up embedded first percpu chunk and
- * can be called where pcpu_setup_first_chunk() is expected.
+ * This is a helper to ease setting up page-remapped first percpu
+ * chunk and can be called where pcpu_setup_first_chunk() is expected.
  *
  * This is the basic allocator.  Static percpu area is allocated
  * page-by-page into vmalloc area.
@@ -1514,12 +1514,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
-ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
-				   pcpu_fc_alloc_fn_t alloc_fn,
-				   pcpu_fc_free_fn_t free_fn,
-				   pcpu_fc_populate_pte_fn_t populate_pte_fn)
+ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size,
+				     pcpu_fc_alloc_fn_t alloc_fn,
+				     pcpu_fc_free_fn_t free_fn,
+				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
+	char psize_str[16];
 	int unit_pages;
 	size_t pages_size;
 	struct page **pages;
@@ -1527,6 +1528,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 	int i, j;
 	ssize_t ret;
 
+	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
+
 	unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
 				  PCPU_MIN_UNIT_SIZE));
 
@@ -1542,8 +1545,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 
 			ptr = alloc_fn(cpu, PAGE_SIZE);
 			if (!ptr) {
-				pr_warning("PERCPU: failed to allocate "
-					   "4k page for cpu%u\n", cpu);
+				pr_warning("PERCPU: failed to allocate %s page "
+					   "for cpu%u\n", psize_str, cpu);
 				goto enomem;
 			}
 			pages[j++] = virt_to_page(ptr);
@@ -1580,8 +1583,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 	}
 
 	/* we're ready, commit */
-	pr_info("PERCPU: %d 4k pages/cpu @%p s%zu r%zu\n",
-		unit_pages, vm.addr, static_size, reserved_size);
+	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n",
+		unit_pages, psize_str, vm.addr, static_size, reserved_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
 				     unit_pages << PAGE_SHIFT, vm.addr, NULL);
-- 
cgit v1.2.3-71-gd317


From 08fc45806103e59a37418e84719b878f9bb32540 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:49 +0900
Subject: percpu: build first chunk allocators selectively

There's no need to build unused first chunk allocators in.  Define
CONFIG_NEED_PER_CPU_*_FIRST_CHUNK and let archs enable them
selectively.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/Kconfig       | 10 ++++++++++
 include/linux/percpu.h | 27 +++++----------------------
 mm/percpu.c            | 19 +++++++++++--------
 3 files changed, 26 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e06b2eeff9f2..f7ac27215512 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -150,6 +150,16 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
+config NEED_PER_CPU_EMBED_FIRST_CHUNK
+	def_bool y
+
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
+	def_bool y
+
+config NEED_PER_CPU_LPAGE_FIRST_CHUNK
+	def_bool y
+	depends on NEED_MULTIPLE_NODES
+
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 7989f61b03f3..e26788e0da4a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -70,17 +70,21 @@ extern size_t __init pcpu_setup_first_chunk(
 				ssize_t dyn_size, size_t unit_size,
 				void *base_addr, const int *unit_map);
 
+#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
 extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size);
+#endif
 
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 extern ssize_t __init pcpu_page_first_chunk(
 				size_t static_size, size_t reserved_size,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
+#endif
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 extern int __init pcpu_lpage_build_unit_map(
 				size_t static_size, size_t reserved_size,
 				ssize_t *dyn_sizep, size_t *unit_sizep,
@@ -98,27 +102,6 @@ extern ssize_t __init pcpu_lpage_first_chunk(
 
 extern void *pcpu_lpage_remapped(void *kaddr);
 #else
-static inline int pcpu_lpage_build_unit_map(
-				size_t static_size, size_t reserved_size,
-				ssize_t *dyn_sizep, size_t *unit_sizep,
-				size_t lpage_size, int *unit_map,
-				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
-{
-	return -EINVAL;
-}
-
-static inline ssize_t __init pcpu_lpage_first_chunk(
-				size_t static_size, size_t reserved_size,
-				size_t dyn_size, size_t unit_size,
-				size_t lpage_size, const int *unit_map,
-				int nr_units,
-				pcpu_fc_alloc_fn_t alloc_fn,
-				pcpu_fc_free_fn_t free_fn,
-				pcpu_fc_map_fn_t map_fn)
-{
-	return -EINVAL;
-}
-
 static inline void *pcpu_lpage_remapped(void *kaddr)
 {
 	return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 6feac7934904..7971997de310 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1414,8 +1414,9 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	return pcpu_unit_size;
 }
 
-static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
-				 ssize_t *dyn_sizep)
+static inline size_t pcpu_calc_fc_sizes(size_t static_size,
+					size_t reserved_size,
+					ssize_t *dyn_sizep)
 {
 	size_t size_sum;
 
@@ -1427,6 +1428,8 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
 	return size_sum;
 }
 
+#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
+	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
 /**
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @static_size: the size of static percpu area in bytes
@@ -1495,7 +1498,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 				      unit_size, base, NULL);
 }
+#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
+	  !CONFIG_HAVE_SETUP_PER_CPU_AREA */
 
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 /**
  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
  * @static_size: the size of static percpu area in bytes
@@ -1598,12 +1604,9 @@ out_free_ar:
 	free_bootmem(__pa(pages), pages_size);
 	return ret;
 }
+#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
 
-/*
- * Large page remapping first chunk setup helper
- */
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-
+#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 /**
  * pcpu_lpage_build_unit_map - build unit_map for large page remapping
  * @static_size: the size of static percpu area in bytes
@@ -1982,7 +1985,7 @@ void *pcpu_lpage_remapped(void *kaddr)
 
 	return NULL;
 }
-#endif
+#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */
 
 /*
  * Generic percpu area setup.
-- 
cgit v1.2.3-71-gd317


From f58dc01ba2ca9fe3ab2ba4ca43d9c8a735cf62d8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:50 +0900
Subject: percpu: generalize first chunk allocator selection

Now that all first chunk allocators are in mm/percpu.c, it makes sense
to make generalize percpu_alloc kernel parameter.  Define PCPU_FC_*
and set pcpu_chosen_fc using early_param() in mm/percpu.c.  Arch code
can use the set value to determine which first chunk allocator to use.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/kernel-parameters.txt | 11 ++++++-----
 arch/x86/kernel/setup_percpu.c      | 24 ++++++------------------
 include/linux/percpu.h              | 12 ++++++++++++
 mm/percpu.c                         | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 12e9eb77ee0d..dee9ce2e6cfa 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			Format: { 0 | 1 }
 			See arch/parisc/kernel/pdc_chassis.c
 
-	percpu_alloc=	[X86] Select which percpu first chunk allocator to use.
-			Allowed values are one of "lpage", "embed" and "page".
-			See comments in arch/x86/kernel/setup_percpu.c for
-			details on each allocator.  This parameter is primarily
-			for debugging and performance comparison.
+	percpu_alloc=	Select which percpu first chunk allocator to use.
+			Currently supported values are "embed", "page" and
+			"lpage".  Archs may support subset or none of the
+			selections.  See comments in mm/percpu.c for details
+			on each allocator.  This parameter is primarily	for
+			debugging and performance comparison.
 
 	pf.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 1e17711c29d6..b961d99e6416 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -267,16 +267,6 @@ static ssize_t __init setup_pcpu_page(size_t static_size)
 				     pcpup_populate_pte);
 }
 
-/* for explicit first chunk allocator selection */
-static char pcpu_chosen_alloc[16] __initdata;
-
-static int __init percpu_alloc_setup(char *str)
-{
-	strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
-	return 0;
-}
-early_param("percpu_alloc", percpu_alloc_setup);
-
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -307,19 +297,17 @@ void __init setup_per_cpu_areas(void)
 	 * each allocator for details.
 	 */
 	ret = -EINVAL;
-	if (strlen(pcpu_chosen_alloc)) {
-		if (strcmp(pcpu_chosen_alloc, "page")) {
-			if (!strcmp(pcpu_chosen_alloc, "lpage"))
+	if (pcpu_chosen_fc != PCPU_FC_AUTO) {
+		if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+			if (pcpu_chosen_fc == PCPU_FC_LPAGE)
 				ret = setup_pcpu_lpage(static_size, true);
-			else if (!strcmp(pcpu_chosen_alloc, "embed"))
-				ret = setup_pcpu_embed(static_size, true);
 			else
-				pr_warning("PERCPU: unknown allocator %s "
-					   "specified\n", pcpu_chosen_alloc);
+				ret = setup_pcpu_embed(static_size, true);
+
 			if (ret < 0)
 				pr_warning("PERCPU: %s allocator failed (%zd), "
 					   "falling back to page size\n",
-					   pcpu_chosen_alloc, ret);
+					   pcpu_fc_names[pcpu_chosen_fc], ret);
 		}
 	} else {
 		ret = setup_pcpu_lpage(static_size, false);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index e26788e0da4a..9be05cbe5ee0 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -59,6 +59,18 @@
 extern void *pcpu_base_addr;
 extern const int *pcpu_unit_map;
 
+enum pcpu_fc {
+	PCPU_FC_AUTO,
+	PCPU_FC_EMBED,
+	PCPU_FC_PAGE,
+	PCPU_FC_LPAGE,
+
+	PCPU_FC_NR,
+};
+extern const char *pcpu_fc_names[PCPU_FC_NR];
+
+extern enum pcpu_fc pcpu_chosen_fc;
+
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
diff --git a/mm/percpu.c b/mm/percpu.c
index 7971997de310..7fb40bb1555a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1414,6 +1414,38 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	return pcpu_unit_size;
 }
 
+const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
+	[PCPU_FC_AUTO]	= "auto",
+	[PCPU_FC_EMBED]	= "embed",
+	[PCPU_FC_PAGE]	= "page",
+	[PCPU_FC_LPAGE]	= "lpage",
+};
+
+enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
+
+static int __init percpu_alloc_setup(char *str)
+{
+	if (0)
+		/* nada */;
+#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
+	else if (!strcmp(str, "embed"))
+		pcpu_chosen_fc = PCPU_FC_EMBED;
+#endif
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+	else if (!strcmp(str, "page"))
+		pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
+	else if (!strcmp(str, "lpage"))
+		pcpu_chosen_fc = PCPU_FC_LPAGE;
+#endif
+	else
+		pr_warning("PERCPU: unknown allocator %s specified\n", str);
+
+	return 0;
+}
+early_param("percpu_alloc", percpu_alloc_setup);
+
 static inline size_t pcpu_calc_fc_sizes(size_t static_size,
 					size_t reserved_size,
 					ssize_t *dyn_sizep)
-- 
cgit v1.2.3-71-gd317


From 9a7737691e90d3cce0e5248f91826c50e5aa3fcf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:50 +0900
Subject: percpu: drop @static_size from first chunk allocators

First chunk allocators assume percpu areas have been linked using one
of PERCPU_*() macros and depend on __per_cpu_load symbol defined by
those macros, so there isn't much point in passing in static area size
explicitly when it can be easily calculated from __per_cpu_start and
__per_cpu_end.  Drop @static_size from all percpu first chunk
allocators and helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/kernel/setup_percpu.c | 34 +++++++++++++++-------------------
 include/linux/percpu.h         | 18 ++++++++----------
 mm/percpu.c                    | 29 +++++++++++++----------------
 3 files changed, 36 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index b961d99e6416..8aad486c688f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -157,7 +157,7 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
 		return REMOTE_DISTANCE;
 }
 
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
+static ssize_t __init setup_pcpu_lpage(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
@@ -184,8 +184,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 		return -ENOMEM;
 	}
 
-	ret = pcpu_lpage_build_unit_map(static_size,
-					PERCPU_FIRST_CHUNK_RESERVE,
+	ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE,
 					&dyn_size, &unit_size, PMD_SIZE,
 					unit_map, pcpu_lpage_cpu_distance);
 	if (ret < 0) {
@@ -208,9 +207,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 		}
 	}
 
-	ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
-				     dyn_size, unit_size, PMD_SIZE,
-				     unit_map, nr_units,
+	ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
+				     unit_size, PMD_SIZE, unit_map, nr_units,
 				     pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 out_free:
 	if (ret < 0)
@@ -218,7 +216,7 @@ out_free:
 	return ret;
 }
 #else
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
+static ssize_t __init setup_pcpu_lpage(bool chosen)
 {
 	return -EINVAL;
 }
@@ -232,7 +230,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
  * mapping so that it can use PMD mapping without additional TLB
  * pressure.
  */
-static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
+static ssize_t __init setup_pcpu_embed(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 
@@ -244,7 +242,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
 	if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
 		return -EINVAL;
 
-	return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+	return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
 				      reserve - PERCPU_FIRST_CHUNK_RESERVE);
 }
 
@@ -260,9 +258,9 @@ static void __init pcpup_populate_pte(unsigned long addr)
 	populate_extra_pte(addr);
 }
 
-static ssize_t __init setup_pcpu_page(size_t static_size)
+static ssize_t __init setup_pcpu_page(void)
 {
-	return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+	return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
 				     pcpu_fc_alloc, pcpu_fc_free,
 				     pcpup_populate_pte);
 }
@@ -282,7 +280,6 @@ static inline void setup_percpu_segment(int cpu)
 
 void __init setup_per_cpu_areas(void)
 {
-	size_t static_size = __per_cpu_end - __per_cpu_start;
 	unsigned int cpu;
 	unsigned long delta;
 	size_t pcpu_unit_size;
@@ -300,9 +297,9 @@ void __init setup_per_cpu_areas(void)
 	if (pcpu_chosen_fc != PCPU_FC_AUTO) {
 		if (pcpu_chosen_fc != PCPU_FC_PAGE) {
 			if (pcpu_chosen_fc == PCPU_FC_LPAGE)
-				ret = setup_pcpu_lpage(static_size, true);
+				ret = setup_pcpu_lpage(true);
 			else
-				ret = setup_pcpu_embed(static_size, true);
+				ret = setup_pcpu_embed(true);
 
 			if (ret < 0)
 				pr_warning("PERCPU: %s allocator failed (%zd), "
@@ -310,15 +307,14 @@ void __init setup_per_cpu_areas(void)
 					   pcpu_fc_names[pcpu_chosen_fc], ret);
 		}
 	} else {
-		ret = setup_pcpu_lpage(static_size, false);
+		ret = setup_pcpu_lpage(false);
 		if (ret < 0)
-			ret = setup_pcpu_embed(static_size, false);
+			ret = setup_pcpu_embed(false);
 	}
 	if (ret < 0)
-		ret = setup_pcpu_page(static_size);
+		ret = setup_pcpu_page();
 	if (ret < 0)
-		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
-		      static_size, ret);
+		panic("cannot initialize percpu area (err=%zd)", ret);
 
 	pcpu_unit_size = ret;
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9be05cbe5ee0..be2fc8fb9b6f 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -84,13 +84,12 @@ extern size_t __init pcpu_setup_first_chunk(
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
 extern ssize_t __init pcpu_embed_first_chunk(
-				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size);
+				size_t reserved_size, ssize_t dyn_size);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 extern ssize_t __init pcpu_page_first_chunk(
-				size_t static_size, size_t reserved_size,
+				size_t reserved_size,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
@@ -98,16 +97,15 @@ extern ssize_t __init pcpu_page_first_chunk(
 
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 extern int __init pcpu_lpage_build_unit_map(
-				size_t static_size, size_t reserved_size,
-				ssize_t *dyn_sizep, size_t *unit_sizep,
-				size_t lpage_size, int *unit_map,
+				size_t reserved_size, ssize_t *dyn_sizep,
+				size_t *unit_sizep, size_t lpage_size,
+				int *unit_map,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
 
 extern ssize_t __init pcpu_lpage_first_chunk(
-				size_t static_size, size_t reserved_size,
-				size_t dyn_size, size_t unit_size,
-				size_t lpage_size, const int *unit_map,
-				int nr_units,
+				size_t reserved_size, size_t dyn_size,
+				size_t unit_size, size_t lpage_size,
+				const int *unit_map, int nr_units,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn);
diff --git a/mm/percpu.c b/mm/percpu.c
index 7fb40bb1555a..e2ac58a39bb2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1464,7 +1464,6 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size,
 	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
 /**
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
- * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  *
@@ -1489,9 +1488,9 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
-ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
-				      ssize_t dyn_size)
+ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 {
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
 	size_t size_sum, unit_size, chunk_size;
 	void *base;
 	unsigned int cpu;
@@ -1536,7 +1535,6 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 /**
  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
- * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
  * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
@@ -1552,12 +1550,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
-ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size,
+ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 				     pcpu_fc_alloc_fn_t alloc_fn,
 				     pcpu_fc_free_fn_t free_fn,
 				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
 	char psize_str[16];
 	int unit_pages;
 	size_t pages_size;
@@ -1641,7 +1640,6 @@ out_free_ar:
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 /**
  * pcpu_lpage_build_unit_map - build unit_map for large page remapping
- * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
  * @unit_sizep: out parameter for unit size
@@ -1661,13 +1659,14 @@ out_free_ar:
  * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
  * returns the number of units to be allocated.  -errno on failure.
  */
-int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
-				     ssize_t *dyn_sizep, size_t *unit_sizep,
-				     size_t lpage_size, int *unit_map,
+int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
+				     size_t *unit_sizep, size_t lpage_size,
+				     int *unit_map,
 				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 {
 	static int group_map[NR_CPUS] __initdata;
 	static int group_cnt[NR_CPUS] __initdata;
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
 	int group_cnt_max = 0;
 	size_t size_sum, min_unit_size, alloc_size;
 	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
@@ -1819,7 +1818,6 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
 
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
- * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_size: free size for dynamic allocation in bytes
  * @unit_size: unit size in bytes
@@ -1850,15 +1848,15 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
-ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
-				      size_t dyn_size, size_t unit_size,
-				      size_t lpage_size, const int *unit_map,
-				      int nr_units,
+ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
+				      size_t unit_size, size_t lpage_size,
+				      const int *unit_map, int nr_units,
 				      pcpu_fc_alloc_fn_t alloc_fn,
 				      pcpu_fc_free_fn_t free_fn,
 				      pcpu_fc_map_fn_t map_fn)
 {
 	static struct vm_struct vm;
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
 	size_t chunk_size = unit_size * nr_units;
 	size_t map_size;
 	unsigned int cpu;
@@ -2037,7 +2035,6 @@ EXPORT_SYMBOL(__per_cpu_offset);
 
 void __init setup_per_cpu_areas(void)
 {
-	size_t static_size = __per_cpu_end - __per_cpu_start;
 	ssize_t unit_size;
 	unsigned long delta;
 	unsigned int cpu;
@@ -2046,7 +2043,7 @@ void __init setup_per_cpu_areas(void)
 	 * Always reserve area for module percpu variables.  That's
 	 * what the legacy allocator did.
 	 */
-	unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
+	unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
 					   PERCPU_DYNAMIC_RESERVE);
 	if (unit_size < 0)
 		panic("Failed to initialized percpu areas.");
-- 
cgit v1.2.3-71-gd317


From 1d9d32572163b30be81dbe1409dfa7ea9763d0e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:50 +0900
Subject: percpu: make @dyn_size mandatory for pcpu_setup_first_chunk()

Now that all actual first chunk allocation and copying happen in the
first chunk allocators and helpers, there's no reason for
pcpu_setup_first_chunk() to try to determine @dyn_size automatically.
The only left user is page first chunk allocator.  Make it determine
dyn_size like other allocators and make @dyn_size mandatory for
pcpu_setup_first_chunk().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/percpu.h |  2 +-
 mm/percpu.c            | 39 +++++++++++++++++++--------------------
 2 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index be2fc8fb9b6f..0cfdd14d096a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -79,7 +79,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, size_t unit_size,
+				size_t dyn_size, size_t unit_size,
 				void *base_addr, const int *unit_map);
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/mm/percpu.c b/mm/percpu.c
index e2ac58a39bb2..287f59cc5fb9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1235,7 +1235,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes, 0 for none
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
  * @base_addr: mapped address
  * @unit_map: cpu -> unit map, NULL for sequential mapping
@@ -1252,10 +1252,9 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * limited offset range for symbol relocations to guarantee module
  * percpu symbols fall inside the relocatable range.
  *
- * @dyn_size, if non-negative, determines the number of bytes
- * available for dynamic allocation in the first chunk.  Specifying
- * non-negative value makes percpu leave alone the area beyond
- * @static_size + @reserved_size + @dyn_size.
+ * @dyn_size determines the number of bytes available for dynamic
+ * allocation in the first chunk.  The area between @static_size +
+ * @reserved_size + @dyn_size and @unit_size is unused.
  *
  * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
  * equal to or larger than @static_size + @reserved_size + if
@@ -1276,13 +1275,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * percpu access.
  */
 size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
-				     ssize_t dyn_size, size_t unit_size,
+				     size_t dyn_size, size_t unit_size,
 				     void *base_addr, const int *unit_map)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
-	size_t size_sum = static_size + reserved_size +
-			  (dyn_size >= 0 ? dyn_size : 0);
+	size_t size_sum = static_size + reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
 	unsigned int cpu, tcpu;
 	int i;
@@ -1345,9 +1343,6 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 
-	if (dyn_size < 0)
-		dyn_size = pcpu_unit_size - static_size - reserved_size;
-
 	first_vm.flags = VM_ALLOC;
 	first_vm.size = pcpu_chunk_size;
 	first_vm.addr = base_addr;
@@ -1557,6 +1552,8 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 {
 	static struct vm_struct vm;
 	const size_t static_size = __per_cpu_end - __per_cpu_start;
+	ssize_t dyn_size = -1;
+	size_t size_sum, unit_size;
 	char psize_str[16];
 	int unit_pages;
 	size_t pages_size;
@@ -1567,8 +1564,9 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 
-	unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
-				  PCPU_MIN_UNIT_SIZE));
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+	unit_pages = unit_size >> PAGE_SHIFT;
 
 	/* unaligned allocations can't be freed, round up to page size */
 	pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
@@ -1591,12 +1589,12 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 
 	/* allocate vm area, map the pages and copy static data */
 	vm.flags = VM_ALLOC;
-	vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
+	vm.size = nr_cpu_ids * unit_size;
 	vm_area_register_early(&vm, PAGE_SIZE);
 
 	for_each_possible_cpu(cpu) {
-		unsigned long unit_addr = (unsigned long)vm.addr +
-			(cpu * unit_pages << PAGE_SHIFT);
+		unsigned long unit_addr =
+			(unsigned long)vm.addr + cpu * unit_size;
 
 		for (i = 0; i < unit_pages; i++)
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
@@ -1620,11 +1618,12 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 	}
 
 	/* we're ready, commit */
-	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n",
-		unit_pages, psize_str, vm.addr, static_size, reserved_size);
+	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+		unit_pages, psize_str, vm.addr, static_size, reserved_size,
+		dyn_size);
 
-	ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
-				     unit_pages << PAGE_SHIFT, vm.addr, NULL);
+	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+				     unit_size, vm.addr, NULL);
 	goto out_free_ar;
 
 enomem:
-- 
cgit v1.2.3-71-gd317


From 3cbc85652767c38b252c8de55f9fd180b29e4c0d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:50 +0900
Subject: percpu: add @align to pcpu_fc_alloc_fn_t

pcpu_fc_alloc_fn_t is about to see more interesting usage, add @align
parameter.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/kernel/setup_percpu.c | 4 ++--
 include/linux/percpu.h         | 3 ++-
 mm/percpu.c                    | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8aad486c688f..660cde133141 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -126,9 +126,9 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 /*
  * Helpers for first chunk memory allocation
  */
-static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size)
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
 {
-	return pcpu_alloc_bootmem(cpu, size, size);
+	return pcpu_alloc_bootmem(cpu, size, align);
 }
 
 static void __init pcpu_fc_free(void *ptr, size_t size)
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 0cfdd14d096a..d385dbcf190b 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -71,7 +71,8 @@ extern const char *pcpu_fc_names[PCPU_FC_NR];
 
 extern enum pcpu_fc pcpu_chosen_fc;
 
-typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
+typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
+				     size_t align);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
diff --git a/mm/percpu.c b/mm/percpu.c
index 287f59cc5fb9..3316e3aac7ee 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1578,7 +1578,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 		for (i = 0; i < unit_pages; i++) {
 			void *ptr;
 
-			ptr = alloc_fn(cpu, PAGE_SIZE);
+			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
 			if (!ptr) {
 				pr_warning("PERCPU: failed to allocate %s page "
 					   "for cpu%u\n", psize_str, cpu);
@@ -1888,7 +1888,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 				goto found;
 		continue;
 	found:
-		ptr = alloc_fn(cpu, lpage_size);
+		ptr = alloc_fn(cpu, lpage_size, lpage_size);
 		if (!ptr) {
 			pr_warning("PERCPU: failed to allocate large page "
 				   "for cpu%u\n", cpu);
-- 
cgit v1.2.3-71-gd317


From 033e48fb82958053113178264ddb9d5038d5e38b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:51 +0900
Subject: percpu: move pcpu_lpage_build_unit_map() and pcpul_lpage_dump_cfg()
 upward

Unit map handling will be generalized and extended and used for
embedding sparse first chunk and other purposes.  Relocate two
unit_map related functions upward in preparation.  This patch just
moves the code without any actual change.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/percpu.h |  14 +-
 mm/percpu.c            | 339 +++++++++++++++++++++++++------------------------
 2 files changed, 180 insertions(+), 173 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index d385dbcf190b..570fb18de2ba 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -78,6 +78,14 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
+#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
+extern int __init pcpu_lpage_build_unit_map(
+				size_t reserved_size, ssize_t *dyn_sizep,
+				size_t *unit_sizep, size_t lpage_size,
+				int *unit_map,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
+#endif
+
 extern size_t __init pcpu_setup_first_chunk(
 				size_t static_size, size_t reserved_size,
 				size_t dyn_size, size_t unit_size,
@@ -97,12 +105,6 @@ extern ssize_t __init pcpu_page_first_chunk(
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-extern int __init pcpu_lpage_build_unit_map(
-				size_t reserved_size, ssize_t *dyn_sizep,
-				size_t *unit_sizep, size_t lpage_size,
-				int *unit_map,
-				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
-
 extern ssize_t __init pcpu_lpage_first_chunk(
 				size_t reserved_size, size_t dyn_size,
 				size_t unit_size, size_t lpage_size,
diff --git a/mm/percpu.c b/mm/percpu.c
index 3316e3aac7ee..2b9c4b2a2fc0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1231,6 +1231,178 @@ void free_percpu(void *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
+static inline size_t pcpu_calc_fc_sizes(size_t static_size,
+					size_t reserved_size,
+					ssize_t *dyn_sizep)
+{
+	size_t size_sum;
+
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
+	if (*dyn_sizep != 0)
+		*dyn_sizep = size_sum - static_size - reserved_size;
+
+	return size_sum;
+}
+
+#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds cpu -> unit map and determine other parameters
+ * considering needed percpu size, large page size and distances
+ * between CPUs in NUMA.
+ *
+ * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
+ * may share units in the same large page.  The returned configuration
+ * is guaranteed to have CPUs on different nodes on different large
+ * pages and >=75% usage of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated.  -errno on failure.
+ */
+int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
+				     size_t *unit_sizep, size_t lpage_size,
+				     int *unit_map,
+				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	static int group_map[NR_CPUS] __initdata;
+	static int group_cnt[NR_CPUS] __initdata;
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
+	int group_cnt_max = 0;
+	size_t size_sum, min_unit_size, alloc_size;
+	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
+	int last_allocs;
+	unsigned int cpu, tcpu;
+	int group, unit;
+
+	/*
+	 * Determine min_unit_size, alloc_size and max_upa such that
+	 * alloc_size is multiple of lpage_size and is the smallest
+	 * which can accomodate 4k aligned segments which are equal to
+	 * or larger than min_unit_size.
+	 */
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+	alloc_size = roundup(min_unit_size, lpage_size);
+	upa = alloc_size / min_unit_size;
+	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+		upa--;
+	max_upa = upa;
+
+	/* group cpus according to their proximity */
+	for_each_possible_cpu(cpu) {
+		group = 0;
+	next_group:
+		for_each_possible_cpu(tcpu) {
+			if (cpu == tcpu)
+				break;
+			if (group_map[tcpu] == group &&
+			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+				group++;
+				goto next_group;
+			}
+		}
+		group_map[cpu] = group;
+		group_cnt[group]++;
+		group_cnt_max = max(group_cnt_max, group_cnt[group]);
+	}
+
+	/*
+	 * Expand unit size until address space usage goes over 75%
+	 * and then as much as possible without using more address
+	 * space.
+	 */
+	last_allocs = INT_MAX;
+	for (upa = max_upa; upa; upa--) {
+		int allocs = 0, wasted = 0;
+
+		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+			continue;
+
+		for (group = 0; group_cnt[group]; group++) {
+			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+			allocs += this_allocs;
+			wasted += this_allocs * upa - group_cnt[group];
+		}
+
+		/*
+		 * Don't accept if wastage is over 25%.  The
+		 * greater-than comparison ensures upa==1 always
+		 * passes the following check.
+		 */
+		if (wasted > num_possible_cpus() / 3)
+			continue;
+
+		/* and then don't consume more memory */
+		if (allocs > last_allocs)
+			break;
+		last_allocs = allocs;
+		best_upa = upa;
+	}
+	*unit_sizep = alloc_size / best_upa;
+
+	/* assign units to cpus accordingly */
+	unit = 0;
+	for (group = 0; group_cnt[group]; group++) {
+		for_each_possible_cpu(cpu)
+			if (group_map[cpu] == group)
+				unit_map[cpu] = unit++;
+		unit = roundup(unit, best_upa);
+	}
+
+	return unit;	/* unit contains aligned number of units */
+}
+
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+				     unsigned int *cpup);
+
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+					size_t reserved_size, size_t dyn_size,
+					size_t unit_size, size_t lpage_size,
+					const int *unit_map, int nr_units)
+{
+	int width = 1, v = nr_units;
+	char empty_str[] = "--------";
+	int upl, lpl;	/* units per lpage, lpage per line */
+	unsigned int cpu;
+	int lpage, unit;
+
+	while (v /= 10)
+		width++;
+	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+	upl = max_t(int, lpage_size / unit_size, 1);
+	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+	       static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+	for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+		if (!(unit % upl)) {
+			if (!(lpage++ % lpl)) {
+				printk("\n");
+				printk("%spcpu-lpage: ", lvl);
+			} else
+				printk("| ");
+		}
+		if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+			printk("%0*d ", width, cpu);
+		else
+			printk("%s ", empty_str);
+	}
+	printk("\n");
+}
+#endif
+
 /**
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @static_size: the size of static percpu area in bytes
@@ -1441,20 +1613,6 @@ static int __init percpu_alloc_setup(char *str)
 }
 early_param("percpu_alloc", percpu_alloc_setup);
 
-static inline size_t pcpu_calc_fc_sizes(size_t static_size,
-					size_t reserved_size,
-					ssize_t *dyn_sizep)
-{
-	size_t size_sum;
-
-	size_sum = PFN_ALIGN(static_size + reserved_size +
-			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
-	if (*dyn_sizep != 0)
-		*dyn_sizep = size_sum - static_size - reserved_size;
-
-	return size_sum;
-}
-
 #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
 	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
 /**
@@ -1637,122 +1795,6 @@ out_free_ar:
 #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
 
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-/**
- * pcpu_lpage_build_unit_map - build unit_map for large page remapping
- * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
- * @unit_sizep: out parameter for unit size
- * @unit_map: unit_map to be filled
- * @cpu_distance_fn: callback to determine distance between cpus
- *
- * This function builds cpu -> unit map and determine other parameters
- * considering needed percpu size, large page size and distances
- * between CPUs in NUMA.
- *
- * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
- * may share units in the same large page.  The returned configuration
- * is guaranteed to have CPUs on different nodes on different large
- * pages and >=75% usage of allocated virtual address space.
- *
- * RETURNS:
- * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
- * returns the number of units to be allocated.  -errno on failure.
- */
-int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
-				     size_t *unit_sizep, size_t lpage_size,
-				     int *unit_map,
-				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
-{
-	static int group_map[NR_CPUS] __initdata;
-	static int group_cnt[NR_CPUS] __initdata;
-	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	int group_cnt_max = 0;
-	size_t size_sum, min_unit_size, alloc_size;
-	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
-	int last_allocs;
-	unsigned int cpu, tcpu;
-	int group, unit;
-
-	/*
-	 * Determine min_unit_size, alloc_size and max_upa such that
-	 * alloc_size is multiple of lpage_size and is the smallest
-	 * which can accomodate 4k aligned segments which are equal to
-	 * or larger than min_unit_size.
-	 */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
-	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-
-	alloc_size = roundup(min_unit_size, lpage_size);
-	upa = alloc_size / min_unit_size;
-	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
-		upa--;
-	max_upa = upa;
-
-	/* group cpus according to their proximity */
-	for_each_possible_cpu(cpu) {
-		group = 0;
-	next_group:
-		for_each_possible_cpu(tcpu) {
-			if (cpu == tcpu)
-				break;
-			if (group_map[tcpu] == group &&
-			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
-			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
-				group++;
-				goto next_group;
-			}
-		}
-		group_map[cpu] = group;
-		group_cnt[group]++;
-		group_cnt_max = max(group_cnt_max, group_cnt[group]);
-	}
-
-	/*
-	 * Expand unit size until address space usage goes over 75%
-	 * and then as much as possible without using more address
-	 * space.
-	 */
-	last_allocs = INT_MAX;
-	for (upa = max_upa; upa; upa--) {
-		int allocs = 0, wasted = 0;
-
-		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
-			continue;
-
-		for (group = 0; group_cnt[group]; group++) {
-			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
-			allocs += this_allocs;
-			wasted += this_allocs * upa - group_cnt[group];
-		}
-
-		/*
-		 * Don't accept if wastage is over 25%.  The
-		 * greater-than comparison ensures upa==1 always
-		 * passes the following check.
-		 */
-		if (wasted > num_possible_cpus() / 3)
-			continue;
-
-		/* and then don't consume more memory */
-		if (allocs > last_allocs)
-			break;
-		last_allocs = allocs;
-		best_upa = upa;
-	}
-	*unit_sizep = alloc_size / best_upa;
-
-	/* assign units to cpus accordingly */
-	unit = 0;
-	for (group = 0; group_cnt[group]; group++) {
-		for_each_possible_cpu(cpu)
-			if (group_map[cpu] == group)
-				unit_map[cpu] = unit++;
-		unit = roundup(unit, best_upa);
-	}
-
-	return unit;	/* unit contains aligned number of units */
-}
-
 struct pcpul_ent {
 	void		*ptr;
 	void		*map_addr;
@@ -1778,43 +1820,6 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
 	return false;
 }
 
-static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
-					size_t reserved_size, size_t dyn_size,
-					size_t unit_size, size_t lpage_size,
-					const int *unit_map, int nr_units)
-{
-	int width = 1, v = nr_units;
-	char empty_str[] = "--------";
-	int upl, lpl;	/* units per lpage, lpage per line */
-	unsigned int cpu;
-	int lpage, unit;
-
-	while (v /= 10)
-		width++;
-	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
-
-	upl = max_t(int, lpage_size / unit_size, 1);
-	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
-
-	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
-	       static_size, reserved_size, dyn_size, unit_size, lpage_size);
-
-	for (lpage = 0, unit = 0; unit < nr_units; unit++) {
-		if (!(unit % upl)) {
-			if (!(lpage++ % lpl)) {
-				printk("\n");
-				printk("%spcpu-lpage: ", lvl);
-			} else
-				printk("| ");
-		}
-		if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
-			printk("%0*d ", width, cpu);
-		else
-			printk("%s ", empty_str);
-	}
-	printk("\n");
-}
-
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
  * @reserved_size: the size of reserved percpu area in bytes
-- 
cgit v1.2.3-71-gd317


From fd1e8a1fe2b54df6c185b4fa65f181f50b9c4d4e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:51 +0900
Subject: percpu: introduce pcpu_alloc_info and pcpu_group_info

Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array.  For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem.  Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.

This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus.  pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.

pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.

* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
  help dealing with pcpu_alloc_info.

* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
  generalized and renamed to pcpu_build_alloc_info().
  @cpu_distance_fn may be NULL indicating that all cpus are of
  LOCAL_DISTANCE.

* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
  generalized and renamed to pcpu_dump_alloc_info().  It now also
  prints which group each alloc unit belongs to.

* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
  separate parameters.  All first chunk allocators are updated to use
  pcpu_build_alloc_info() to build alloc_info and call
  pcpu_setup_first_chunk() with it.  This has the side effect of
  packing units for sparse possible cpus.  ie. if cpus 0, 2 and 4 are
  possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.

* x86 setup_pcpu_lpage() is updated to deal with alloc_info.

* sparc64 setup_per_cpu_areas() is updated to build alloc_info.

Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus.  It mostly changes how information is passed among
initialization functions and makes room for more flexibility.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c     |  24 +-
 arch/x86/kernel/setup_percpu.c |  38 ++-
 include/linux/percpu.h         |  42 +++-
 mm/percpu.c                    | 529 +++++++++++++++++++++++++----------------
 4 files changed, 389 insertions(+), 244 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 9856d866b77b..a42a4a744d14 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1475,17 +1475,29 @@ static void __init pcpu_map_range(unsigned long start, unsigned long end,
 
 void __init setup_per_cpu_areas(void)
 {
-	size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
 	static struct vm_struct vm;
+	struct pcpu_alloc_info *ai;
 	unsigned long delta, cpu;
 	size_t size_sum, pcpu_unit_size;
 	size_t ptrs_size;
 	void **ptrs;
 
-	size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+	ai = pcpu_alloc_alloc_info(1, nr_cpu_ids);
+
+	ai->static_size = __per_cpu_end - __per_cpu_start;
+	ai->reserved_size = PERCPU_MODULE_RESERVE;
+
+	size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size +
 			     PERCPU_DYNAMIC_RESERVE);
-	dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
 
+	ai->dyn_size = size_sum - ai->static_size - ai->reserved_size;
+	ai->unit_size = PCPU_CHUNK_SIZE;
+	ai->atom_size = PCPU_CHUNK_SIZE;
+	ai->alloc_size = PCPU_CHUNK_SIZE;
+	ai->groups[0].nr_units = nr_cpu_ids;
+
+	for_each_possible_cpu(cpu)
+		ai->groups[0].cpu_map[cpu] = cpu;
 
 	ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
 	ptrs = alloc_bootmem(ptrs_size);
@@ -1497,7 +1509,7 @@ void __init setup_per_cpu_areas(void)
 		free_bootmem(__pa(ptrs[cpu] + size_sum),
 			     PCPU_CHUNK_SIZE - size_sum);
 
-		memcpy(ptrs[cpu], __per_cpu_load, static_size);
+		memcpy(ptrs[cpu], __per_cpu_load, ai->static_size);
 	}
 
 	/* allocate address and map */
@@ -1514,9 +1526,7 @@ void __init setup_per_cpu_areas(void)
 		pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
 	}
 
-	pcpu_unit_size = pcpu_setup_first_chunk(static_size,
-						PERCPU_MODULE_RESERVE, dyn_size,
-						PCPU_CHUNK_SIZE, vm.addr, NULL);
+	pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr);
 
 	free_bootmem(__pa(ptrs), ptrs_size);
 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 660cde133141..db5f9c49fec5 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -161,9 +161,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
-	size_t unit_map_size, unit_size;
-	int *unit_map;
-	int nr_units;
+	struct pcpu_alloc_info *ai;
 	ssize_t ret;
 
 	/* on non-NUMA, embedding is better */
@@ -177,26 +175,22 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 	}
 
 	/* allocate and build unit_map */
-	unit_map_size = nr_cpu_ids * sizeof(int);
-	unit_map = alloc_bootmem_nopanic(unit_map_size);
-	if (!unit_map) {
-		pr_warning("PERCPU: failed to allocate unit_map\n");
-		return -ENOMEM;
+	ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
+				   PMD_SIZE, pcpu_lpage_cpu_distance);
+	if (IS_ERR(ai)) {
+		pr_warning("PERCPU: failed to build unit_map (%ld)\n",
+			   PTR_ERR(ai));
+		return PTR_ERR(ai);
 	}
 
-	ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE,
-					&dyn_size, &unit_size, PMD_SIZE,
-					unit_map, pcpu_lpage_cpu_distance);
-	if (ret < 0) {
-		pr_warning("PERCPU: failed to build unit_map\n");
-		goto out_free;
-	}
-	nr_units = ret;
-
 	/* do the parameters look okay? */
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
-		size_t tot_size = nr_units * unit_size;
+		size_t tot_size = 0;
+		int group;
+
+		for (group = 0; group < ai->nr_groups; group++)
+			tot_size += ai->unit_size * ai->groups[group].nr_units;
 
 		/* don't consume more than 20% of vmalloc area */
 		if (tot_size > vm_size / 5) {
@@ -207,12 +201,10 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 		}
 	}
 
-	ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				     unit_size, PMD_SIZE, unit_map, nr_units,
-				     pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
+	ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free,
+				     pcpul_map);
 out_free:
-	if (ret < 0)
-		free_bootmem(__pa(unit_map), unit_map_size);
+	pcpu_free_alloc_info(ai);
 	return ret;
 }
 #else
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 570fb18de2ba..77b86be8ce4f 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -59,6 +59,25 @@
 extern void *pcpu_base_addr;
 extern const int *pcpu_unit_map;
 
+struct pcpu_group_info {
+	int			nr_units;	/* aligned # of units */
+	unsigned long		base_offset;	/* base address offset */
+	unsigned int		*cpu_map;	/* unit->cpu map, empty
+						 * entries contain NR_CPUS */
+};
+
+struct pcpu_alloc_info {
+	size_t			static_size;
+	size_t			reserved_size;
+	size_t			dyn_size;
+	size_t			unit_size;
+	size_t			atom_size;
+	size_t			alloc_size;
+	size_t			__ai_size;	/* internal, don't use */
+	int			nr_groups;	/* 0 if grouping unnecessary */
+	struct pcpu_group_info	groups[];
+};
+
 enum pcpu_fc {
 	PCPU_FC_AUTO,
 	PCPU_FC_EMBED,
@@ -78,18 +97,17 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
-#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-extern int __init pcpu_lpage_build_unit_map(
-				size_t reserved_size, ssize_t *dyn_sizep,
-				size_t *unit_sizep, size_t lpage_size,
-				int *unit_map,
+extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+							     int nr_units);
+extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai);
+
+extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+				size_t reserved_size, ssize_t dyn_size,
+				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
-#endif
 
-extern size_t __init pcpu_setup_first_chunk(
-				size_t static_size, size_t reserved_size,
-				size_t dyn_size, size_t unit_size,
-				void *base_addr, const int *unit_map);
+extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+					    void *base_addr);
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
 extern ssize_t __init pcpu_embed_first_chunk(
@@ -106,9 +124,7 @@ extern ssize_t __init pcpu_page_first_chunk(
 
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 extern ssize_t __init pcpu_lpage_first_chunk(
-				size_t reserved_size, size_t dyn_size,
-				size_t unit_size, size_t lpage_size,
-				const int *unit_map, int nr_units,
+				const struct pcpu_alloc_info *ai,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn);
diff --git a/mm/percpu.c b/mm/percpu.c
index 2b9c4b2a2fc0..99f7fa682722 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -58,6 +58,7 @@
 
 #include <linux/bitmap.h>
 #include <linux/bootmem.h>
+#include <linux/err.h>
 #include <linux/list.h>
 #include <linux/log2.h>
 #include <linux/mm.h>
@@ -1245,53 +1246,108 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size,
 	return size_sum;
 }
 
-#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 /**
- * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * pcpu_alloc_alloc_info - allocate percpu allocation info
+ * @nr_groups: the number of groups
+ * @nr_units: the number of units
+ *
+ * Allocate ai which is large enough for @nr_groups groups containing
+ * @nr_units units.  The returned ai's groups[0].cpu_map points to the
+ * cpu_map array which is long enough for @nr_units and filled with
+ * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
+ * pointer of other groups.
+ *
+ * RETURNS:
+ * Pointer to the allocated pcpu_alloc_info on success, NULL on
+ * failure.
+ */
+struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+						      int nr_units)
+{
+	struct pcpu_alloc_info *ai;
+	size_t base_size, ai_size;
+	void *ptr;
+	int unit;
+
+	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+			  __alignof__(ai->groups[0].cpu_map[0]));
+	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
+
+	ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+	if (!ptr)
+		return NULL;
+	ai = ptr;
+	ptr += base_size;
+
+	ai->groups[0].cpu_map = ptr;
+
+	for (unit = 0; unit < nr_units; unit++)
+		ai->groups[0].cpu_map[unit] = NR_CPUS;
+
+	ai->nr_groups = nr_groups;
+	ai->__ai_size = PFN_ALIGN(ai_size);
+
+	return ai;
+}
+
+/**
+ * pcpu_free_alloc_info - free percpu allocation info
+ * @ai: pcpu_alloc_info to free
+ *
+ * Free @ai which was allocated by pcpu_alloc_alloc_info().
+ */
+void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
+{
+	free_bootmem(__pa(ai), ai->__ai_size);
+}
+
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
- * @unit_sizep: out parameter for unit size
- * @unit_map: unit_map to be filled
- * @cpu_distance_fn: callback to determine distance between cpus
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
  *
- * This function builds cpu -> unit map and determine other parameters
- * considering needed percpu size, large page size and distances
- * between CPUs in NUMA.
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
  *
- * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
- * may share units in the same large page.  The returned configuration
- * is guaranteed to have CPUs on different nodes on different large
- * pages and >=75% usage of allocated virtual address space.
+ * Groups are always mutliples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group.  The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
  *
  * RETURNS:
- * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
- * returns the number of units to be allocated.  -errno on failure.
+ * On success, pointer to the new allocation_info is returned.  On
+ * failure, ERR_PTR value is returned.
  */
-int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
-				     size_t *unit_sizep, size_t lpage_size,
-				     int *unit_map,
-				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+				size_t reserved_size, ssize_t dyn_size,
+				size_t atom_size,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 {
 	static int group_map[NR_CPUS] __initdata;
 	static int group_cnt[NR_CPUS] __initdata;
 	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	int group_cnt_max = 0;
+	int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
 	size_t size_sum, min_unit_size, alloc_size;
 	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
-	int last_allocs;
+	int last_allocs, group, unit;
 	unsigned int cpu, tcpu;
-	int group, unit;
+	struct pcpu_alloc_info *ai;
+	unsigned int *cpu_map;
 
 	/*
 	 * Determine min_unit_size, alloc_size and max_upa such that
-	 * alloc_size is multiple of lpage_size and is the smallest
+	 * alloc_size is multiple of atom_size and is the smallest
 	 * which can accomodate 4k aligned segments which are equal to
 	 * or larger than min_unit_size.
 	 */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 
-	alloc_size = roundup(min_unit_size, lpage_size);
+	alloc_size = roundup(min_unit_size, atom_size);
 	upa = alloc_size / min_unit_size;
 	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 		upa--;
@@ -1304,10 +1360,11 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
 		for_each_possible_cpu(tcpu) {
 			if (cpu == tcpu)
 				break;
-			if (group_map[tcpu] == group &&
+			if (group_map[tcpu] == group && cpu_distance_fn &&
 			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
 			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
 				group++;
+				nr_groups = max(nr_groups, group + 1);
 				goto next_group;
 			}
 		}
@@ -1328,7 +1385,7 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
 		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 			continue;
 
-		for (group = 0; group_cnt[group]; group++) {
+		for (group = 0; group < nr_groups; group++) {
 			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
 			allocs += this_allocs;
 			wasted += this_allocs * upa - group_cnt[group];
@@ -1348,75 +1405,122 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
 		last_allocs = allocs;
 		best_upa = upa;
 	}
-	*unit_sizep = alloc_size / best_upa;
+	upa = best_upa;
+
+	/* allocate and fill alloc_info */
+	for (group = 0; group < nr_groups; group++)
+		nr_units += roundup(group_cnt[group], upa);
+
+	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+	if (!ai)
+		return ERR_PTR(-ENOMEM);
+	cpu_map = ai->groups[0].cpu_map;
+
+	for (group = 0; group < nr_groups; group++) {
+		ai->groups[group].cpu_map = cpu_map;
+		cpu_map += roundup(group_cnt[group], upa);
+	}
+
+	ai->static_size = static_size;
+	ai->reserved_size = reserved_size;
+	ai->dyn_size = dyn_size;
+	ai->unit_size = alloc_size / upa;
+	ai->atom_size = atom_size;
+	ai->alloc_size = alloc_size;
+
+	for (group = 0, unit = 0; group_cnt[group]; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+
+		/*
+		 * Initialize base_offset as if all groups are located
+		 * back-to-back.  The caller should update this to
+		 * reflect actual allocation.
+		 */
+		gi->base_offset = unit * ai->unit_size;
 
-	/* assign units to cpus accordingly */
-	unit = 0;
-	for (group = 0; group_cnt[group]; group++) {
 		for_each_possible_cpu(cpu)
 			if (group_map[cpu] == group)
-				unit_map[cpu] = unit++;
-		unit = roundup(unit, best_upa);
+				gi->cpu_map[gi->nr_units++] = cpu;
+		gi->nr_units = roundup(gi->nr_units, upa);
+		unit += gi->nr_units;
 	}
+	BUG_ON(unit != nr_units);
 
-	return unit;	/* unit contains aligned number of units */
+	return ai;
 }
 
-static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
-				     unsigned int *cpup);
-
-static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
-					size_t reserved_size, size_t dyn_size,
-					size_t unit_size, size_t lpage_size,
-					const int *unit_map, int nr_units)
+/**
+ * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
+ * @lvl: loglevel
+ * @ai: allocation info to dump
+ *
+ * Print out information about @ai using loglevel @lvl.
+ */
+static void pcpu_dump_alloc_info(const char *lvl,
+				 const struct pcpu_alloc_info *ai)
 {
-	int width = 1, v = nr_units;
+	int group_width = 1, cpu_width = 1, width;
 	char empty_str[] = "--------";
-	int upl, lpl;	/* units per lpage, lpage per line */
-	unsigned int cpu;
-	int lpage, unit;
+	int alloc = 0, alloc_end = 0;
+	int group, v;
+	int upa, apl;	/* units per alloc, allocs per line */
+
+	v = ai->nr_groups;
+	while (v /= 10)
+		group_width++;
 
+	v = num_possible_cpus();
 	while (v /= 10)
-		width++;
-	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+		cpu_width++;
+	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
 
-	upl = max_t(int, lpage_size / unit_size, 1);
-	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+	upa = ai->alloc_size / ai->unit_size;
+	width = upa * (cpu_width + 1) + group_width + 3;
+	apl = rounddown_pow_of_two(max(60 / width, 1));
 
-	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
-	       static_size, reserved_size, dyn_size, unit_size, lpage_size);
+	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
+	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
+	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
 
-	for (lpage = 0, unit = 0; unit < nr_units; unit++) {
-		if (!(unit % upl)) {
-			if (!(lpage++ % lpl)) {
+	for (group = 0; group < ai->nr_groups; group++) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+		int unit = 0, unit_end = 0;
+
+		BUG_ON(gi->nr_units % upa);
+		for (alloc_end += gi->nr_units / upa;
+		     alloc < alloc_end; alloc++) {
+			if (!(alloc % apl)) {
 				printk("\n");
-				printk("%spcpu-lpage: ", lvl);
-			} else
-				printk("| ");
+				printk("%spcpu-alloc: ", lvl);
+			}
+			printk("[%0*d] ", group_width, group);
+
+			for (unit_end += upa; unit < unit_end; unit++)
+				if (gi->cpu_map[unit] != NR_CPUS)
+					printk("%0*d ", cpu_width,
+					       gi->cpu_map[unit]);
+				else
+					printk("%s ", empty_str);
 		}
-		if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
-			printk("%0*d ", width, cpu);
-		else
-			printk("%s ", empty_str);
 	}
 	printk("\n");
 }
-#endif
 
 /**
  * pcpu_setup_first_chunk - initialize the first percpu chunk
- * @static_size: the size of static percpu area in bytes
- * @reserved_size: the size of reserved percpu area in bytes, 0 for none
- * @dyn_size: free size for dynamic allocation in bytes
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
+ * @ai: pcpu_alloc_info describing how to percpu area is shaped
  * @base_addr: mapped address
- * @unit_map: cpu -> unit map, NULL for sequential mapping
  *
  * Initialize the first percpu chunk which contains the kernel static
  * perpcu area.  This function is to be called from arch percpu area
  * setup path.
  *
- * @reserved_size, if non-zero, specifies the amount of bytes to
+ * @ai contains all information necessary to initialize the first
+ * chunk and prime the dynamic percpu allocator.
+ *
+ * @ai->static_size is the size of static percpu area.
+ *
+ * @ai->reserved_size, if non-zero, specifies the amount of bytes to
  * reserve after the static area in the first chunk.  This reserves
  * the first chunk such that it's available only through reserved
  * percpu allocation.  This is primarily used to serve module percpu
@@ -1424,13 +1528,26 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
  * limited offset range for symbol relocations to guarantee module
  * percpu symbols fall inside the relocatable range.
  *
- * @dyn_size determines the number of bytes available for dynamic
- * allocation in the first chunk.  The area between @static_size +
- * @reserved_size + @dyn_size and @unit_size is unused.
+ * @ai->dyn_size determines the number of bytes available for dynamic
+ * allocation in the first chunk.  The area between @ai->static_size +
+ * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
  *
- * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
- * equal to or larger than @static_size + @reserved_size + if
- * non-negative, @dyn_size.
+ * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
+ * and equal to or larger than @ai->static_size + @ai->reserved_size +
+ * @ai->dyn_size.
+ *
+ * @ai->atom_size is the allocation atom size and used as alignment
+ * for vm areas.
+ *
+ * @ai->alloc_size is the allocation size and always multiple of
+ * @ai->atom_size.  This is larger than @ai->atom_size if
+ * @ai->unit_size is larger than @ai->atom_size.
+ *
+ * @ai->nr_groups and @ai->groups describe virtual memory layout of
+ * percpu areas.  Units which should be colocated are put into the
+ * same group.  Dynamic VM areas will be allocated according to these
+ * groupings.  If @ai->nr_groups is zero, a single group containing
+ * all units is assumed.
  *
  * The caller should have mapped the first chunk at @base_addr and
  * copied static data to each unit.
@@ -1446,70 +1563,63 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
-size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
-				     size_t dyn_size, size_t unit_size,
-				     void *base_addr, const int *unit_map)
+size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+				     void *base_addr)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
-	size_t size_sum = static_size + reserved_size + dyn_size;
+	size_t dyn_size = ai->dyn_size;
+	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
-	unsigned int cpu, tcpu;
-	int i;
+	unsigned int cpu;
+	int *unit_map;
+	int group, unit, i;
 
 	/* sanity checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
 		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
-	BUG_ON(!static_size);
+	BUG_ON(ai->nr_groups <= 0);
+	BUG_ON(!ai->static_size);
 	BUG_ON(!base_addr);
-	BUG_ON(unit_size < size_sum);
-	BUG_ON(unit_size & ~PAGE_MASK);
-	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
+	BUG_ON(ai->unit_size < size_sum);
+	BUG_ON(ai->unit_size & ~PAGE_MASK);
+	BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+
+	pcpu_dump_alloc_info(KERN_DEBUG, ai);
 
 	/* determine number of units and verify and initialize pcpu_unit_map */
-	if (unit_map) {
-		int first_unit = INT_MAX, last_unit = INT_MIN;
-
-		for_each_possible_cpu(cpu) {
-			int unit = unit_map[cpu];
-
-			BUG_ON(unit < 0);
-			for_each_possible_cpu(tcpu) {
-				if (tcpu == cpu)
-					break;
-				/* the mapping should be one-to-one */
-				BUG_ON(unit_map[tcpu] == unit);
-			}
+	unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
 
-			if (unit < first_unit) {
-				pcpu_first_unit_cpu = cpu;
-				first_unit = unit;
-			}
-			if (unit > last_unit) {
-				pcpu_last_unit_cpu = cpu;
-				last_unit = unit;
-			}
-		}
-		pcpu_nr_units = last_unit + 1;
-		pcpu_unit_map = unit_map;
-	} else {
-		int *identity_map;
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+		unit_map[cpu] = NR_CPUS;
+	pcpu_first_unit_cpu = NR_CPUS;
 
-		/* #units == #cpus, identity mapped */
-		identity_map = alloc_bootmem(nr_cpu_ids *
-					     sizeof(identity_map[0]));
+	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
 
-		for_each_possible_cpu(cpu)
-			identity_map[cpu] = cpu;
+		for (i = 0; i < gi->nr_units; i++) {
+			cpu = gi->cpu_map[i];
+			if (cpu == NR_CPUS)
+				continue;
 
-		pcpu_first_unit_cpu = 0;
-		pcpu_last_unit_cpu = pcpu_nr_units - 1;
-		pcpu_nr_units = nr_cpu_ids;
-		pcpu_unit_map = identity_map;
+			BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
+			BUG_ON(unit_map[cpu] != NR_CPUS);
+
+			unit_map[cpu] = unit + i;
+			if (pcpu_first_unit_cpu == NR_CPUS)
+				pcpu_first_unit_cpu = cpu;
+		}
 	}
+	pcpu_last_unit_cpu = cpu;
+	pcpu_nr_units = unit;
+
+	for_each_possible_cpu(cpu)
+		BUG_ON(unit_map[cpu] == NR_CPUS);
+
+	pcpu_unit_map = unit_map;
 
 	/* determine basic parameters */
-	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
+	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
@@ -1543,17 +1653,17 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	schunk->immutable = true;
 	bitmap_fill(schunk->populated, pcpu_unit_pages);
 
-	if (reserved_size) {
-		schunk->free_size = reserved_size;
+	if (ai->reserved_size) {
+		schunk->free_size = ai->reserved_size;
 		pcpu_reserved_chunk = schunk;
-		pcpu_reserved_chunk_limit = static_size + reserved_size;
+		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
 	} else {
 		schunk->free_size = dyn_size;
 		dyn_size = 0;			/* dynamic area covered */
 	}
 	schunk->contig_hint = schunk->free_size;
 
-	schunk->map[schunk->map_used++] = -static_size;
+	schunk->map[schunk->map_used++] = -ai->static_size;
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 
@@ -1643,44 +1753,47 @@ early_param("percpu_alloc", percpu_alloc_setup);
  */
 ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 {
-	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	size_t size_sum, unit_size, chunk_size;
+	struct pcpu_alloc_info *ai;
+	size_t size_sum, chunk_size;
 	void *base;
-	unsigned int cpu;
+	int unit;
+	ssize_t ret;
 
-	/* determine parameters and allocate */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+	ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+	BUG_ON(ai->nr_groups != 1);
+	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
 
-	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-	chunk_size = unit_size * nr_cpu_ids;
+	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+	chunk_size = ai->unit_size * num_possible_cpus();
 
 	base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 				       __pa(MAX_DMA_ADDRESS));
 	if (!base) {
 		pr_warning("PERCPU: failed to allocate %zu bytes for "
 			   "embedding\n", chunk_size);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_free_ai;
 	}
 
 	/* return the leftover and copy */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
-		void *ptr = base + cpu * unit_size;
-
-		if (cpu_possible(cpu)) {
-			free_bootmem(__pa(ptr + size_sum),
-				     unit_size - size_sum);
-			memcpy(ptr, __per_cpu_load, static_size);
-		} else
-			free_bootmem(__pa(ptr), unit_size);
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
+		void *ptr = base + unit * ai->unit_size;
+
+		free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum);
+		memcpy(ptr, __per_cpu_load, ai->static_size);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
-		PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size,
-		unit_size);
+		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+		ai->dyn_size, ai->unit_size);
 
-	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				      unit_size, base, NULL);
+	ret = pcpu_setup_first_chunk(ai, base);
+out_free_ai:
+	pcpu_free_alloc_info(ai);
+	return ret;
 }
 #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
 	  !CONFIG_HAVE_SETUP_PER_CPU_AREA */
@@ -1709,31 +1822,34 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
-	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	ssize_t dyn_size = -1;
-	size_t size_sum, unit_size;
+	struct pcpu_alloc_info *ai;
 	char psize_str[16];
 	int unit_pages;
 	size_t pages_size;
 	struct page **pages;
-	unsigned int cpu;
-	int i, j;
+	int unit, i, j;
 	ssize_t ret;
 
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
-	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-	unit_pages = unit_size >> PAGE_SHIFT;
+	ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+	BUG_ON(ai->nr_groups != 1);
+	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+
+	unit_pages = ai->unit_size >> PAGE_SHIFT;
 
 	/* unaligned allocations can't be freed, round up to page size */
-	pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
+	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+			       sizeof(pages[0]));
 	pages = alloc_bootmem(pages_size);
 
 	/* allocate pages */
 	j = 0;
-	for_each_possible_cpu(cpu)
+	for (unit = 0; unit < num_possible_cpus(); unit++)
 		for (i = 0; i < unit_pages; i++) {
+			unsigned int cpu = ai->groups[0].cpu_map[unit];
 			void *ptr;
 
 			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
@@ -1747,18 +1863,18 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 
 	/* allocate vm area, map the pages and copy static data */
 	vm.flags = VM_ALLOC;
-	vm.size = nr_cpu_ids * unit_size;
+	vm.size = num_possible_cpus() * ai->unit_size;
 	vm_area_register_early(&vm, PAGE_SIZE);
 
-	for_each_possible_cpu(cpu) {
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
 		unsigned long unit_addr =
-			(unsigned long)vm.addr + cpu * unit_size;
+			(unsigned long)vm.addr + unit * ai->unit_size;
 
 		for (i = 0; i < unit_pages; i++)
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 
 		/* pte already populated, the following shouldn't fail */
-		ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+		ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
 				       unit_pages);
 		if (ret < 0)
 			panic("failed to map percpu area, err=%zd\n", ret);
@@ -1772,16 +1888,15 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 		 */
 
 		/* copy static data */
-		memcpy((void *)unit_addr, __per_cpu_load, static_size);
+		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
-		unit_pages, psize_str, vm.addr, static_size, reserved_size,
-		dyn_size);
+		unit_pages, psize_str, vm.addr, ai->static_size,
+		ai->reserved_size, ai->dyn_size);
 
-	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     unit_size, vm.addr, NULL);
+	ret = pcpu_setup_first_chunk(ai, vm.addr);
 	goto out_free_ar;
 
 enomem:
@@ -1790,6 +1905,7 @@ enomem:
 	ret = -ENOMEM;
 out_free_ar:
 	free_bootmem(__pa(pages), pages_size);
+	pcpu_free_alloc_info(ai);
 	return ret;
 }
 #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
@@ -1805,38 +1921,50 @@ static size_t pcpul_lpage_size;
 static int pcpul_nr_lpages;
 static struct pcpul_ent *pcpul_map;
 
-static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai,
 				     unsigned int *cpup)
 {
-	unsigned int cpu;
+	int group, cunit;
 
-	for_each_possible_cpu(cpu)
-		if (unit_map[cpu] == unit) {
+	for (group = 0, cunit = 0; group < ai->nr_groups; group++) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+
+		if (unit < cunit + gi->nr_units) {
 			if (cpup)
-				*cpup = cpu;
+				*cpup = gi->cpu_map[unit - cunit];
 			return true;
 		}
+		cunit += gi->nr_units;
+	}
 
 	return false;
 }
 
+static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai)
+{
+	int group, unit, i;
+
+	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+
+		for (i = 0; i < gi->nr_units; i++)
+			if (gi->cpu_map[i] == cpu)
+				return unit + i;
+	}
+	BUG();
+}
+
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
- * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes
- * @unit_size: unit size in bytes
- * @lpage_size: the size of a large page
- * @unit_map: cpu -> unit mapping
- * @nr_units: the number of units
+ * @ai: pcpu_alloc_info
  * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
  * @free_fn: function to free percpu memory, @size <= lpage_size
  * @map_fn: function to map percpu lpage, always called with lpage_size
  *
  * This allocator uses large page to build and map the first chunk.
- * Unlike other helpers, the caller should always specify @dyn_size
- * and @unit_size.  These parameters along with @unit_map and
- * @nr_units can be determined using pcpu_lpage_build_unit_map().
- * This two stage initialization is to allow arch code to evaluate the
+ * Unlike other helpers, the caller should provide fully initialized
+ * @ai.  This can be done using pcpu_build_alloc_info().  This two
+ * stage initialization is to allow arch code to evaluate the
  * parameters before committing to it.
  *
  * Large pages are allocated as directed by @unit_map and other
@@ -1852,27 +1980,26 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
-ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
-				      size_t unit_size, size_t lpage_size,
-				      const int *unit_map, int nr_units,
+ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 				      pcpu_fc_alloc_fn_t alloc_fn,
 				      pcpu_fc_free_fn_t free_fn,
 				      pcpu_fc_map_fn_t map_fn)
 {
 	static struct vm_struct vm;
-	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	size_t chunk_size = unit_size * nr_units;
-	size_t map_size;
+	const size_t lpage_size = ai->atom_size;
+	size_t chunk_size, map_size;
 	unsigned int cpu;
 	ssize_t ret;
-	int i, j, unit;
+	int i, j, unit, nr_units;
 
-	pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
-			     unit_size, lpage_size, unit_map, nr_units);
+	nr_units = 0;
+	for (i = 0; i < ai->nr_groups; i++)
+		nr_units += ai->groups[i].nr_units;
 
+	chunk_size = ai->unit_size * nr_units;
 	BUG_ON(chunk_size % lpage_size);
 
-	pcpul_size = static_size + reserved_size + dyn_size;
+	pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size;
 	pcpul_lpage_size = lpage_size;
 	pcpul_nr_lpages = chunk_size / lpage_size;
 
@@ -1883,13 +2010,13 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 	/* allocate all pages */
 	for (i = 0; i < pcpul_nr_lpages; i++) {
 		size_t offset = i * lpage_size;
-		int first_unit = offset / unit_size;
-		int last_unit = (offset + lpage_size - 1) / unit_size;
+		int first_unit = offset / ai->unit_size;
+		int last_unit = (offset + lpage_size - 1) / ai->unit_size;
 		void *ptr;
 
 		/* find out which cpu is mapped to this unit */
 		for (unit = first_unit; unit <= last_unit; unit++)
-			if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+			if (pcpul_unit_to_cpu(unit, ai, &cpu))
 				goto found;
 		continue;
 	found:
@@ -1905,12 +2032,12 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 
 	/* return unused holes */
 	for (unit = 0; unit < nr_units; unit++) {
-		size_t start = unit * unit_size;
-		size_t end = start + unit_size;
+		size_t start = unit * ai->unit_size;
+		size_t end = start + ai->unit_size;
 		size_t off, next;
 
 		/* don't free used part of occupied unit */
-		if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+		if (pcpul_unit_to_cpu(unit, ai, NULL))
 			start += pcpul_size;
 
 		/* unit can span more than one page, punch the holes */
@@ -1925,7 +2052,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 	/* allocate address, map and copy */
 	vm.flags = VM_ALLOC;
 	vm.size = chunk_size;
-	vm_area_register_early(&vm, unit_size);
+	vm_area_register_early(&vm, ai->unit_size);
 
 	for (i = 0; i < pcpul_nr_lpages; i++) {
 		if (!pcpul_map[i].ptr)
@@ -1935,15 +2062,15 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 	}
 
 	for_each_possible_cpu(cpu)
-		memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
-		       static_size);
+		memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size,
+		       __per_cpu_load, ai->static_size);
 
 	/* we're ready, commit */
 	pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n",
-		vm.addr, static_size, reserved_size, dyn_size, unit_size);
+		vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size,
+		ai->unit_size);
 
-	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     unit_size, vm.addr, unit_map);
+	ret = pcpu_setup_first_chunk(ai, vm.addr);
 
 	/*
 	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
-- 
cgit v1.2.3-71-gd317


From fb435d5233f8b6f9b93c11d6304d8e98fed03234 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:51 +0900
Subject: percpu: add pcpu_unit_offsets[]

Currently units are mapped sequentially into address space.  This
patch adds pcpu_unit_offsets[] which allows units to be mapped to
arbitrary offsets from the chunk base address.  This is necessary to
allow sparse embedding which might would need to allocate address
ranges and memory areas which aren't aligned to unit size but
allocation atom size (page or large page size).  This also simplifies
things a bit by removing the need to calculate offset from unit
number.

With this change, there's no need for the arch code to know
pcpu_unit_size.  Update pcpu_setup_first_chunk() and first chunk
allocators to return regular 0 or -errno return code instead of unit
size or -errno.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c     | 12 +++---
 arch/x86/kernel/setup_percpu.c | 51 ++++++++++-------------
 include/linux/percpu.h         | 16 ++++---
 mm/percpu.c                    | 95 +++++++++++++++++++++---------------------
 4 files changed, 84 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index a42a4a744d14..b03fd362c629 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1478,9 +1478,10 @@ void __init setup_per_cpu_areas(void)
 	static struct vm_struct vm;
 	struct pcpu_alloc_info *ai;
 	unsigned long delta, cpu;
-	size_t size_sum, pcpu_unit_size;
+	size_t size_sum;
 	size_t ptrs_size;
 	void **ptrs;
+	int rc;
 
 	ai = pcpu_alloc_alloc_info(1, nr_cpu_ids);
 
@@ -1526,14 +1527,15 @@ void __init setup_per_cpu_areas(void)
 		pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
 	}
 
-	pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr);
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
+	if (rc)
+		panic("failed to setup percpu first chunk (%d)", rc);
 
 	free_bootmem(__pa(ptrs), ptrs_size);
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
-	for_each_possible_cpu(cpu) {
-		__per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
-	}
+	for_each_possible_cpu(cpu)
+		__per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
 
 	/* Setup %g5 for the boot cpu.  */
 	__local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index db5f9c49fec5..9becc5d4b518 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -157,12 +157,12 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
 		return REMOTE_DISTANCE;
 }
 
-static ssize_t __init setup_pcpu_lpage(bool chosen)
+static int __init setup_pcpu_lpage(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
 	struct pcpu_alloc_info *ai;
-	ssize_t ret;
+	int rc;
 
 	/* on non-NUMA, embedding is better */
 	if (!chosen && !pcpu_need_numa())
@@ -196,19 +196,18 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 		if (tot_size > vm_size / 5) {
 			pr_info("PERCPU: too large chunk size %zuMB for "
 				"large page remap\n", tot_size >> 20);
-			ret = -EINVAL;
+			rc = -EINVAL;
 			goto out_free;
 		}
 	}
 
-	ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free,
-				     pcpul_map);
+	rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 out_free:
 	pcpu_free_alloc_info(ai);
-	return ret;
+	return rc;
 }
 #else
-static ssize_t __init setup_pcpu_lpage(bool chosen)
+static int __init setup_pcpu_lpage(bool chosen)
 {
 	return -EINVAL;
 }
@@ -222,7 +221,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
  * mapping so that it can use PMD mapping without additional TLB
  * pressure.
  */
-static ssize_t __init setup_pcpu_embed(bool chosen)
+static int __init setup_pcpu_embed(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 
@@ -250,7 +249,7 @@ static void __init pcpup_populate_pte(unsigned long addr)
 	populate_extra_pte(addr);
 }
 
-static ssize_t __init setup_pcpu_page(void)
+static int __init setup_pcpu_page(void)
 {
 	return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
 				     pcpu_fc_alloc, pcpu_fc_free,
@@ -274,8 +273,7 @@ void __init setup_per_cpu_areas(void)
 {
 	unsigned int cpu;
 	unsigned long delta;
-	size_t pcpu_unit_size;
-	ssize_t ret;
+	int rc;
 
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
@@ -285,36 +283,33 @@ void __init setup_per_cpu_areas(void)
 	 * of large page mappings.  Please read comments on top of
 	 * each allocator for details.
 	 */
-	ret = -EINVAL;
+	rc = -EINVAL;
 	if (pcpu_chosen_fc != PCPU_FC_AUTO) {
 		if (pcpu_chosen_fc != PCPU_FC_PAGE) {
 			if (pcpu_chosen_fc == PCPU_FC_LPAGE)
-				ret = setup_pcpu_lpage(true);
+				rc = setup_pcpu_lpage(true);
 			else
-				ret = setup_pcpu_embed(true);
+				rc = setup_pcpu_embed(true);
 
-			if (ret < 0)
-				pr_warning("PERCPU: %s allocator failed (%zd), "
+			if (rc < 0)
+				pr_warning("PERCPU: %s allocator failed (%d), "
 					   "falling back to page size\n",
-					   pcpu_fc_names[pcpu_chosen_fc], ret);
+					   pcpu_fc_names[pcpu_chosen_fc], rc);
 		}
 	} else {
-		ret = setup_pcpu_lpage(false);
-		if (ret < 0)
-			ret = setup_pcpu_embed(false);
+		rc = setup_pcpu_lpage(false);
+		if (rc < 0)
+			rc = setup_pcpu_embed(false);
 	}
-	if (ret < 0)
-		ret = setup_pcpu_page();
-	if (ret < 0)
-		panic("cannot initialize percpu area (err=%zd)", ret);
-
-	pcpu_unit_size = ret;
+	if (rc < 0)
+		rc = setup_pcpu_page();
+	if (rc < 0)
+		panic("cannot initialize percpu area (err=%d)", rc);
 
 	/* alrighty, percpu areas up and running */
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
-		per_cpu_offset(cpu) =
-			delta + pcpu_unit_map[cpu] * pcpu_unit_size;
+		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 77b86be8ce4f..a7ec840f596c 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -57,7 +57,7 @@
 #endif
 
 extern void *pcpu_base_addr;
-extern const int *pcpu_unit_map;
+extern const unsigned long *pcpu_unit_offsets;
 
 struct pcpu_group_info {
 	int			nr_units;	/* aligned # of units */
@@ -106,25 +106,23 @@ extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
 
-extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
-					    void *base_addr);
+extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+					 void *base_addr);
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
-extern ssize_t __init pcpu_embed_first_chunk(
-				size_t reserved_size, ssize_t dyn_size);
+extern int __init pcpu_embed_first_chunk(size_t reserved_size,
+					 ssize_t dyn_size);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
-extern ssize_t __init pcpu_page_first_chunk(
-				size_t reserved_size,
+extern int __init pcpu_page_first_chunk(size_t reserved_size,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-extern ssize_t __init pcpu_lpage_first_chunk(
-				const struct pcpu_alloc_info *ai,
+extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn);
diff --git a/mm/percpu.c b/mm/percpu.c
index 99f7fa682722..653b02c40200 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -117,8 +117,8 @@ static unsigned int pcpu_last_unit_cpu __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-/* cpu -> unit map */
-const int *pcpu_unit_map __read_mostly;
+static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */
 
 /*
  * The first chunk which always exists.  Note that unlike other
@@ -196,8 +196,8 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx)
 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 				     unsigned int cpu, int page_idx)
 {
-	return (unsigned long)chunk->vm->addr +
-		(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
+	return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] +
+		(page_idx << PAGE_SHIFT);
 }
 
 static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
@@ -341,7 +341,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 	 * space.  Note that any possible cpu id can be used here, so
 	 * there's no need to worry about preemption or cpu hotplug.
 	 */
-	addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
+	addr += pcpu_unit_offsets[smp_processor_id()];
 	return pcpu_get_page_chunk(vmalloc_to_page(addr));
 }
 
@@ -1560,17 +1560,17 @@ static void pcpu_dump_alloc_info(const char *lvl,
  * and available for dynamic allocation like any other chunks.
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access.
+ * 0 on success, -errno on failure.
  */
-size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
-				     void *base_addr)
+int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+				  void *base_addr)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
 	size_t dyn_size = ai->dyn_size;
 	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
+	unsigned long *unit_off;
 	unsigned int cpu;
 	int *unit_map;
 	int group, unit, i;
@@ -1587,8 +1587,9 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 	pcpu_dump_alloc_info(KERN_DEBUG, ai);
 
-	/* determine number of units and verify and initialize pcpu_unit_map */
+	/* determine number of units and initialize unit_map and base */
 	unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
+	unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
 
 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		unit_map[cpu] = NR_CPUS;
@@ -1606,6 +1607,8 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 			BUG_ON(unit_map[cpu] != NR_CPUS);
 
 			unit_map[cpu] = unit + i;
+			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
+
 			if (pcpu_first_unit_cpu == NR_CPUS)
 				pcpu_first_unit_cpu = cpu;
 		}
@@ -1617,6 +1620,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 		BUG_ON(unit_map[cpu] == NR_CPUS);
 
 	pcpu_unit_map = unit_map;
+	pcpu_unit_offsets = unit_off;
 
 	/* determine basic parameters */
 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
@@ -1688,7 +1692,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 	/* we're done */
 	pcpu_base_addr = schunk->vm->addr;
-	return pcpu_unit_size;
+	return 0;
 }
 
 const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
@@ -1748,16 +1752,15 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * size, the leftover is returned to the bootmem allocator.
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access on success, -errno on failure.
+ * 0 on success, -errno on failure.
  */
-ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
+int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 {
 	struct pcpu_alloc_info *ai;
 	size_t size_sum, chunk_size;
 	void *base;
 	int unit;
-	ssize_t ret;
+	int rc;
 
 	ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL);
 	if (IS_ERR(ai))
@@ -1773,7 +1776,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 	if (!base) {
 		pr_warning("PERCPU: failed to allocate %zu bytes for "
 			   "embedding\n", chunk_size);
-		ret = -ENOMEM;
+		rc = -ENOMEM;
 		goto out_free_ai;
 	}
 
@@ -1790,10 +1793,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
 		ai->dyn_size, ai->unit_size);
 
-	ret = pcpu_setup_first_chunk(ai, base);
+	rc = pcpu_setup_first_chunk(ai, base);
 out_free_ai:
 	pcpu_free_alloc_info(ai);
-	return ret;
+	return rc;
 }
 #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
 	  !CONFIG_HAVE_SETUP_PER_CPU_AREA */
@@ -1813,13 +1816,12 @@ out_free_ai:
  * page-by-page into vmalloc area.
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access on success, -errno on failure.
+ * 0 on success, -errno on failure.
  */
-ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
-				     pcpu_fc_alloc_fn_t alloc_fn,
-				     pcpu_fc_free_fn_t free_fn,
-				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
+int __init pcpu_page_first_chunk(size_t reserved_size,
+				 pcpu_fc_alloc_fn_t alloc_fn,
+				 pcpu_fc_free_fn_t free_fn,
+				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
 	struct pcpu_alloc_info *ai;
@@ -1827,8 +1829,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 	int unit_pages;
 	size_t pages_size;
 	struct page **pages;
-	int unit, i, j;
-	ssize_t ret;
+	int unit, i, j, rc;
 
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 
@@ -1874,10 +1875,10 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 
 		/* pte already populated, the following shouldn't fail */
-		ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
-				       unit_pages);
-		if (ret < 0)
-			panic("failed to map percpu area, err=%zd\n", ret);
+		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
+				      unit_pages);
+		if (rc < 0)
+			panic("failed to map percpu area, err=%d\n", rc);
 
 		/*
 		 * FIXME: Archs with virtual cache should flush local
@@ -1896,17 +1897,17 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 		unit_pages, psize_str, vm.addr, ai->static_size,
 		ai->reserved_size, ai->dyn_size);
 
-	ret = pcpu_setup_first_chunk(ai, vm.addr);
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
 	goto out_free_ar;
 
 enomem:
 	while (--j >= 0)
 		free_fn(page_address(pages[j]), PAGE_SIZE);
-	ret = -ENOMEM;
+	rc = -ENOMEM;
 out_free_ar:
 	free_bootmem(__pa(pages), pages_size);
 	pcpu_free_alloc_info(ai);
-	return ret;
+	return rc;
 }
 #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
 
@@ -1977,20 +1978,18 @@ static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai)
  * pcpu_lpage_remapped().
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access on success, -errno on failure.
+ * 0 on success, -errno on failure.
  */
-ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
-				      pcpu_fc_alloc_fn_t alloc_fn,
-				      pcpu_fc_free_fn_t free_fn,
-				      pcpu_fc_map_fn_t map_fn)
+int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
+				  pcpu_fc_alloc_fn_t alloc_fn,
+				  pcpu_fc_free_fn_t free_fn,
+				  pcpu_fc_map_fn_t map_fn)
 {
 	static struct vm_struct vm;
 	const size_t lpage_size = ai->atom_size;
 	size_t chunk_size, map_size;
 	unsigned int cpu;
-	ssize_t ret;
-	int i, j, unit, nr_units;
+	int i, j, unit, nr_units, rc;
 
 	nr_units = 0;
 	for (i = 0; i < ai->nr_groups; i++)
@@ -2070,7 +2069,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 		vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size,
 		ai->unit_size);
 
-	ret = pcpu_setup_first_chunk(ai, vm.addr);
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
 
 	/*
 	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
@@ -2094,7 +2093,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
 		pcpul_nr_lpages--;
 
-	return ret;
+	return rc;
 
 enomem:
 	for (i = 0; i < pcpul_nr_lpages; i++)
@@ -2166,21 +2165,21 @@ EXPORT_SYMBOL(__per_cpu_offset);
 
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t unit_size;
 	unsigned long delta;
 	unsigned int cpu;
+	int rc;
 
 	/*
 	 * Always reserve area for module percpu variables.  That's
 	 * what the legacy allocator did.
 	 */
-	unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
-					   PERCPU_DYNAMIC_RESERVE);
-	if (unit_size < 0)
+	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+				    PERCPU_DYNAMIC_RESERVE);
+	if (rc < 0)
 		panic("Failed to initialized percpu areas.");
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu)
-		__per_cpu_offset[cpu] = delta + cpu * unit_size;
+		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
 }
 #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
-- 
cgit v1.2.3-71-gd317


From ca23e405e06d5fffb005df004c72781f76062f51 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:52 +0900
Subject: vmalloc: implement pcpu_get_vm_areas()

To directly use spread NUMA memories for percpu units, percpu
allocator will be updated to allow sparsely mapping units in a chunk.
As the distances between units can be very large, this makes
allocating single vmap area for each chunk undesirable.  This patch
implements pcpu_get_vm_areas() and pcpu_free_vm_areas() which
allocates and frees sparse congruent vmap areas.

pcpu_get_vm_areas() take @offsets and @sizes array which define
distances and sizes of vmap areas.  It scans down from the top of
vmalloc area looking for the top-most address which can accomodate all
the areas.  The top-down scan is to avoid interacting with regular
vmallocs which can push up these congruent areas up little by little
ending up wasting address space and page table.

To speed up top-down scan, the highest possible address hint is
maintained.  Although the scan is linear from the hint, given the
usual large holes between memory addresses between NUMA nodes, the
scanning is highly likely to finish after finding the first hole for
the last unit which is scanned first.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Nick Piggin <npiggin@suse.de>
---
 include/linux/vmalloc.h |   6 +
 mm/vmalloc.c            | 293 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 299 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index a43ebec3a7b9..227c2a585e4f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -115,4 +115,10 @@ extern rwlock_t vmlist_lock;
 extern struct vm_struct *vmlist;
 extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 
+struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
+				     const size_t *sizes, int nr_vms,
+				     size_t align, gfp_t gfp_mask);
+
+void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
+
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2eb461c3a46e..204b8243d8ab 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -265,6 +265,7 @@ struct vmap_area {
 static DEFINE_SPINLOCK(vmap_area_lock);
 static struct rb_root vmap_area_root = RB_ROOT;
 static LIST_HEAD(vmap_area_list);
+static unsigned long vmap_area_pcpu_hole;
 
 static struct vmap_area *__find_vmap_area(unsigned long addr)
 {
@@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va)
 	RB_CLEAR_NODE(&va->rb_node);
 	list_del_rcu(&va->list);
 
+	/*
+	 * Track the highest possible candidate for pcpu area
+	 * allocation.  Areas outside of vmalloc area can be returned
+	 * here too, consider only end addresses which fall inside
+	 * vmalloc area proper.
+	 */
+	if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
+		vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
+
 	call_rcu(&va->rcu_head, rcu_free_va);
 }
 
@@ -1038,6 +1048,9 @@ void __init vmalloc_init(void)
 		va->va_end = va->va_start + tmp->size;
 		__insert_vmap_area(va);
 	}
+
+	vmap_area_pcpu_hole = VMALLOC_END;
+
 	vmap_initialized = true;
 }
 
@@ -1821,6 +1834,286 @@ void free_vm_area(struct vm_struct *area)
 }
 EXPORT_SYMBOL_GPL(free_vm_area);
 
+static struct vmap_area *node_to_va(struct rb_node *n)
+{
+	return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
+}
+
+/**
+ * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
+ * @end: target address
+ * @pnext: out arg for the next vmap_area
+ * @pprev: out arg for the previous vmap_area
+ *
+ * Returns: %true if either or both of next and prev are found,
+ *	    %false if no vmap_area exists
+ *
+ * Find vmap_areas end addresses of which enclose @end.  ie. if not
+ * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
+ */
+static bool pvm_find_next_prev(unsigned long end,
+			       struct vmap_area **pnext,
+			       struct vmap_area **pprev)
+{
+	struct rb_node *n = vmap_area_root.rb_node;
+	struct vmap_area *va = NULL;
+
+	while (n) {
+		va = rb_entry(n, struct vmap_area, rb_node);
+		if (end < va->va_end)
+			n = n->rb_left;
+		else if (end > va->va_end)
+			n = n->rb_right;
+		else
+			break;
+	}
+
+	if (!va)
+		return false;
+
+	if (va->va_end > end) {
+		*pnext = va;
+		*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+	} else {
+		*pprev = va;
+		*pnext = node_to_va(rb_next(&(*pprev)->rb_node));
+	}
+	return true;
+}
+
+/**
+ * pvm_determine_end - find the highest aligned address between two vmap_areas
+ * @pnext: in/out arg for the next vmap_area
+ * @pprev: in/out arg for the previous vmap_area
+ * @align: alignment
+ *
+ * Returns: determined end address
+ *
+ * Find the highest aligned address between *@pnext and *@pprev below
+ * VMALLOC_END.  *@pnext and *@pprev are adjusted so that the aligned
+ * down address is between the end addresses of the two vmap_areas.
+ *
+ * Please note that the address returned by this function may fall
+ * inside *@pnext vmap_area.  The caller is responsible for checking
+ * that.
+ */
+static unsigned long pvm_determine_end(struct vmap_area **pnext,
+				       struct vmap_area **pprev,
+				       unsigned long align)
+{
+	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	unsigned long addr;
+
+	if (*pnext)
+		addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
+	else
+		addr = vmalloc_end;
+
+	while (*pprev && (*pprev)->va_end > addr) {
+		*pnext = *pprev;
+		*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+	}
+
+	return addr;
+}
+
+/**
+ * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
+ * @offsets: array containing offset of each area
+ * @sizes: array containing size of each area
+ * @nr_vms: the number of areas to allocate
+ * @align: alignment, all entries in @offsets and @sizes must be aligned to this
+ * @gfp_mask: allocation mask
+ *
+ * Returns: kmalloc'd vm_struct pointer array pointing to allocated
+ *	    vm_structs on success, %NULL on failure
+ *
+ * Percpu allocator wants to use congruent vm areas so that it can
+ * maintain the offsets among percpu areas.  This function allocates
+ * congruent vmalloc areas for it.  These areas tend to be scattered
+ * pretty far, distance between two areas easily going up to
+ * gigabytes.  To avoid interacting with regular vmallocs, these areas
+ * are allocated from top.
+ *
+ * Despite its complicated look, this allocator is rather simple.  It
+ * does everything top-down and scans areas from the end looking for
+ * matching slot.  While scanning, if any of the areas overlaps with
+ * existing vmap_area, the base address is pulled down to fit the
+ * area.  Scanning is repeated till all the areas fit and then all
+ * necessary data structres are inserted and the result is returned.
+ */
+struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
+				     const size_t *sizes, int nr_vms,
+				     size_t align, gfp_t gfp_mask)
+{
+	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
+	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	struct vmap_area **vas, *prev, *next;
+	struct vm_struct **vms;
+	int area, area2, last_area, term_area;
+	unsigned long base, start, end, last_end;
+	bool purged = false;
+
+	gfp_mask &= GFP_RECLAIM_MASK;
+
+	/* verify parameters and allocate data structures */
+	BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+	for (last_area = 0, area = 0; area < nr_vms; area++) {
+		start = offsets[area];
+		end = start + sizes[area];
+
+		/* is everything aligned properly? */
+		BUG_ON(!IS_ALIGNED(offsets[area], align));
+		BUG_ON(!IS_ALIGNED(sizes[area], align));
+
+		/* detect the area with the highest address */
+		if (start > offsets[last_area])
+			last_area = area;
+
+		for (area2 = 0; area2 < nr_vms; area2++) {
+			unsigned long start2 = offsets[area2];
+			unsigned long end2 = start2 + sizes[area2];
+
+			if (area2 == area)
+				continue;
+
+			BUG_ON(start2 >= start && start2 < end);
+			BUG_ON(end2 <= end && end2 > start);
+		}
+	}
+	last_end = offsets[last_area] + sizes[last_area];
+
+	if (vmalloc_end - vmalloc_start < last_end) {
+		WARN_ON(true);
+		return NULL;
+	}
+
+	vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
+	vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
+	if (!vas || !vms)
+		goto err_free;
+
+	for (area = 0; area < nr_vms; area++) {
+		vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
+		vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
+		if (!vas[area] || !vms[area])
+			goto err_free;
+	}
+retry:
+	spin_lock(&vmap_area_lock);
+
+	/* start scanning - we scan from the top, begin with the last area */
+	area = term_area = last_area;
+	start = offsets[area];
+	end = start + sizes[area];
+
+	if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
+		base = vmalloc_end - last_end;
+		goto found;
+	}
+	base = pvm_determine_end(&next, &prev, align) - end;
+
+	while (true) {
+		BUG_ON(next && next->va_end <= base + end);
+		BUG_ON(prev && prev->va_end > base + end);
+
+		/*
+		 * base might have underflowed, add last_end before
+		 * comparing.
+		 */
+		if (base + last_end < vmalloc_start + last_end) {
+			spin_unlock(&vmap_area_lock);
+			if (!purged) {
+				purge_vmap_area_lazy();
+				purged = true;
+				goto retry;
+			}
+			goto err_free;
+		}
+
+		/*
+		 * If next overlaps, move base downwards so that it's
+		 * right below next and then recheck.
+		 */
+		if (next && next->va_start < base + end) {
+			base = pvm_determine_end(&next, &prev, align) - end;
+			term_area = area;
+			continue;
+		}
+
+		/*
+		 * If prev overlaps, shift down next and prev and move
+		 * base so that it's right below new next and then
+		 * recheck.
+		 */
+		if (prev && prev->va_end > base + start)  {
+			next = prev;
+			prev = node_to_va(rb_prev(&next->rb_node));
+			base = pvm_determine_end(&next, &prev, align) - end;
+			term_area = area;
+			continue;
+		}
+
+		/*
+		 * This area fits, move on to the previous one.  If
+		 * the previous one is the terminal one, we're done.
+		 */
+		area = (area + nr_vms - 1) % nr_vms;
+		if (area == term_area)
+			break;
+		start = offsets[area];
+		end = start + sizes[area];
+		pvm_find_next_prev(base + end, &next, &prev);
+	}
+found:
+	/* we've found a fitting base, insert all va's */
+	for (area = 0; area < nr_vms; area++) {
+		struct vmap_area *va = vas[area];
+
+		va->va_start = base + offsets[area];
+		va->va_end = va->va_start + sizes[area];
+		__insert_vmap_area(va);
+	}
+
+	vmap_area_pcpu_hole = base + offsets[last_area];
+
+	spin_unlock(&vmap_area_lock);
+
+	/* insert all vm's */
+	for (area = 0; area < nr_vms; area++)
+		insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+				  pcpu_get_vm_areas);
+
+	kfree(vas);
+	return vms;
+
+err_free:
+	for (area = 0; area < nr_vms; area++) {
+		if (vas)
+			kfree(vas[area]);
+		if (vms)
+			kfree(vms[area]);
+	}
+	kfree(vas);
+	kfree(vms);
+	return NULL;
+}
+
+/**
+ * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
+ * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
+ * @nr_vms: the number of allocated areas
+ *
+ * Free vm_structs and the array allocated by pcpu_get_vm_areas().
+ */
+void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
+{
+	int i;
+
+	for (i = 0; i < nr_vms; i++)
+		free_vm_area(vms[i]);
+	kfree(vms);
+}
 
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
-- 
cgit v1.2.3-71-gd317


From c8826dd538602d730ed2c18c6753f1bbfa6c4933 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:52 +0900
Subject: percpu: update embedding first chunk allocator to handle sparse units

Now that percpu core can handle very sparse units, given that vmalloc
space is large enough, embedding first chunk allocator can use any
memory to build the first chunk.  This patch teaches
pcpu_embed_first_chunk() about distances between cpus and to use
alloc/free callbacks to allocate node specific areas for each group
and use them for the first chunk.

This brings the benefits of embedding allocator to NUMA configurations
- no extra TLB pressure with the flexibility of unified dynamic
allocator and no need to restructure arch code to build memory layout
suitable for percpu.  With units put into atom_size aligned groups
according to cpu distances, using large page for dynamic chunks is
also easily possible with falling back to reuglar pages if large
allocation fails.

Embedding allocator users are converted to specify NULL
cpu_distance_fn, so this patch doesn't cause any visible behavior
difference.  Following patches will convert them.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/kernel/setup_percpu.c |   4 +-
 include/linux/percpu.h         |   7 ++-
 mm/percpu.c                    | 113 +++++++++++++++++++++++++++++++----------
 3 files changed, 93 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9becc5d4b518..67f6314de9f1 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -234,7 +234,9 @@ static int __init setup_pcpu_embed(bool chosen)
 		return -EINVAL;
 
 	return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
-				      reserve - PERCPU_FIRST_CHUNK_RESERVE);
+				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
+				      PAGE_SIZE, NULL, pcpu_fc_alloc,
+				      pcpu_fc_free);
 }
 
 /*
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a7ec840f596c..25359932740e 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -110,8 +110,11 @@ extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 					 void *base_addr);
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
-extern int __init pcpu_embed_first_chunk(size_t reserved_size,
-					 ssize_t dyn_size);
+extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+				size_t atom_size,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
diff --git a/mm/percpu.c b/mm/percpu.c
index cc9c4c64606d..c2826d05505c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1747,15 +1747,25 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ * @alloc_fn: function to allocate percpu page
+ * @free_fn: funtion to free percpu page
  *
  * This is a helper to ease setting up embedded first percpu chunk and
  * can be called where pcpu_setup_first_chunk() is expected.
  *
  * If this function is used to setup the first chunk, it is allocated
- * as a contiguous area using bootmem allocator and used as-is without
- * being mapped into vmalloc area.  This enables the first chunk to
- * piggy back on the linear physical mapping which often uses larger
- * page size.
+ * by calling @alloc_fn and used as-is without being mapped into
+ * vmalloc area.  Allocations are always whole multiples of @atom_size
+ * aligned to @atom_size.
+ *
+ * This enables the first chunk to piggy back on the linear physical
+ * mapping which often uses larger page size.  Please note that this
+ * can result in very sparse cpu->unit mapping on NUMA machines thus
+ * requiring large vmalloc address space.  Don't use this allocator if
+ * vmalloc space is not orders of magnitude larger than distances
+ * between node memory addresses (ie. 32bit NUMA machines).
  *
  * When @dyn_size is positive, dynamic area might be larger than
  * specified to fill page alignment.  When @dyn_size is auto,
@@ -1763,53 +1773,88 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * and reserved areas.
  *
  * If the needed size is smaller than the minimum or specified unit
- * size, the leftover is returned to the bootmem allocator.
+ * size, the leftover is returned using @free_fn.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
+int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+				  size_t atom_size,
+				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+				  pcpu_fc_alloc_fn_t alloc_fn,
+				  pcpu_fc_free_fn_t free_fn)
 {
+	void *base = (void *)ULONG_MAX;
+	void **areas = NULL;
 	struct pcpu_alloc_info *ai;
-	size_t size_sum, chunk_size;
-	void *base;
-	int unit;
-	int rc;
+	size_t size_sum, areas_size;
+	int group, i, rc;
 
-	ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL);
+	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
+				   cpu_distance_fn);
 	if (IS_ERR(ai))
 		return PTR_ERR(ai);
-	BUG_ON(ai->nr_groups != 1);
-	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
 
 	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
-	chunk_size = ai->unit_size * num_possible_cpus();
+	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
 
-	base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
-				       __pa(MAX_DMA_ADDRESS));
-	if (!base) {
-		pr_warning("PERCPU: failed to allocate %zu bytes for "
-			   "embedding\n", chunk_size);
+	areas = alloc_bootmem_nopanic(areas_size);
+	if (!areas) {
 		rc = -ENOMEM;
-		goto out_free_ai;
+		goto out_free;
 	}
 
-	/* return the leftover and copy */
-	for (unit = 0; unit < num_possible_cpus(); unit++) {
-		void *ptr = base + unit * ai->unit_size;
+	/* allocate, copy and determine base address */
+	for (group = 0; group < ai->nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+		unsigned int cpu = NR_CPUS;
+		void *ptr;
+
+		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
+			cpu = gi->cpu_map[i];
+		BUG_ON(cpu == NR_CPUS);
+
+		/* allocate space for the whole group */
+		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+		if (!ptr) {
+			rc = -ENOMEM;
+			goto out_free_areas;
+		}
+		areas[group] = ptr;
 
-		free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum);
-		memcpy(ptr, __per_cpu_load, ai->static_size);
+		base = min(ptr, base);
+
+		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
+			if (gi->cpu_map[i] == NR_CPUS) {
+				/* unused unit, free whole */
+				free_fn(ptr, ai->unit_size);
+				continue;
+			}
+			/* copy and return the unused part */
+			memcpy(ptr, __per_cpu_load, ai->static_size);
+			free_fn(ptr + size_sum, ai->unit_size - size_sum);
+		}
 	}
 
-	/* we're ready, commit */
+	/* base address is now known, determine group base offsets */
+	for (group = 0; group < ai->nr_groups; group++)
+		ai->groups[group].base_offset = areas[group] - base;
+
 	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
 		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
 		ai->dyn_size, ai->unit_size);
 
 	rc = pcpu_setup_first_chunk(ai, base);
-out_free_ai:
+	goto out_free;
+
+out_free_areas:
+	for (group = 0; group < ai->nr_groups; group++)
+		free_fn(areas[group],
+			ai->groups[group].nr_units * ai->unit_size);
+out_free:
 	pcpu_free_alloc_info(ai);
+	if (areas)
+		free_bootmem(__pa(areas), areas_size);
 	return rc;
 }
 #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
@@ -2177,6 +2222,17 @@ void *pcpu_lpage_remapped(void *kaddr)
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
 
+static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
+				       size_t align)
+{
+	return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
+}
+
+static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
+{
+	free_bootmem(__pa(ptr), size);
+}
+
 void __init setup_per_cpu_areas(void)
 {
 	unsigned long delta;
@@ -2188,7 +2244,8 @@ void __init setup_per_cpu_areas(void)
 	 * what the legacy allocator did.
 	 */
 	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
-				    PERCPU_DYNAMIC_RESERVE);
+				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
+				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
 	if (rc < 0)
 		panic("Failed to initialized percpu areas.");
 
-- 
cgit v1.2.3-71-gd317


From e933a73f48e3b2d40cfa56d81e2646f194b5a66a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:53 +0900
Subject: percpu: kill lpage first chunk allocator

With x86 converted to embedding allocator, lpage doesn't have any user
left.  Kill it along with cpa handling code.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jan Beulich <JBeulich@novell.com>
---
 Documentation/kernel-parameters.txt |  10 +-
 arch/x86/mm/pageattr.c              |  20 +--
 include/linux/percpu.h              |  16 ---
 mm/percpu.c                         | 241 ------------------------------------
 4 files changed, 6 insertions(+), 281 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index dee9ce2e6cfa..e710093e3d32 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1920,11 +1920,11 @@ and is between 256 and 4096 characters. It is defined in the file
 			See arch/parisc/kernel/pdc_chassis.c
 
 	percpu_alloc=	Select which percpu first chunk allocator to use.
-			Currently supported values are "embed", "page" and
-			"lpage".  Archs may support subset or none of the
-			selections.  See comments in mm/percpu.c for details
-			on each allocator.  This parameter is primarily	for
-			debugging and performance comparison.
+			Currently supported values are "embed" and "page".
+			Archs may support subset or none of the	selections.
+			See comments in mm/percpu.c for details on each
+			allocator.  This parameter is primarily	for debugging
+			and performance comparison.
 
 	pf.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dce282f65700..f53cfc7f963d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -687,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 {
 	struct cpa_data alias_cpa;
 	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
-	unsigned long vaddr, remapped;
+	unsigned long vaddr;
 	int ret;
 
 	if (cpa->pfn >= max_pfn_mapped)
@@ -745,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	}
 #endif
 
-	/*
-	 * If the PMD page was partially used for per-cpu remapping,
-	 * the recycled area needs to be split and modified.  Because
-	 * the area is always proper subset of a PMD page
-	 * cpa->numpages is guaranteed to be 1 for these areas, so
-	 * there's no need to loop over and check for further remaps.
-	 */
-	remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
-	if (remapped) {
-		WARN_ON(cpa->numpages > 1);
-		alias_cpa = *cpa;
-		alias_cpa.vaddr = &remapped;
-		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
-		ret = __change_page_attr_set_clr(&alias_cpa, 0);
-		if (ret)
-			return ret;
-	}
-
 	return 0;
 }
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 25359932740e..878836ca999c 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -82,7 +82,6 @@ enum pcpu_fc {
 	PCPU_FC_AUTO,
 	PCPU_FC_EMBED,
 	PCPU_FC_PAGE,
-	PCPU_FC_LPAGE,
 
 	PCPU_FC_NR,
 };
@@ -95,7 +94,6 @@ typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
-typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
 							     int nr_units);
@@ -124,20 +122,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 #endif
 
-#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
-				pcpu_fc_alloc_fn_t alloc_fn,
-				pcpu_fc_free_fn_t free_fn,
-				pcpu_fc_map_fn_t map_fn);
-
-extern void *pcpu_lpage_remapped(void *kaddr);
-#else
-static inline void *pcpu_lpage_remapped(void *kaddr)
-{
-	return NULL;
-}
-#endif
-
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
diff --git a/mm/percpu.c b/mm/percpu.c
index c2826d05505c..77933928107d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1713,7 +1713,6 @@ const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
 	[PCPU_FC_AUTO]	= "auto",
 	[PCPU_FC_EMBED]	= "embed",
 	[PCPU_FC_PAGE]	= "page",
-	[PCPU_FC_LPAGE]	= "lpage",
 };
 
 enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
@@ -1729,10 +1728,6 @@ static int __init percpu_alloc_setup(char *str)
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 	else if (!strcmp(str, "page"))
 		pcpu_chosen_fc = PCPU_FC_PAGE;
-#endif
-#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-	else if (!strcmp(str, "lpage"))
-		pcpu_chosen_fc = PCPU_FC_LPAGE;
 #endif
 	else
 		pr_warning("PERCPU: unknown allocator %s specified\n", str);
@@ -1970,242 +1965,6 @@ out_free_ar:
 }
 #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
 
-#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-struct pcpul_ent {
-	void		*ptr;
-	void		*map_addr;
-};
-
-static size_t pcpul_size;
-static size_t pcpul_lpage_size;
-static int pcpul_nr_lpages;
-static struct pcpul_ent *pcpul_map;
-
-static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai,
-				     unsigned int *cpup)
-{
-	int group, cunit;
-
-	for (group = 0, cunit = 0; group < ai->nr_groups; group++) {
-		const struct pcpu_group_info *gi = &ai->groups[group];
-
-		if (unit < cunit + gi->nr_units) {
-			if (cpup)
-				*cpup = gi->cpu_map[unit - cunit];
-			return true;
-		}
-		cunit += gi->nr_units;
-	}
-
-	return false;
-}
-
-static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai)
-{
-	int group, unit, i;
-
-	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
-		const struct pcpu_group_info *gi = &ai->groups[group];
-
-		for (i = 0; i < gi->nr_units; i++)
-			if (gi->cpu_map[i] == cpu)
-				return unit + i;
-	}
-	BUG();
-}
-
-/**
- * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
- * @ai: pcpu_alloc_info
- * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
- * @free_fn: function to free percpu memory, @size <= lpage_size
- * @map_fn: function to map percpu lpage, always called with lpage_size
- *
- * This allocator uses large page to build and map the first chunk.
- * Unlike other helpers, the caller should provide fully initialized
- * @ai.  This can be done using pcpu_build_alloc_info().  This two
- * stage initialization is to allow arch code to evaluate the
- * parameters before committing to it.
- *
- * Large pages are allocated as directed by @unit_map and other
- * parameters and mapped to vmalloc space.  Unused holes are returned
- * to the page allocator.  Note that these holes end up being actively
- * mapped twice - once to the physical mapping and to the vmalloc area
- * for the first percpu chunk.  Depending on architecture, this might
- * cause problem when changing page attributes of the returned area.
- * These double mapped areas can be detected using
- * pcpu_lpage_remapped().
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
-				  pcpu_fc_alloc_fn_t alloc_fn,
-				  pcpu_fc_free_fn_t free_fn,
-				  pcpu_fc_map_fn_t map_fn)
-{
-	static struct vm_struct vm;
-	const size_t lpage_size = ai->atom_size;
-	size_t chunk_size, map_size;
-	unsigned int cpu;
-	int i, j, unit, nr_units, rc;
-
-	nr_units = 0;
-	for (i = 0; i < ai->nr_groups; i++)
-		nr_units += ai->groups[i].nr_units;
-
-	chunk_size = ai->unit_size * nr_units;
-	BUG_ON(chunk_size % lpage_size);
-
-	pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size;
-	pcpul_lpage_size = lpage_size;
-	pcpul_nr_lpages = chunk_size / lpage_size;
-
-	/* allocate pointer array and alloc large pages */
-	map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
-	pcpul_map = alloc_bootmem(map_size);
-
-	/* allocate all pages */
-	for (i = 0; i < pcpul_nr_lpages; i++) {
-		size_t offset = i * lpage_size;
-		int first_unit = offset / ai->unit_size;
-		int last_unit = (offset + lpage_size - 1) / ai->unit_size;
-		void *ptr;
-
-		/* find out which cpu is mapped to this unit */
-		for (unit = first_unit; unit <= last_unit; unit++)
-			if (pcpul_unit_to_cpu(unit, ai, &cpu))
-				goto found;
-		continue;
-	found:
-		ptr = alloc_fn(cpu, lpage_size, lpage_size);
-		if (!ptr) {
-			pr_warning("PERCPU: failed to allocate large page "
-				   "for cpu%u\n", cpu);
-			goto enomem;
-		}
-
-		pcpul_map[i].ptr = ptr;
-	}
-
-	/* return unused holes */
-	for (unit = 0; unit < nr_units; unit++) {
-		size_t start = unit * ai->unit_size;
-		size_t end = start + ai->unit_size;
-		size_t off, next;
-
-		/* don't free used part of occupied unit */
-		if (pcpul_unit_to_cpu(unit, ai, NULL))
-			start += pcpul_size;
-
-		/* unit can span more than one page, punch the holes */
-		for (off = start; off < end; off = next) {
-			void *ptr = pcpul_map[off / lpage_size].ptr;
-			next = min(roundup(off + 1, lpage_size), end);
-			if (ptr)
-				free_fn(ptr + off % lpage_size, next - off);
-		}
-	}
-
-	/* allocate address, map and copy */
-	vm.flags = VM_ALLOC;
-	vm.size = chunk_size;
-	vm_area_register_early(&vm, ai->unit_size);
-
-	for (i = 0; i < pcpul_nr_lpages; i++) {
-		if (!pcpul_map[i].ptr)
-			continue;
-		pcpul_map[i].map_addr = vm.addr + i * lpage_size;
-		map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
-	}
-
-	for_each_possible_cpu(cpu)
-		memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size,
-		       __per_cpu_load, ai->static_size);
-
-	/* we're ready, commit */
-	pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n",
-		vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size,
-		ai->unit_size);
-
-	rc = pcpu_setup_first_chunk(ai, vm.addr);
-
-	/*
-	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
-	 * lpages are pushed to the end and trimmed.
-	 */
-	for (i = 0; i < pcpul_nr_lpages - 1; i++)
-		for (j = i + 1; j < pcpul_nr_lpages; j++) {
-			struct pcpul_ent tmp;
-
-			if (!pcpul_map[j].ptr)
-				continue;
-			if (pcpul_map[i].ptr &&
-			    pcpul_map[i].ptr < pcpul_map[j].ptr)
-				continue;
-
-			tmp = pcpul_map[i];
-			pcpul_map[i] = pcpul_map[j];
-			pcpul_map[j] = tmp;
-		}
-
-	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
-		pcpul_nr_lpages--;
-
-	return rc;
-
-enomem:
-	for (i = 0; i < pcpul_nr_lpages; i++)
-		if (pcpul_map[i].ptr)
-			free_fn(pcpul_map[i].ptr, lpage_size);
-	free_bootmem(__pa(pcpul_map), map_size);
-	return -ENOMEM;
-}
-
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area.  This is
- * used by pageattr to detect VM aliases and break up the pcpu large
- * page mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used large
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
-{
-	unsigned long lpage_mask = pcpul_lpage_size - 1;
-	void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
-	unsigned long offset = (unsigned long)kaddr & lpage_mask;
-	int left = 0, right = pcpul_nr_lpages - 1;
-	int pos;
-
-	/* pcpul in use at all? */
-	if (!pcpul_map)
-		return NULL;
-
-	/* okay, perform binary search */
-	while (left <= right) {
-		pos = (left + right) / 2;
-
-		if (pcpul_map[pos].ptr < lpage_addr)
-			left = pos + 1;
-		else if (pcpul_map[pos].ptr > lpage_addr)
-			right = pos - 1;
-		else
-			return pcpul_map[pos].map_addr + offset;
-	}
-
-	return NULL;
-}
-#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */
-
 /*
  * Generic percpu area setup.
  *
-- 
cgit v1.2.3-71-gd317


From 31089c13bcb18d2cd2a3ddfbe3a28666346f237e Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Fri, 14 Aug 2009 15:47:18 +0200
Subject: timekeeping: Introduce timekeeping_leap_insert

Move the adjustment of xtime, wall_to_monotonic and the update of the
vsyscall variables to the timekeeping code.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
LKML-Reference: <20090814134807.609730216@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/time.h      | 1 +
 kernel/time/ntp.c         | 7 ++-----
 kernel/time/timekeeping.c | 7 +++++++
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index ea16c1a01d51..e7c844558884 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -147,6 +147,7 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_valid_for_hres(void);
 extern void update_wall_time(void);
 extern void update_xtime_cache(u64 nsec);
+extern void timekeeping_leap_insert(int leapsecond);
 
 struct tms;
 extern void do_sys_times(struct tms *);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 	case TIME_OK:
 		break;
 	case TIME_INS:
-		xtime.tv_sec--;
-		wall_to_monotonic.tv_sec++;
+		timekeeping_leap_insert(-1);
 		time_state = TIME_OOP;
 		printk(KERN_NOTICE
 			"Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 		res = HRTIMER_RESTART;
 		break;
 	case TIME_DEL:
-		xtime.tv_sec++;
+		timekeeping_leap_insert(1);
 		time_tai--;
-		wall_to_monotonic.tv_sec--;
 		time_state = TIME_WAIT;
 		printk(KERN_NOTICE
 			"Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 			time_state = TIME_OK;
 		break;
 	}
-	update_vsyscall(&xtime, clock);
 
 	write_sequnlock(&xtime_lock);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 02c0b2c9c674..b8b70fb545fc 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,6 +58,13 @@ void update_xtime_cache(u64 nsec)
 
 struct clocksource *clock;
 
+/* must hold xtime_lock */
+void timekeeping_leap_insert(int leapsecond)
+{
+	xtime.tv_sec += leapsecond;
+	wall_to_monotonic.tv_sec -= leapsecond;
+	update_vsyscall(&xtime, clock);
+}
 
 #ifdef CONFIG_GENERIC_TIME
 /**
-- 
cgit v1.2.3-71-gd317


From a0f7d48bfb95a4c5172a2756dbc4b82afc8e9ae4 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:19 +0200
Subject: timekeeping: Remove clocksource inline functions

The three inline functions clocksource_read, clocksource_enable and
clocksource_disable are simple wrappers of an indirect call plus the
copy from and to the mult_orig value. The functions are exclusively
used by the timekeeping code which has intimate knowledge of the
clocksource anyway. Therefore remove the inline functions. No
functional change.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134807.903108946@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h | 58 ---------------------------------------------
 kernel/time/timekeeping.c   | 41 ++++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 1219be4fb42e..a1ef46f61c81 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -267,64 +267,6 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
 	return (u32)tmp;
 }
 
-/**
- * clocksource_read: - Access the clocksource's current cycle value
- * @cs:		pointer to clocksource being read
- *
- * Uses the clocksource to return the current cycle_t value
- */
-static inline cycle_t clocksource_read(struct clocksource *cs)
-{
-	return cs->read(cs);
-}
-
-/**
- * clocksource_enable: - enable clocksource
- * @cs:		pointer to clocksource
- *
- * Enables the specified clocksource. The clocksource callback
- * function should start up the hardware and setup mult and field
- * members of struct clocksource to reflect hardware capabilities.
- */
-static inline int clocksource_enable(struct clocksource *cs)
-{
-	int ret = 0;
-
-	if (cs->enable)
-		ret = cs->enable(cs);
-
-	/*
-	 * The frequency may have changed while the clocksource
-	 * was disabled. If so the code in ->enable() must update
-	 * the mult value to reflect the new frequency. Make sure
-	 * mult_orig follows this change.
-	 */
-	cs->mult_orig = cs->mult;
-
-	return ret;
-}
-
-/**
- * clocksource_disable: - disable clocksource
- * @cs:		pointer to clocksource
- *
- * Disables the specified clocksource. The clocksource callback
- * function should power down the now unused hardware block to
- * save power.
- */
-static inline void clocksource_disable(struct clocksource *cs)
-{
-	/*
-	 * Save mult_orig in mult so clocksource_enable() can
-	 * restore the value regardless if ->enable() updates
-	 * the value of mult or not.
-	 */
-	cs->mult = cs->mult_orig;
-
-	if (cs->disable)
-		cs->disable(cs);
-}
-
 /**
  * cyc2ns - converts clocksource cycles to nanoseconds
  * @cs:		Pointer to clocksource
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b8b70fb545fc..016a2591d719 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -79,7 +79,7 @@ static void clocksource_forward_now(void)
 	cycle_t cycle_now, cycle_delta;
 	s64 nsec;
 
-	cycle_now = clocksource_read(clock);
+	cycle_now = clock->read(clock);
 	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 	clock->cycle_last = cycle_now;
 
@@ -114,7 +114,7 @@ void getnstimeofday(struct timespec *ts)
 		*ts = xtime;
 
 		/* read clocksource: */
-		cycle_now = clocksource_read(clock);
+		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -146,7 +146,7 @@ ktime_t ktime_get(void)
 		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
 
 		/* read clocksource: */
-		cycle_now = clocksource_read(clock);
+		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -186,7 +186,7 @@ void ktime_get_ts(struct timespec *ts)
 		tomono = wall_to_monotonic;
 
 		/* read clocksource: */
-		cycle_now = clocksource_read(clock);
+		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -274,16 +274,29 @@ static void change_clocksource(void)
 
 	clocksource_forward_now();
 
-	if (clocksource_enable(new))
+	if (new->enable && !new->enable(new))
 		return;
+	/*
+	 * The frequency may have changed while the clocksource
+	 * was disabled. If so the code in ->enable() must update
+	 * the mult value to reflect the new frequency. Make sure
+	 * mult_orig follows this change.
+	 */
+	new->mult_orig = new->mult;
 
 	new->raw_time = clock->raw_time;
 	old = clock;
 	clock = new;
-	clocksource_disable(old);
+	/*
+	 * Save mult_orig in mult so that the value can be restored
+	 * regardless if ->enable() updates the value of mult or not.
+	 */
+	old->mult = old->mult_orig;
+	if (old->disable)
+		old->disable(old);
 
 	clock->cycle_last = 0;
-	clock->cycle_last = clocksource_read(clock);
+	clock->cycle_last = clock->read(clock);
 	clock->error = 0;
 	clock->xtime_nsec = 0;
 	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -373,7 +386,7 @@ void getrawmonotonic(struct timespec *ts)
 		seq = read_seqbegin(&xtime_lock);
 
 		/* read clocksource: */
-		cycle_now = clocksource_read(clock);
+		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
@@ -435,9 +448,12 @@ void __init timekeeping_init(void)
 	ntp_init();
 
 	clock = clocksource_get_next();
-	clocksource_enable(clock);
+	if (clock->enable)
+		clock->enable(clock);
+	/* set mult_orig on enable */
+	clock->mult_orig = clock->mult;
 	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-	clock->cycle_last = clocksource_read(clock);
+	clock->cycle_last = clock->read(clock);
 
 	xtime.tv_sec = sec;
 	xtime.tv_nsec = 0;
@@ -477,8 +493,7 @@ static int timekeeping_resume(struct sys_device *dev)
 	}
 	update_xtime_cache(0);
 	/* re-base the last cycle value */
-	clock->cycle_last = 0;
-	clock->cycle_last = clocksource_read(clock);
+	clock->cycle_last = clock->read(clock);
 	clock->error = 0;
 	timekeeping_suspended = 0;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -630,7 +645,7 @@ void update_wall_time(void)
 		return;
 
 #ifdef CONFIG_GENERIC_TIME
-	offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+	offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
 #else
 	offset = clock->cycle_interval;
 #endif
-- 
cgit v1.2.3-71-gd317


From f1b82746c1e93daf24e1ab9bfbd39bcdb2e7018b Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:21 +0200
Subject: clocksource: Cleanup clocksource selection

If a non high-resolution clocksource is first set as override clock
and then registered it becomes active even if the system is in one-shot
mode. Move the override check from sysfs_override_clocksource to the
clocksource selection. That fixes the bug and simplifies the code. The
check in clocksource_register for double registration of the same
clocksource is removed without replacement.

To find the initial clocksource a new weak function in jiffies.c is
defined that returns the jiffies clocksource. The architecture code
can then override the weak function with a more suitable clocksource,
e.g. the TOD clock on s390.

[ tglx: Folded in a fix from John Stultz ]

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134808.388024160@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/s390/kernel/time.c     |   4 ++
 include/linux/clocksource.h |   2 +
 kernel/time/clocksource.c   | 134 +++++++++++++++++---------------------------
 kernel/time/jiffies.c       |   6 +-
 kernel/time/timekeeping.c   |   4 +-
 5 files changed, 64 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index d4c8e9c47c81..afefe514df0f 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -205,6 +205,10 @@ static struct clocksource clocksource_tod = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
+struct clocksource * __init clocksource_default_clock(void)
+{
+	return &clocksource_tod;
+}
 
 void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 {
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index a1ef46f61c81..f263b3abf46e 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -14,6 +14,7 @@
 #include <linux/list.h>
 #include <linux/cache.h>
 #include <linux/timer.h>
+#include <linux/init.h>
 #include <asm/div64.h>
 #include <asm/io.h>
 
@@ -322,6 +323,7 @@ extern void clocksource_touch_watchdog(void);
 extern struct clocksource* clocksource_get_next(void);
 extern void clocksource_change_rating(struct clocksource *cs, int rating);
 extern void clocksource_resume(void);
+extern struct clocksource * __init __weak clocksource_default_clock(void);
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
 extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..e91662e87cde 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
  *
  * TODO WishList:
  *   o Allow clocksource drivers to be unregistered
- *   o get rid of clocksource_jiffies extern
  */
 
 #include <linux/clocksource.h>
@@ -107,12 +106,9 @@ u64 timecounter_cyc2time(struct timecounter *tc,
 }
 EXPORT_SYMBOL(timecounter_cyc2time);
 
-/* XXX - Would like a better way for initializing curr_clocksource */
-extern struct clocksource clocksource_jiffies;
-
 /*[Clocksource internal variables]---------
  * curr_clocksource:
- *	currently selected clocksource. Initialized to clocksource_jiffies.
+ *	currently selected clocksource.
  * next_clocksource:
  *	pending next selected clocksource.
  * clocksource_list:
@@ -123,9 +119,8 @@ extern struct clocksource clocksource_jiffies;
  * override_name:
  *	Name of the user-specified clocksource.
  */
-static struct clocksource *curr_clocksource = &clocksource_jiffies;
+static struct clocksource *curr_clocksource;
 static struct clocksource *next_clocksource;
-static struct clocksource *clocksource_override;
 static LIST_HEAD(clocksource_list);
 static DEFINE_SPINLOCK(clocksource_lock);
 static char override_name[32];
@@ -320,6 +315,7 @@ void clocksource_touch_watchdog(void)
 	clocksource_resume_watchdog();
 }
 
+#ifdef CONFIG_GENERIC_TIME
 /**
  * clocksource_get_next - Returns the selected clocksource
  *
@@ -339,56 +335,65 @@ struct clocksource *clocksource_get_next(void)
 }
 
 /**
- * select_clocksource - Selects the best registered clocksource.
+ * clocksource_select - Select the best clocksource available
  *
  * Private function. Must hold clocksource_lock when called.
  *
  * Select the clocksource with the best rating, or the clocksource,
  * which is selected by userspace override.
  */
-static struct clocksource *select_clocksource(void)
+static void clocksource_select(void)
 {
-	struct clocksource *next;
+	struct clocksource *best, *cs;
 
 	if (list_empty(&clocksource_list))
-		return NULL;
+		return;
+	/* First clocksource on the list has the best rating. */
+	best = list_first_entry(&clocksource_list, struct clocksource, list);
+	/* Check for the override clocksource. */
+	list_for_each_entry(cs, &clocksource_list, list) {
+		if (strcmp(cs->name, override_name) != 0)
+			continue;
+		/*
+		 * Check to make sure we don't switch to a non-highres
+		 * capable clocksource if the tick code is in oneshot
+		 * mode (highres or nohz)
+		 */
+		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+		    tick_oneshot_mode_active()) {
+			/* Override clocksource cannot be used. */
+			printk(KERN_WARNING "Override clocksource %s is not "
+			       "HRT compatible. Cannot switch while in "
+			       "HRT/NOHZ mode\n", cs->name);
+			override_name[0] = 0;
+		} else
+			/* Override clocksource can be used. */
+			best = cs;
+		break;
+	}
+	if (curr_clocksource != best)
+		next_clocksource = best;
+}
 
-	if (clocksource_override)
-		next = clocksource_override;
-	else
-		next = list_entry(clocksource_list.next, struct clocksource,
-				  list);
+#else /* CONFIG_GENERIC_TIME */
 
-	if (next == curr_clocksource)
-		return NULL;
+static void clocksource_select(void) { }
 
-	return next;
-}
+#endif
 
 /*
  * Enqueue the clocksource sorted by rating
  */
-static int clocksource_enqueue(struct clocksource *c)
+static void clocksource_enqueue(struct clocksource *cs)
 {
-	struct list_head *tmp, *entry = &clocksource_list;
-
-	list_for_each(tmp, &clocksource_list) {
-		struct clocksource *cs;
+	struct list_head *entry = &clocksource_list;
+	struct clocksource *tmp;
 
-		cs = list_entry(tmp, struct clocksource, list);
-		if (cs == c)
-			return -EBUSY;
+	list_for_each_entry(tmp, &clocksource_list, list)
 		/* Keep track of the place, where to insert */
-		if (cs->rating >= c->rating)
-			entry = tmp;
-	}
-	list_add(&c->list, entry);
-
-	if (strlen(c->name) == strlen(override_name) &&
-	    !strcmp(c->name, override_name))
-		clocksource_override = c;
-
-	return 0;
+		if (tmp->rating >= cs->rating)
+			entry = &tmp->list;
+	list_add(&cs->list, entry);
 }
 
 /**
@@ -397,19 +402,16 @@ static int clocksource_enqueue(struct clocksource *c)
  *
  * Returns -EBUSY if registration fails, zero otherwise.
  */
-int clocksource_register(struct clocksource *c)
+int clocksource_register(struct clocksource *cs)
 {
 	unsigned long flags;
-	int ret;
 
 	spin_lock_irqsave(&clocksource_lock, flags);
-	ret = clocksource_enqueue(c);
-	if (!ret)
-		next_clocksource = select_clocksource();
+	clocksource_enqueue(cs);
+	clocksource_select();
 	spin_unlock_irqrestore(&clocksource_lock, flags);
-	if (!ret)
-		clocksource_check_watchdog(c);
-	return ret;
+	clocksource_check_watchdog(cs);
+	return 0;
 }
 EXPORT_SYMBOL(clocksource_register);
 
@@ -425,7 +427,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
 	list_del(&cs->list);
 	cs->rating = rating;
 	clocksource_enqueue(cs);
-	next_clocksource = select_clocksource();
+	clocksource_select();
 	spin_unlock_irqrestore(&clocksource_lock, flags);
 }
 
@@ -438,9 +440,7 @@ void clocksource_unregister(struct clocksource *cs)
 
 	spin_lock_irqsave(&clocksource_lock, flags);
 	list_del(&cs->list);
-	if (clocksource_override == cs)
-		clocksource_override = NULL;
-	next_clocksource = select_clocksource();
+	clocksource_select();
 	spin_unlock_irqrestore(&clocksource_lock, flags);
 }
 
@@ -478,9 +478,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 					  struct sysdev_attribute *attr,
 					  const char *buf, size_t count)
 {
-	struct clocksource *ovr = NULL;
 	size_t ret = count;
-	int len;
 
 	/* strings from sysfs write are not 0 terminated! */
 	if (count >= sizeof(override_name))
@@ -495,37 +493,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 	if (count > 0)
 		memcpy(override_name, buf, count);
 	override_name[count] = 0;
-
-	len = strlen(override_name);
-	if (len) {
-		struct clocksource *cs;
-
-		ovr = clocksource_override;
-		/* try to select it: */
-		list_for_each_entry(cs, &clocksource_list, list) {
-			if (strlen(cs->name) == len &&
-			    !strcmp(cs->name, override_name))
-				ovr = cs;
-		}
-	}
-
-	/*
-	 * Check to make sure we don't switch to a non-highres capable
-	 * clocksource if the tick code is in oneshot mode (highres or nohz)
-	 */
-	if (tick_oneshot_mode_active() && ovr &&
-	    !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
-		printk(KERN_WARNING "%s clocksource is not HRT compatible. "
-			"Cannot switch while in HRT/NOHZ mode\n", ovr->name);
-		ovr = NULL;
-		override_name[0] = 0;
-	}
-
-	/* Reselect, when the override name has changed */
-	if (ovr != clocksource_override) {
-		clocksource_override = ovr;
-		next_clocksource = select_clocksource();
-	}
+	clocksource_select();
 
 	spin_unlock_irq(&clocksource_lock);
 
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
 	.read		= jiffies_read,
 	.mask		= 0xffffffff, /*32bits*/
 	.mult		= NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
-	.mult_orig	= NSEC_PER_JIFFY << JIFFIES_SHIFT,
 	.shift		= JIFFIES_SHIFT,
 };
 
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
 }
 
 core_initcall(init_jiffies_clocksource);
+
+struct clocksource * __init __weak clocksource_default_clock(void)
+{
+	return &clocksource_jiffies;
+}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b5673016089f..325a9b63265a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -269,7 +269,7 @@ static void change_clocksource(void)
 
 	new = clocksource_get_next();
 
-	if (clock == new)
+	if (!new || clock == new)
 		return;
 
 	clocksource_forward_now();
@@ -446,7 +446,7 @@ void __init timekeeping_init(void)
 
 	ntp_init();
 
-	clock = clocksource_get_next();
+	clock = clocksource_default_clock();
 	if (clock->enable)
 		clock->enable(clock);
 	/* set mult_orig on enable */
-- 
cgit v1.2.3-71-gd317


From c55c87c892c1875deace0c8fc28787335277fdf2 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:25 +0200
Subject: clocksource: Move watchdog downgrade to a work queue thread

Move the downgrade of an unstable clocksource from the timer interrupt
context into the process context of a work queue thread. This is
needed to be able to do the clocksource switch with stop_machine.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134809.354926067@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h |  1 +
 kernel/time/clocksource.c   | 56 +++++++++++++++++++++++++++++++--------------
 2 files changed, 40 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f263b3abf46e..19ad43af62d0 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -213,6 +213,7 @@ extern struct clocksource *clock;	/* current clocksource */
 
 #define CLOCK_SOURCE_WATCHDOG			0x10
 #define CLOCK_SOURCE_VALID_FOR_HRES		0x20
+#define CLOCK_SOURCE_UNSTABLE			0x40
 
 /* simplify initialization of mask field */
 #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 56aaa749645d..f1508019bfb4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -143,10 +143,13 @@ fs_initcall(clocksource_done_booting);
 static LIST_HEAD(watchdog_list);
 static struct clocksource *watchdog;
 static struct timer_list watchdog_timer;
+static struct work_struct watchdog_work;
 static DEFINE_SPINLOCK(watchdog_lock);
 static cycle_t watchdog_last;
 static int watchdog_running;
 
+static void clocksource_watchdog_work(struct work_struct *work);
+
 /*
  * Interval: 0.5sec Threshold: 0.0625s
  */
@@ -158,15 +161,16 @@ static void clocksource_unstable(struct clocksource *cs, int64_t delta)
 	printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
 	       cs->name, delta);
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
-	clocksource_change_rating(cs, 0);
-	list_del(&cs->wd_list);
+	cs->flags |= CLOCK_SOURCE_UNSTABLE;
+	schedule_work(&watchdog_work);
 }
 
 static void clocksource_watchdog(unsigned long data)
 {
-	struct clocksource *cs, *tmp;
+	struct clocksource *cs;
 	cycle_t csnow, wdnow;
 	int64_t wd_nsec, cs_nsec;
+	int next_cpu;
 
 	spin_lock(&watchdog_lock);
 	if (!watchdog_running)
@@ -176,7 +180,12 @@ static void clocksource_watchdog(unsigned long data)
 	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
 	watchdog_last = wdnow;
 
-	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
+	list_for_each_entry(cs, &watchdog_list, wd_list) {
+
+		/* Clocksource already marked unstable? */
+		if (cs->flags & CLOCK_SOURCE_UNSTABLE)
+			continue;
+
 		csnow = cs->read(cs);
 
 		/* Clocksource initialized ? */
@@ -207,19 +216,15 @@ static void clocksource_watchdog(unsigned long data)
 		}
 	}
 
-	if (!list_empty(&watchdog_list)) {
-		/*
-		 * Cycle through CPUs to check if the CPUs stay
-		 * synchronized to each other.
-		 */
-		int next_cpu = cpumask_next(raw_smp_processor_id(),
-					    cpu_online_mask);
-
-		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first(cpu_online_mask);
-		watchdog_timer.expires += WATCHDOG_INTERVAL;
-		add_timer_on(&watchdog_timer, next_cpu);
-	}
+	/*
+	 * Cycle through CPUs to check if the CPUs stay synchronized
+	 * to each other.
+	 */
+	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(cpu_online_mask);
+	watchdog_timer.expires += WATCHDOG_INTERVAL;
+	add_timer_on(&watchdog_timer, next_cpu);
 out:
 	spin_unlock(&watchdog_lock);
 }
@@ -228,6 +233,7 @@ static inline void clocksource_start_watchdog(void)
 {
 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
 		return;
+	INIT_WORK(&watchdog_work, clocksource_watchdog_work);
 	init_timer(&watchdog_timer);
 	watchdog_timer.function = clocksource_watchdog;
 	watchdog_last = watchdog->read(watchdog);
@@ -313,6 +319,22 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+	struct clocksource *cs, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&watchdog_lock, flags);
+	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
+		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+			list_del_init(&cs->wd_list);
+			clocksource_change_rating(cs, 0);
+		}
+	/* Check if the watchdog timer needs to be stopped. */
+	clocksource_stop_watchdog();
+	spin_unlock(&watchdog_lock);
+}
+
 #else /* CONFIG_CLOCKSOURCE_WATCHDOG */
 
 static void clocksource_enqueue_watchdog(struct clocksource *cs)
-- 
cgit v1.2.3-71-gd317


From 155ec60226ae0ae2aadaa57c951a58a359331030 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:26 +0200
Subject: timekeeping: Introduce struct timekeeper

Add struct timekeeper to keep the internal values timekeeping.c needs
in regard to the currently selected clock source. This moves the
timekeeping intervals, xtime_nsec and the ntp error value from struct
clocksource to struct timekeeper. The raw_time is removed from the
clocksource as well. It gets treated like xtime as a global variable.
Eventually xtime raw_time should be moved to struct timekeeper.

[ tglx: minor cleanup ]

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134809.613209842@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/s390/kernel/time.c     |   1 -
 include/linux/clocksource.h |  54 +---------
 kernel/time/clocksource.c   |   6 +-
 kernel/time/timekeeping.c   | 235 +++++++++++++++++++++++++++++---------------
 4 files changed, 164 insertions(+), 132 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index afefe514df0f..e76c2e7a8b9a 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -280,7 +280,6 @@ void __init time_init(void)
 	now = get_clock();
 	tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime);
 	clocksource_tod.cycle_last = now;
-	clocksource_tod.raw_time = xtime;
 	tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts);
 	set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec);
 	write_sequnlock_irqrestore(&xtime_lock, flags);
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 19ad43af62d0..e12e3095e2fb 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -155,8 +155,6 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
  * @flags:		flags describing special properties
  * @vread:		vsyscall based read
  * @resume:		resume function for the clocksource, if necessary
- * @cycle_interval:	Used internally by timekeeping core, please ignore.
- * @xtime_interval:	Used internally by timekeeping core, please ignore.
  */
 struct clocksource {
 	/*
@@ -182,19 +180,12 @@ struct clocksource {
 #define CLKSRC_FSYS_MMIO_SET(mmio, addr)      do { } while (0)
 #endif
 
-	/* timekeeping specific data, ignore */
-	cycle_t cycle_interval;
-	u64	xtime_interval;
-	u32	raw_interval;
 	/*
 	 * Second part is written at each timer interrupt
 	 * Keep it in a different cache line to dirty no
 	 * more than one cache line.
 	 */
 	cycle_t cycle_last ____cacheline_aligned_in_smp;
-	u64 xtime_nsec;
-	s64 error;
-	struct timespec raw_time;
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 	/* Watchdog related data, used by the framework */
@@ -203,8 +194,6 @@ struct clocksource {
 #endif
 };
 
-extern struct clocksource *clock;	/* current clocksource */
-
 /*
  * Clock source flags bits::
  */
@@ -270,50 +259,15 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
 }
 
 /**
- * cyc2ns - converts clocksource cycles to nanoseconds
- * @cs:		Pointer to clocksource
- * @cycles:	Cycles
+ * clocksource_cyc2ns - converts clocksource cycles to nanoseconds
  *
- * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds.
+ * Converts cycles to nanoseconds, using the given mult and shift.
  *
  * XXX - This could use some mult_lxl_ll() asm optimization
  */
-static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
+static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
 {
-	u64 ret = (u64)cycles;
-	ret = (ret * cs->mult) >> cs->shift;
-	return ret;
-}
-
-/**
- * clocksource_calculate_interval - Calculates a clocksource interval struct
- *
- * @c:		Pointer to clocksource.
- * @length_nsec: Desired interval length in nanoseconds.
- *
- * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
- * pair and interval request.
- *
- * Unless you're the timekeeping code, you should not be using this!
- */
-static inline void clocksource_calculate_interval(struct clocksource *c,
-					  	  unsigned long length_nsec)
-{
-	u64 tmp;
-
-	/* Do the ns -> cycle conversion first, using original mult */
-	tmp = length_nsec;
-	tmp <<= c->shift;
-	tmp += c->mult_orig/2;
-	do_div(tmp, c->mult_orig);
-
-	c->cycle_interval = (cycle_t)tmp;
-	if (c->cycle_interval == 0)
-		c->cycle_interval = 1;
-
-	/* Go back from cycles -> shifted ns, this time use ntp adjused mult */
-	c->xtime_interval = (u64)c->cycle_interval * c->mult;
-	c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
+	return ((u64) cycles * mult) >> shift;
 }
 
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f1508019bfb4..f18c9a6bdcf4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -177,7 +177,8 @@ static void clocksource_watchdog(unsigned long data)
 		goto out;
 
 	wdnow = watchdog->read(watchdog);
-	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+	wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
+				     watchdog->mult, watchdog->shift);
 	watchdog_last = wdnow;
 
 	list_for_each_entry(cs, &watchdog_list, wd_list) {
@@ -196,7 +197,8 @@ static void clocksource_watchdog(unsigned long data)
 		}
 
 		/* Check the deviation from the watchdog clocksource. */
-		cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
+		cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
+					     cs->mask, cs->mult, cs->shift);
 		cs->wd_last = csnow;
 		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
 			clocksource_unstable(cs, cs_nsec - wd_nsec);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 325a9b63265a..7af45cbf6b13 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -19,6 +19,65 @@
 #include <linux/time.h>
 #include <linux/tick.h>
 
+/* Structure holding internal timekeeping values. */
+struct timekeeper {
+	/* Current clocksource used for timekeeping. */
+	struct clocksource *clock;
+
+	/* Number of clock cycles in one NTP interval. */
+	cycle_t cycle_interval;
+	/* Number of clock shifted nano seconds in one NTP interval. */
+	u64	xtime_interval;
+	/* Raw nano seconds accumulated per NTP interval. */
+	u32	raw_interval;
+
+	/* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
+	u64	xtime_nsec;
+	/* Difference between accumulated time and NTP time in ntp
+	 * shifted nano seconds. */
+	s64	ntp_error;
+};
+
+struct timekeeper timekeeper;
+
+/**
+ * timekeeper_setup_internals - Set up internals to use clocksource clock.
+ *
+ * @clock:		Pointer to clocksource.
+ *
+ * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
+ * pair and interval request.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static void timekeeper_setup_internals(struct clocksource *clock)
+{
+	cycle_t interval;
+	u64 tmp;
+
+	timekeeper.clock = clock;
+	clock->cycle_last = clock->read(clock);
+
+	/* Do the ns -> cycle conversion first, using original mult */
+	tmp = NTP_INTERVAL_LENGTH;
+	tmp <<= clock->shift;
+	tmp += clock->mult_orig/2;
+	do_div(tmp, clock->mult_orig);
+	if (tmp == 0)
+		tmp = 1;
+
+	interval = (cycle_t) tmp;
+	timekeeper.cycle_interval = interval;
+
+	/* Go back from cycles -> shifted ns */
+	timekeeper.xtime_interval = (u64) interval * clock->mult;
+	timekeeper.raw_interval =
+		((u64) interval * clock->mult_orig) >> clock->shift;
+
+	timekeeper.xtime_nsec = 0;
+
+	timekeeper.ntp_error = 0;
+}
 
 /*
  * This read-write spinlock protects us from races in SMP while
@@ -46,6 +105,11 @@ struct timespec xtime __attribute__ ((aligned (16)));
 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
 static unsigned long total_sleep_time;		/* seconds */
 
+/*
+ * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
+ */
+struct timespec raw_time;
+
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
 
@@ -56,42 +120,42 @@ void update_xtime_cache(u64 nsec)
 	timespec_add_ns(&xtime_cache, nsec);
 }
 
-struct clocksource *clock;
-
 /* must hold xtime_lock */
 void timekeeping_leap_insert(int leapsecond)
 {
 	xtime.tv_sec += leapsecond;
 	wall_to_monotonic.tv_sec -= leapsecond;
-	update_vsyscall(&xtime, clock);
+	update_vsyscall(&xtime, timekeeper.clock);
 }
 
 #ifdef CONFIG_GENERIC_TIME
 /**
- * clocksource_forward_now - update clock to the current time
+ * timekeeping_forward_now - update clock to the current time
  *
  * Forward the current clock to update its state since the last call to
  * update_wall_time(). This is useful before significant clock changes,
  * as it avoids having to deal with this time offset explicitly.
  */
-static void clocksource_forward_now(void)
+static void timekeeping_forward_now(void)
 {
 	cycle_t cycle_now, cycle_delta;
+	struct clocksource *clock;
 	s64 nsec;
 
+	clock = timekeeper.clock;
 	cycle_now = clock->read(clock);
 	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 	clock->cycle_last = cycle_now;
 
-	nsec = cyc2ns(clock, cycle_delta);
+	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
 
 	/* If arch requires, add in gettimeoffset() */
 	nsec += arch_gettimeoffset();
 
 	timespec_add_ns(&xtime, nsec);
 
-	nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
-	clock->raw_time.tv_nsec += nsec;
+	nsec = clocksource_cyc2ns(cycle_delta, clock->mult_orig, clock->shift);
+	timespec_add_ns(&raw_time, nsec);
 }
 
 /**
@@ -103,6 +167,7 @@ static void clocksource_forward_now(void)
 void getnstimeofday(struct timespec *ts)
 {
 	cycle_t cycle_now, cycle_delta;
+	struct clocksource *clock;
 	unsigned long seq;
 	s64 nsecs;
 
@@ -114,13 +179,15 @@ void getnstimeofday(struct timespec *ts)
 		*ts = xtime;
 
 		/* read clocksource: */
+		clock = timekeeper.clock;
 		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs = cyc2ns(clock, cycle_delta);
+		nsecs = clocksource_cyc2ns(cycle_delta, clock->mult,
+					   clock->shift);
 
 		/* If arch requires, add in gettimeoffset() */
 		nsecs += arch_gettimeoffset();
@@ -135,6 +202,7 @@ EXPORT_SYMBOL(getnstimeofday);
 ktime_t ktime_get(void)
 {
 	cycle_t cycle_now, cycle_delta;
+	struct clocksource *clock;
 	unsigned int seq;
 	s64 secs, nsecs;
 
@@ -146,13 +214,15 @@ ktime_t ktime_get(void)
 		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
 
 		/* read clocksource: */
+		clock = timekeeper.clock;
 		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs += cyc2ns(clock, cycle_delta);
+		nsecs += clocksource_cyc2ns(cycle_delta, clock->mult,
+					    clock->shift);
 
 	} while (read_seqretry(&xtime_lock, seq));
 	/*
@@ -174,6 +244,7 @@ EXPORT_SYMBOL_GPL(ktime_get);
 void ktime_get_ts(struct timespec *ts)
 {
 	cycle_t cycle_now, cycle_delta;
+	struct clocksource *clock;
 	struct timespec tomono;
 	unsigned int seq;
 	s64 nsecs;
@@ -186,13 +257,15 @@ void ktime_get_ts(struct timespec *ts)
 		tomono = wall_to_monotonic;
 
 		/* read clocksource: */
+		clock = timekeeper.clock;
 		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs = cyc2ns(clock, cycle_delta);
+		nsecs = clocksource_cyc2ns(cycle_delta, clock->mult,
+					   clock->shift);
 
 	} while (read_seqretry(&xtime_lock, seq));
 
@@ -233,7 +306,7 @@ int do_settimeofday(struct timespec *tv)
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 
-	clocksource_forward_now();
+	timekeeping_forward_now();
 
 	ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
 	ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -243,10 +316,10 @@ int do_settimeofday(struct timespec *tv)
 
 	update_xtime_cache(0);
 
-	clock->error = 0;
+	timekeeper.ntp_error = 0;
 	ntp_clear();
 
-	update_vsyscall(&xtime, clock);
+	update_vsyscall(&xtime, timekeeper.clock);
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
@@ -269,10 +342,10 @@ static void change_clocksource(void)
 
 	new = clocksource_get_next();
 
-	if (!new || clock == new)
+	if (!new || timekeeper.clock == new)
 		return;
 
-	clocksource_forward_now();
+	timekeeping_forward_now();
 
 	if (new->enable && !new->enable(new))
 		return;
@@ -284,9 +357,9 @@ static void change_clocksource(void)
 	 */
 	new->mult_orig = new->mult;
 
-	new->raw_time = clock->raw_time;
-	old = clock;
-	clock = new;
+	old = timekeeper.clock;
+	timekeeper_setup_internals(new);
+
 	/*
 	 * Save mult_orig in mult so that the value can be restored
 	 * regardless if ->enable() updates the value of mult or not.
@@ -295,22 +368,10 @@ static void change_clocksource(void)
 	if (old->disable)
 		old->disable(old);
 
-	clock->cycle_last = clock->read(clock);
-	clock->error = 0;
-	clock->xtime_nsec = 0;
-	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-
 	tick_clock_notify();
-
-	/*
-	 * We're holding xtime lock and waking up klogd would deadlock
-	 * us on enqueue.  So no printing!
-	printk(KERN_INFO "Time: %s clocksource has been installed.\n",
-	       clock->name);
-	 */
 }
 #else /* GENERIC_TIME */
-static inline void clocksource_forward_now(void) { }
+static inline void timekeeping_forward_now(void) { }
 static inline void change_clocksource(void) { }
 
 /**
@@ -380,20 +441,23 @@ void getrawmonotonic(struct timespec *ts)
 	unsigned long seq;
 	s64 nsecs;
 	cycle_t cycle_now, cycle_delta;
+	struct clocksource *clock;
 
 	do {
 		seq = read_seqbegin(&xtime_lock);
 
 		/* read clocksource: */
+		clock = timekeeper.clock;
 		cycle_now = clock->read(clock);
 
 		/* calculate the delta since the last update_wall_time: */
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+		nsecs = clocksource_cyc2ns(cycle_delta, clock->mult_orig,
+					   clock->shift);
 
-		*ts = clock->raw_time;
+		*ts = raw_time;
 
 	} while (read_seqretry(&xtime_lock, seq));
 
@@ -413,7 +477,7 @@ int timekeeping_valid_for_hres(void)
 	do {
 		seq = read_seqbegin(&xtime_lock);
 
-		ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+		ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
 
 	} while (read_seqretry(&xtime_lock, seq));
 
@@ -439,6 +503,7 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
  */
 void __init timekeeping_init(void)
 {
+	struct clocksource *clock;
 	unsigned long flags;
 	unsigned long sec = read_persistent_clock();
 
@@ -451,11 +516,13 @@ void __init timekeeping_init(void)
 		clock->enable(clock);
 	/* set mult_orig on enable */
 	clock->mult_orig = clock->mult;
-	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-	clock->cycle_last = clock->read(clock);
+
+	timekeeper_setup_internals(clock);
 
 	xtime.tv_sec = sec;
 	xtime.tv_nsec = 0;
+	raw_time.tv_sec = 0;
+	raw_time.tv_nsec = 0;
 	set_normalized_timespec(&wall_to_monotonic,
 		-xtime.tv_sec, -xtime.tv_nsec);
 	update_xtime_cache(0);
@@ -492,8 +559,8 @@ static int timekeeping_resume(struct sys_device *dev)
 	}
 	update_xtime_cache(0);
 	/* re-base the last cycle value */
-	clock->cycle_last = clock->read(clock);
-	clock->error = 0;
+	timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
+	timekeeper.ntp_error = 0;
 	timekeeping_suspended = 0;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
@@ -514,7 +581,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
 	timekeeping_suspend_time = read_persistent_clock();
 
 	write_seqlock_irqsave(&xtime_lock, flags);
-	clocksource_forward_now();
+	timekeeping_forward_now();
 	timekeeping_suspended = 1;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
@@ -549,7 +616,7 @@ device_initcall(timekeeping_init_device);
  * If the error is already larger, we look ahead even further
  * to compensate for late or lost adjustments.
  */
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
 						 s64 *offset)
 {
 	s64 tick_error, i;
@@ -565,7 +632,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
 	 * here.  This is tuned so that an error of about 1 msec is adjusted
 	 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
 	 */
-	error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+	error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
 	error2 = abs(error2);
 	for (look_ahead = 0; error2 > 0; look_ahead++)
 		error2 >>= 2;
@@ -574,8 +641,9 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
 	 * Now calculate the error in (1 << look_ahead) ticks, but first
 	 * remove the single look ahead already included in the error.
 	 */
-	tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
-	tick_error -= clock->xtime_interval >> 1;
+	tick_error = tick_length >>
+			(NTP_SCALE_SHIFT - timekeeper.clock->shift + 1);
+	tick_error -= timekeeper.xtime_interval >> 1;
 	error = ((error - tick_error) >> look_ahead) + tick_error;
 
 	/* Finally calculate the adjustment shift value.  */
@@ -600,18 +668,19 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
  * this is optimized for the most common adjustments of -1,0,1,
  * for other values we can do a bit more work.
  */
-static void clocksource_adjust(s64 offset)
+static void timekeeping_adjust(s64 offset)
 {
-	s64 error, interval = clock->cycle_interval;
+	s64 error, interval = timekeeper.cycle_interval;
 	int adj;
 
-	error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
+	error = timekeeper.ntp_error >>
+		(NTP_SCALE_SHIFT - timekeeper.clock->shift - 1);
 	if (error > interval) {
 		error >>= 2;
 		if (likely(error <= interval))
 			adj = 1;
 		else
-			adj = clocksource_bigadjust(error, &interval, &offset);
+			adj = timekeeping_bigadjust(error, &interval, &offset);
 	} else if (error < -interval) {
 		error >>= 2;
 		if (likely(error >= -interval)) {
@@ -619,15 +688,15 @@ static void clocksource_adjust(s64 offset)
 			interval = -interval;
 			offset = -offset;
 		} else
-			adj = clocksource_bigadjust(error, &interval, &offset);
+			adj = timekeeping_bigadjust(error, &interval, &offset);
 	} else
 		return;
 
-	clock->mult += adj;
-	clock->xtime_interval += interval;
-	clock->xtime_nsec -= offset;
-	clock->error -= (interval - offset) <<
-			(NTP_SCALE_SHIFT - clock->shift);
+	timekeeper.clock->mult += adj;
+	timekeeper.xtime_interval += interval;
+	timekeeper.xtime_nsec -= offset;
+	timekeeper.ntp_error -= (interval - offset) <<
+			(NTP_SCALE_SHIFT - timekeeper.clock->shift);
 }
 
 /**
@@ -637,53 +706,59 @@ static void clocksource_adjust(s64 offset)
  */
 void update_wall_time(void)
 {
+	struct clocksource *clock;
 	cycle_t offset;
+	s64 nsecs;
 
 	/* Make sure we're fully resumed: */
 	if (unlikely(timekeeping_suspended))
 		return;
 
+	clock = timekeeper.clock;
 #ifdef CONFIG_GENERIC_TIME
 	offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
 #else
-	offset = clock->cycle_interval;
+	offset = timekeeper.cycle_interval;
 #endif
-	clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
+	timekeeper.xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
 
 	/* normally this loop will run just once, however in the
 	 * case of lost or late ticks, it will accumulate correctly.
 	 */
-	while (offset >= clock->cycle_interval) {
+	while (offset >= timekeeper.cycle_interval) {
+		u64 nsecps = (u64)NSEC_PER_SEC << clock->shift;
+
 		/* accumulate one interval */
-		offset -= clock->cycle_interval;
-		clock->cycle_last += clock->cycle_interval;
+		offset -= timekeeper.cycle_interval;
+		clock->cycle_last += timekeeper.cycle_interval;
 
-		clock->xtime_nsec += clock->xtime_interval;
-		if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
-			clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+		timekeeper.xtime_nsec += timekeeper.xtime_interval;
+		if (timekeeper.xtime_nsec >= nsecps) {
+			timekeeper.xtime_nsec -= nsecps;
 			xtime.tv_sec++;
 			second_overflow();
 		}
 
-		clock->raw_time.tv_nsec += clock->raw_interval;
-		if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
-			clock->raw_time.tv_nsec -= NSEC_PER_SEC;
-			clock->raw_time.tv_sec++;
+		raw_time.tv_nsec += timekeeper.raw_interval;
+		if (raw_time.tv_nsec >= NSEC_PER_SEC) {
+			raw_time.tv_nsec -= NSEC_PER_SEC;
+			raw_time.tv_sec++;
 		}
 
 		/* accumulate error between NTP and clock interval */
-		clock->error += tick_length;
-		clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
+		timekeeper.ntp_error += tick_length;
+		timekeeper.ntp_error -= timekeeper.xtime_interval <<
+					(NTP_SCALE_SHIFT - clock->shift);
 	}
 
 	/* correct the clock when NTP error is too big */
-	clocksource_adjust(offset);
+	timekeeping_adjust(offset);
 
 	/*
 	 * Since in the loop above, we accumulate any amount of time
 	 * in xtime_nsec over a second into xtime.tv_sec, its possible for
 	 * xtime_nsec to be fairly small after the loop. Further, if we're
-	 * slightly speeding the clocksource up in clocksource_adjust(),
+	 * slightly speeding the clocksource up in timekeeping_adjust(),
 	 * its possible the required corrective factor to xtime_nsec could
 	 * cause it to underflow.
 	 *
@@ -695,24 +770,26 @@ void update_wall_time(void)
 	 * We'll correct this error next time through this function, when
 	 * xtime_nsec is not as small.
 	 */
-	if (unlikely((s64)clock->xtime_nsec < 0)) {
-		s64 neg = -(s64)clock->xtime_nsec;
-		clock->xtime_nsec = 0;
-		clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+	if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
+		s64 neg = -(s64)timekeeper.xtime_nsec;
+		timekeeper.xtime_nsec = 0;
+		timekeeper.ntp_error += neg << (NTP_SCALE_SHIFT - clock->shift);
 	}
 
 	/* store full nanoseconds into xtime after rounding it up and
 	 * add the remainder to the error difference.
 	 */
-	xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
-	clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
-	clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
+	xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> clock->shift) + 1;
+	timekeeper.xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+	timekeeper.ntp_error += timekeeper.xtime_nsec <<
+				(NTP_SCALE_SHIFT - clock->shift);
 
-	update_xtime_cache(cyc2ns(clock, offset));
+	nsecs = clocksource_cyc2ns(offset, clock->mult, clock->shift);
+	update_xtime_cache(nsecs);
 
 	/* check to see if there is a new clocksource to use */
 	change_clocksource();
-	update_vsyscall(&xtime, clock);
+	update_vsyscall(&xtime, timekeeper.clock);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From 0a54419836254a27baecd9037103171bcbabaf67 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:28 +0200
Subject: timekeeping: Move NTP adjusted clock multiplier to struct timekeeper

The clocksource structure has two multipliers, the unmodified multiplier
clock->mult_orig and the NTP corrected multiplier clock->mult. The NTP
multiplier is misplaced in the struct clocksource, this is private
information of the timekeeping code. Add the mult field to the struct
timekeeper to contain the NTP corrected value, keep the unmodifed
multiplier in clock->mult and remove clock->mult_orig.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134810.149047645@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/plat-omap/common.c |  7 ++----
 include/linux/clocksource.h |  4 +---
 kernel/time/timekeeping.c   | 53 ++++++++++++++++++++-------------------------
 3 files changed, 27 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c
index ebcf006406f9..95587b6c0259 100644
--- a/arch/arm/plat-omap/common.c
+++ b/arch/arm/plat-omap/common.c
@@ -253,11 +253,8 @@ static struct clocksource clocksource_32k = {
  */
 unsigned long long sched_clock(void)
 {
-	unsigned long long ret;
-
-	ret = (unsigned long long)clocksource_32k.read(&clocksource_32k);
-	ret = (ret * clocksource_32k.mult_orig) >> clocksource_32k.shift;
-	return ret;
+	return clocksource_cyc2ns(clocksource_32k.read(&clocksource_32k),
+				  clocksource_32k.mult, clocksource_32k.shift);
 }
 
 static int __init omap_init_clocksource_32k(void)
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index e12e3095e2fb..e34015effeb6 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -149,8 +149,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
  * @disable:		optional function to disable the clocksource
  * @mask:		bitmask for two's complement
  *			subtraction of non 64 bit counters
- * @mult:		cycle to nanosecond multiplier (adjusted by NTP)
- * @mult_orig:		cycle to nanosecond multiplier (unadjusted by NTP)
+ * @mult:		cycle to nanosecond multiplier
  * @shift:		cycle to nanosecond divisor (power of two)
  * @flags:		flags describing special properties
  * @vread:		vsyscall based read
@@ -168,7 +167,6 @@ struct clocksource {
 	void (*disable)(struct clocksource *cs);
 	cycle_t mask;
 	u32 mult;
-	u32 mult_orig;
 	u32 shift;
 	unsigned long flags;
 	cycle_t (*vread)(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dfdab1cefe1e..f4056f6c2632 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -41,6 +41,8 @@ struct timekeeper {
 	/* Shift conversion between clock shifted nano seconds and
 	 * ntp shifted nano seconds. */
 	int	ntp_error_shift;
+	/* NTP adjusted clock multiplier */
+	u32	mult;
 };
 
 struct timekeeper timekeeper;
@@ -66,8 +68,8 @@ static void timekeeper_setup_internals(struct clocksource *clock)
 	/* Do the ns -> cycle conversion first, using original mult */
 	tmp = NTP_INTERVAL_LENGTH;
 	tmp <<= clock->shift;
-	tmp += clock->mult_orig/2;
-	do_div(tmp, clock->mult_orig);
+	tmp += clock->mult/2;
+	do_div(tmp, clock->mult);
 	if (tmp == 0)
 		tmp = 1;
 
@@ -77,13 +79,20 @@ static void timekeeper_setup_internals(struct clocksource *clock)
 	/* Go back from cycles -> shifted ns */
 	timekeeper.xtime_interval = (u64) interval * clock->mult;
 	timekeeper.raw_interval =
-		((u64) interval * clock->mult_orig) >> clock->shift;
+		((u64) interval * clock->mult) >> clock->shift;
 
 	timekeeper.xtime_nsec = 0;
 	timekeeper.shift = clock->shift;
 
 	timekeeper.ntp_error = 0;
 	timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+
+	/*
+	 * The timekeeper keeps its own mult values for the currently
+	 * active clocksource. These value will be adjusted via NTP
+	 * to counteract clock drifting.
+	 */
+	timekeeper.mult = clock->mult;
 }
 
 /*
@@ -154,14 +163,15 @@ static void timekeeping_forward_now(void)
 	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 	clock->cycle_last = cycle_now;
 
-	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+	nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+				  timekeeper.shift);
 
 	/* If arch requires, add in gettimeoffset() */
 	nsec += arch_gettimeoffset();
 
 	timespec_add_ns(&xtime, nsec);
 
-	nsec = clocksource_cyc2ns(cycle_delta, clock->mult_orig, clock->shift);
+	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
 	timespec_add_ns(&raw_time, nsec);
 }
 
@@ -193,8 +203,8 @@ void getnstimeofday(struct timespec *ts)
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs = clocksource_cyc2ns(cycle_delta, clock->mult,
-					   clock->shift);
+		nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+					   timekeeper.shift);
 
 		/* If arch requires, add in gettimeoffset() */
 		nsecs += arch_gettimeoffset();
@@ -228,8 +238,8 @@ ktime_t ktime_get(void)
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs += clocksource_cyc2ns(cycle_delta, clock->mult,
-					    clock->shift);
+		nsecs += clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+					    timekeeper.shift);
 
 	} while (read_seqretry(&xtime_lock, seq));
 	/*
@@ -271,8 +281,8 @@ void ktime_get_ts(struct timespec *ts)
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs = clocksource_cyc2ns(cycle_delta, clock->mult,
-					   clock->shift);
+		nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+					   timekeeper.shift);
 
 	} while (read_seqretry(&xtime_lock, seq));
 
@@ -356,22 +366,10 @@ static void change_clocksource(void)
 
 	if (new->enable && !new->enable(new))
 		return;
-	/*
-	 * The frequency may have changed while the clocksource
-	 * was disabled. If so the code in ->enable() must update
-	 * the mult value to reflect the new frequency. Make sure
-	 * mult_orig follows this change.
-	 */
-	new->mult_orig = new->mult;
 
 	old = timekeeper.clock;
 	timekeeper_setup_internals(new);
 
-	/*
-	 * Save mult_orig in mult so that the value can be restored
-	 * regardless if ->enable() updates the value of mult or not.
-	 */
-	old->mult = old->mult_orig;
 	if (old->disable)
 		old->disable(old);
 
@@ -461,7 +459,7 @@ void getrawmonotonic(struct timespec *ts)
 		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
 		/* convert to nanoseconds: */
-		nsecs = clocksource_cyc2ns(cycle_delta, clock->mult_orig,
+		nsecs = clocksource_cyc2ns(cycle_delta, clock->mult,
 					   clock->shift);
 
 		*ts = raw_time;
@@ -521,9 +519,6 @@ void __init timekeeping_init(void)
 	clock = clocksource_default_clock();
 	if (clock->enable)
 		clock->enable(clock);
-	/* set mult_orig on enable */
-	clock->mult_orig = clock->mult;
-
 	timekeeper_setup_internals(clock);
 
 	xtime.tv_sec = sec;
@@ -697,7 +692,7 @@ static void timekeeping_adjust(s64 offset)
 	} else
 		return;
 
-	timekeeper.clock->mult += adj;
+	timekeeper.mult += adj;
 	timekeeper.xtime_interval += interval;
 	timekeeper.xtime_nsec -= offset;
 	timekeeper.ntp_error -= (interval - offset) <<
@@ -789,7 +784,7 @@ void update_wall_time(void)
 	timekeeper.ntp_error +=	timekeeper.xtime_nsec <<
 				timekeeper.ntp_error_shift;
 
-	nsecs = clocksource_cyc2ns(offset, clock->mult, clock->shift);
+	nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
 	update_xtime_cache(nsecs);
 
 	/* check to see if there is a new clocksource to use */
-- 
cgit v1.2.3-71-gd317


From 75c5158f70c065b9704b924503d96e8297838f79 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:30 +0200
Subject: timekeeping: Update clocksource with stop_machine

update_wall_time calls change_clocksource HZ times per second to check
if a new clock source is available. In close to 100% of all calls
there is no new clock. Replace the tick based check by an update done
with stop_machine.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134810.711836357@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h |   2 +
 kernel/time/clocksource.c   | 112 +++++++++++++++++---------------------------
 kernel/time/timekeeping.c   |  41 ++++++++++------
 3 files changed, 72 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index e34015effeb6..9ea40ff26f0e 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -291,4 +291,6 @@ static inline void update_vsyscall_tz(void)
 }
 #endif
 
+extern void timekeeping_notify(struct clocksource *clock);
+
 #endif /* _LINUX_CLOCKSOURCE_H */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f18c9a6bdcf4..a1657b5fdeb9 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -109,35 +109,17 @@ EXPORT_SYMBOL(timecounter_cyc2time);
 /*[Clocksource internal variables]---------
  * curr_clocksource:
  *	currently selected clocksource.
- * next_clocksource:
- *	pending next selected clocksource.
  * clocksource_list:
  *	linked list with the registered clocksources
- * clocksource_lock:
- *	protects manipulations to curr_clocksource and next_clocksource
- *	and the clocksource_list
+ * clocksource_mutex:
+ *	protects manipulations to curr_clocksource and the clocksource_list
  * override_name:
  *	Name of the user-specified clocksource.
  */
 static struct clocksource *curr_clocksource;
-static struct clocksource *next_clocksource;
 static LIST_HEAD(clocksource_list);
-static DEFINE_SPINLOCK(clocksource_lock);
+static DEFINE_MUTEX(clocksource_mutex);
 static char override_name[32];
-static int finished_booting;
-
-/* clocksource_done_booting - Called near the end of core bootup
- *
- * Hack to avoid lots of clocksource churn at boot time.
- * We use fs_initcall because we want this to start before
- * device_initcall but after subsys_initcall.
- */
-static int __init clocksource_done_booting(void)
-{
-	finished_booting = 1;
-	return 0;
-}
-fs_initcall(clocksource_done_booting);
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static LIST_HEAD(watchdog_list);
@@ -356,18 +338,16 @@ static inline void clocksource_resume_watchdog(void) { }
 void clocksource_resume(void)
 {
 	struct clocksource *cs;
-	unsigned long flags;
 
-	spin_lock_irqsave(&clocksource_lock, flags);
+	mutex_lock(&clocksource_mutex);
 
-	list_for_each_entry(cs, &clocksource_list, list) {
+	list_for_each_entry(cs, &clocksource_list, list)
 		if (cs->resume)
 			cs->resume();
-	}
 
 	clocksource_resume_watchdog();
 
-	spin_unlock_irqrestore(&clocksource_lock, flags);
+	mutex_unlock(&clocksource_mutex);
 }
 
 /**
@@ -383,28 +363,13 @@ void clocksource_touch_watchdog(void)
 }
 
 #ifdef CONFIG_GENERIC_TIME
-/**
- * clocksource_get_next - Returns the selected clocksource
- *
- */
-struct clocksource *clocksource_get_next(void)
-{
-	unsigned long flags;
 
-	spin_lock_irqsave(&clocksource_lock, flags);
-	if (next_clocksource && finished_booting) {
-		curr_clocksource = next_clocksource;
-		next_clocksource = NULL;
-	}
-	spin_unlock_irqrestore(&clocksource_lock, flags);
-
-	return curr_clocksource;
-}
+static int finished_booting;
 
 /**
  * clocksource_select - Select the best clocksource available
  *
- * Private function. Must hold clocksource_lock when called.
+ * Private function. Must hold clocksource_mutex when called.
  *
  * Select the clocksource with the best rating, or the clocksource,
  * which is selected by userspace override.
@@ -413,7 +378,7 @@ static void clocksource_select(void)
 {
 	struct clocksource *best, *cs;
 
-	if (list_empty(&clocksource_list))
+	if (!finished_booting || list_empty(&clocksource_list))
 		return;
 	/* First clocksource on the list has the best rating. */
 	best = list_first_entry(&clocksource_list, struct clocksource, list);
@@ -438,13 +403,31 @@ static void clocksource_select(void)
 			best = cs;
 		break;
 	}
-	if (curr_clocksource != best)
-		next_clocksource = best;
+	if (curr_clocksource != best) {
+		printk(KERN_INFO "Switching to clocksource %s\n", best->name);
+		curr_clocksource = best;
+		timekeeping_notify(curr_clocksource);
+	}
 }
 
+/*
+ * clocksource_done_booting - Called near the end of core bootup
+ *
+ * Hack to avoid lots of clocksource churn at boot time.
+ * We use fs_initcall because we want this to start before
+ * device_initcall but after subsys_initcall.
+ */
+static int __init clocksource_done_booting(void)
+{
+	finished_booting = 1;
+	clocksource_select();
+	return 0;
+}
+fs_initcall(clocksource_done_booting);
+
 #else /* CONFIG_GENERIC_TIME */
 
-static void clocksource_select(void) { }
+static inline void clocksource_select(void) { }
 
 #endif
 
@@ -471,13 +454,11 @@ static void clocksource_enqueue(struct clocksource *cs)
  */
 int clocksource_register(struct clocksource *cs)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&clocksource_lock, flags);
+	mutex_lock(&clocksource_mutex);
 	clocksource_enqueue(cs);
 	clocksource_select();
-	spin_unlock_irqrestore(&clocksource_lock, flags);
 	clocksource_enqueue_watchdog(cs);
+	mutex_unlock(&clocksource_mutex);
 	return 0;
 }
 EXPORT_SYMBOL(clocksource_register);
@@ -487,14 +468,12 @@ EXPORT_SYMBOL(clocksource_register);
  */
 void clocksource_change_rating(struct clocksource *cs, int rating)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&clocksource_lock, flags);
+	mutex_lock(&clocksource_mutex);
 	list_del(&cs->list);
 	cs->rating = rating;
 	clocksource_enqueue(cs);
 	clocksource_select();
-	spin_unlock_irqrestore(&clocksource_lock, flags);
+	mutex_unlock(&clocksource_mutex);
 }
 EXPORT_SYMBOL(clocksource_change_rating);
 
@@ -503,13 +482,11 @@ EXPORT_SYMBOL(clocksource_change_rating);
  */
 void clocksource_unregister(struct clocksource *cs)
 {
-	unsigned long flags;
-
+	mutex_lock(&clocksource_mutex);
 	clocksource_dequeue_watchdog(cs);
-	spin_lock_irqsave(&clocksource_lock, flags);
 	list_del(&cs->list);
 	clocksource_select();
-	spin_unlock_irqrestore(&clocksource_lock, flags);
+	mutex_unlock(&clocksource_mutex);
 }
 EXPORT_SYMBOL(clocksource_unregister);
 
@@ -527,9 +504,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
 {
 	ssize_t count = 0;
 
-	spin_lock_irq(&clocksource_lock);
+	mutex_lock(&clocksource_mutex);
 	count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
-	spin_unlock_irq(&clocksource_lock);
+	mutex_unlock(&clocksource_mutex);
 
 	return count;
 }
@@ -557,14 +534,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 	if (buf[count-1] == '\n')
 		count--;
 
-	spin_lock_irq(&clocksource_lock);
+	mutex_lock(&clocksource_mutex);
 
 	if (count > 0)
 		memcpy(override_name, buf, count);
 	override_name[count] = 0;
 	clocksource_select();
 
-	spin_unlock_irq(&clocksource_lock);
+	mutex_unlock(&clocksource_mutex);
 
 	return ret;
 }
@@ -584,7 +561,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
 	struct clocksource *src;
 	ssize_t count = 0;
 
-	spin_lock_irq(&clocksource_lock);
+	mutex_lock(&clocksource_mutex);
 	list_for_each_entry(src, &clocksource_list, list) {
 		/*
 		 * Don't show non-HRES clocksource if the tick code is
@@ -596,7 +573,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
 				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
 				  "%s ", src->name);
 	}
-	spin_unlock_irq(&clocksource_lock);
+	mutex_unlock(&clocksource_mutex);
 
 	count += snprintf(buf + count,
 			  max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -651,11 +628,10 @@ device_initcall(init_clocksource_sysfs);
  */
 static int __init boot_override_clocksource(char* str)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&clocksource_lock, flags);
+	mutex_lock(&clocksource_mutex);
 	if (str)
 		strlcpy(override_name, str, sizeof(override_name));
-	spin_unlock_irqrestore(&clocksource_lock, flags);
+	mutex_unlock(&clocksource_mutex);
 	return 1;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 27ae01b596b7..41579e7fcf9d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,6 +18,7 @@
 #include <linux/jiffies.h>
 #include <linux/time.h>
 #include <linux/tick.h>
+#include <linux/stop_machine.h>
 
 /* Structure holding internal timekeeping values. */
 struct timekeeper {
@@ -179,6 +180,7 @@ void timekeeping_leap_insert(int leapsecond)
 }
 
 #ifdef CONFIG_GENERIC_TIME
+
 /**
  * timekeeping_forward_now - update clock to the current time
  *
@@ -351,31 +353,40 @@ EXPORT_SYMBOL(do_settimeofday);
  *
  * Accumulates current time interval and initializes new clocksource
  */
-static void change_clocksource(void)
+static int change_clocksource(void *data)
 {
 	struct clocksource *new, *old;
 
-	new = clocksource_get_next();
-
-	if (!new || timekeeper.clock == new)
-		return;
+	new = (struct clocksource *) data;
 
 	timekeeping_forward_now();
+	if (!new->enable || new->enable(new) == 0) {
+		old = timekeeper.clock;
+		timekeeper_setup_internals(new);
+		if (old->disable)
+			old->disable(old);
+	}
+	return 0;
+}
 
-	if (new->enable && !new->enable(new))
+/**
+ * timekeeping_notify - Install a new clock source
+ * @clock:		pointer to the clock source
+ *
+ * This function is called from clocksource.c after a new, better clock
+ * source has been registered. The caller holds the clocksource_mutex.
+ */
+void timekeeping_notify(struct clocksource *clock)
+{
+	if (timekeeper.clock == clock)
 		return;
-
-	old = timekeeper.clock;
-	timekeeper_setup_internals(new);
-
-	if (old->disable)
-		old->disable(old);
-
+	stop_machine(change_clocksource, clock, NULL);
 	tick_clock_notify();
 }
+
 #else /* GENERIC_TIME */
+
 static inline void timekeeping_forward_now(void) { }
-static inline void change_clocksource(void) { }
 
 /**
  * ktime_get - get the monotonic time in ktime_t format
@@ -416,6 +427,7 @@ void ktime_get_ts(struct timespec *ts)
 				ts->tv_nsec + tomono.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
+
 #endif /* !GENERIC_TIME */
 
 /**
@@ -773,7 +785,6 @@ void update_wall_time(void)
 	update_xtime_cache(nsecs);
 
 	/* check to see if there is a new clocksource to use */
-	change_clocksource();
 	update_vsyscall(&xtime, timekeeper.clock);
 }
 
-- 
cgit v1.2.3-71-gd317


From d4f587c67fc39e0030ddd718675e252e208da4d7 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:31 +0200
Subject: timekeeping: Increase granularity of read_persistent_clock()

The persistent clock of some architectures (e.g. s390) have a
better granularity than seconds. To reduce the delta between the
host clock and the guest clock in a virtualized system change the
read_persistent_clock function to return a struct timespec.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134811.013873340@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/m68knommu/kernel/time.c          |  5 ++--
 arch/mips/dec/time.c                  |  5 ++--
 arch/mips/lasat/ds1603.c              |  5 ++--
 arch/mips/lasat/sysctl.c              |  8 ++++--
 arch/mips/lemote/lm2e/setup.c         |  5 ++--
 arch/mips/mti-malta/malta-time.c      |  5 ++--
 arch/mips/pmc-sierra/yosemite/setup.c |  5 ++--
 arch/mips/sibyte/swarm/setup.c        | 15 +++++++---
 arch/mips/sni/time.c                  |  5 ++--
 arch/powerpc/kernel/time.c            |  7 +++--
 arch/s390/kernel/time.c               | 22 +++------------
 arch/sh/kernel/time.c                 |  6 ++--
 arch/x86/kernel/rtc.c                 |  5 ++--
 arch/xtensa/kernel/time.c             |  5 ++--
 include/linux/time.h                  |  2 +-
 kernel/time/timekeeping.c             | 52 +++++++++++++++++++----------------
 16 files changed, 83 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index d182b2f72211..68432248515c 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -72,9 +72,10 @@ static unsigned long read_rtc_mmss(void)
 	return  mktime(year, mon, day, hour, min, sec);;
 }
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
-	return read_rtc_mmss();
+	ts->tv_sec = read_rtc_mmss();
+	ts->tv_nsec = 0;
 }
 
 int update_persistent_clock(struct timespec now)
diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c
index 463136e6685a..02f505f23c32 100644
--- a/arch/mips/dec/time.c
+++ b/arch/mips/dec/time.c
@@ -18,7 +18,7 @@
 #include <asm/dec/ioasic.h>
 #include <asm/dec/machtype.h>
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
 	unsigned int year, mon, day, hour, min, sec, real_year;
 	unsigned long flags;
@@ -53,7 +53,8 @@ unsigned long read_persistent_clock(void)
 
 	year += real_year - 72 + 2000;
 
-	return mktime(year, mon, day, hour, min, sec);
+	ts->tv_sec = mktime(year, mon, day, hour, min, sec);
+	ts->tv_nsec = 0;
 }
 
 /*
diff --git a/arch/mips/lasat/ds1603.c b/arch/mips/lasat/ds1603.c
index 52cb1436a12a..c6fd96ff118d 100644
--- a/arch/mips/lasat/ds1603.c
+++ b/arch/mips/lasat/ds1603.c
@@ -135,7 +135,7 @@ static void rtc_end_op(void)
 	lasat_ndelay(1000);
 }
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
 	unsigned long word;
 	unsigned long flags;
@@ -147,7 +147,8 @@ unsigned long read_persistent_clock(void)
 	rtc_end_op();
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	return word;
+	ts->tv_sec = word;
+	ts->tv_nsec = 0;
 }
 
 int rtc_mips_set_mmss(unsigned long time)
diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c
index 8f88886feb12..3f04d4c406b7 100644
--- a/arch/mips/lasat/sysctl.c
+++ b/arch/mips/lasat/sysctl.c
@@ -92,10 +92,12 @@ static int rtctmp;
 int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct timespec ts;
 	int r;
 
 	if (!write) {
-		rtctmp = read_persistent_clock();
+		read_persistent_clock(&ts);
+		rtctmp = ts.tv_sec;
 		/* check for time < 0 and set to 0 */
 		if (rtctmp < 0)
 			rtctmp = 0;
@@ -134,9 +136,11 @@ int sysctl_lasat_rtc(ctl_table *table,
 		    void *oldval, size_t *oldlenp,
 		    void *newval, size_t newlen)
 {
+	struct timespec ts;
 	int r;
 
-	rtctmp = read_persistent_clock();
+	read_persistent_clock(&ts);
+	rtctmp = ts.tv_sec;
 	if (rtctmp < 0)
 		rtctmp = 0;
 	r = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
diff --git a/arch/mips/lemote/lm2e/setup.c b/arch/mips/lemote/lm2e/setup.c
index ebd6ceaef2fd..24b355df6127 100644
--- a/arch/mips/lemote/lm2e/setup.c
+++ b/arch/mips/lemote/lm2e/setup.c
@@ -54,9 +54,10 @@ void __init plat_time_init(void)
 	mips_hpt_frequency = cpu_clock_freq / 2;
 }
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
-	return mc146818_get_cmos_time();
+	ts->tv_sec = mc146818_get_cmos_time();
+	ts->tv_nsec = 0;
 }
 
 void (*__wbflush)(void);
diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c
index 0b97d47691fc..3c6f190aa61c 100644
--- a/arch/mips/mti-malta/malta-time.c
+++ b/arch/mips/mti-malta/malta-time.c
@@ -100,9 +100,10 @@ static unsigned int __init estimate_cpu_frequency(void)
 	return count;
 }
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
-	return mc146818_get_cmos_time();
+	ts->tv_sec = mc146818_get_cmos_time();
+	ts->tv_nsec = 0;
 }
 
 static void __init plat_perf_setup(void)
diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c
index 2d3c0dca275d..3498ac9c35af 100644
--- a/arch/mips/pmc-sierra/yosemite/setup.c
+++ b/arch/mips/pmc-sierra/yosemite/setup.c
@@ -70,7 +70,7 @@ void __init bus_error_init(void)
 }
 
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
 	unsigned int year, month, day, hour, min, sec;
 	unsigned long flags;
@@ -92,7 +92,8 @@ unsigned long read_persistent_clock(void)
 	m48t37_base->control = 0x00;
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	return mktime(year, month, day, hour, min, sec);
+	ts->tv_sec = mktime(year, month, day, hour, min, sec);
+	ts->tv_nsec = 0;
 }
 
 int rtc_mips_set_time(unsigned long tim)
diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c
index 672e45d495a9..623ffc933c4c 100644
--- a/arch/mips/sibyte/swarm/setup.c
+++ b/arch/mips/sibyte/swarm/setup.c
@@ -87,19 +87,26 @@ enum swarm_rtc_type {
 
 enum swarm_rtc_type swarm_rtc_type;
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
+	unsigned long sec;
+
 	switch (swarm_rtc_type) {
 	case RTC_XICOR:
-		return xicor_get_time();
+		sec = xicor_get_time();
+		break;
 
 	case RTC_M4LT81:
-		return m41t81_get_time();
+		sec = m41t81_get_time();
+		break;
 
 	case RTC_NONE:
 	default:
-		return mktime(2000, 1, 1, 0, 0, 0);
+		sec = mktime(2000, 1, 1, 0, 0, 0);
+		break;
 	}
+	ts->tv_sec = sec;
+	tv->tv_nsec = 0;
 }
 
 int rtc_mips_set_time(unsigned long sec)
diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c
index 0d9ec1a5c24a..62df6a598e0a 100644
--- a/arch/mips/sni/time.c
+++ b/arch/mips/sni/time.c
@@ -182,7 +182,8 @@ void __init plat_time_init(void)
 	setup_pit_timer();
 }
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
-	return -1;
+	ts->tv_sec = -1;
+	ts->tv_nsec = 0;
 }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index eae4511ceeac..ad63f30fe3da 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -769,7 +769,7 @@ int update_persistent_clock(struct timespec now)
 	return ppc_md.set_rtc_time(&tm);
 }
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
 	struct rtc_time tm;
 	static int first = 1;
@@ -787,8 +787,9 @@ unsigned long read_persistent_clock(void)
 	if (!ppc_md.get_rtc_time)
 		return 0;
 	ppc_md.get_rtc_time(&tm);
-	return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
-		      tm.tm_hour, tm.tm_min, tm.tm_sec);
+	ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
+			    tm.tm_hour, tm.tm_min, tm.tm_sec);
+	ts->tv_nsec = 0;
 }
 
 /* clocksource code */
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index e76c2e7a8b9a..a94ec48587b4 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -182,12 +182,9 @@ static void timing_alert_interrupt(__u16 code)
 static void etr_reset(void);
 static void stp_reset(void);
 
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
-	struct timespec ts;
-
-	tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, &ts);
-	return ts.tv_sec;
+	tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts);
 }
 
 static cycle_t read_tod_clock(struct clocksource *cs)
@@ -248,7 +245,6 @@ void __init time_init(void)
 {
 	struct timespec ts;
 	unsigned long flags;
-	cycle_t now;
 
 	/* Reset time synchronization interfaces. */
 	etr_reset();
@@ -266,20 +262,10 @@ void __init time_init(void)
 		panic("Could not register TOD clock source");
 
 	/*
-	 * The TOD clock is an accurate clock. The xtime should be
-	 * initialized in a way that the difference between TOD and
-	 * xtime is reasonably small. Too bad that timekeeping_init
-	 * sets xtime.tv_nsec to zero. In addition the clock source
-	 * change from the jiffies clock source to the TOD clock
-	 * source add another error of up to 1/HZ second. The same
-	 * function sets wall_to_monotonic to a value that is too
-	 * small for /proc/uptime to be accurate.
-	 * Reset xtime and wall_to_monotonic to sane values.
+	 * Reset wall_to_monotonic to the initial timestamp created
+	 * in head.S to get a precise value in /proc/uptime.
 	 */
 	write_seqlock_irqsave(&xtime_lock, flags);
-	now = get_clock();
-	tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime);
-	clocksource_tod.cycle_last = now;
 	tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts);
 	set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec);
 	write_sequnlock_irqrestore(&xtime_lock, flags);
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index 9b352a1e3fb4..3f4706aa975e 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -39,11 +39,9 @@ void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
 int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
 
 #ifdef CONFIG_GENERIC_CMOS_UPDATE
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
-	struct timespec tv;
-	rtc_sh_get_time(&tv);
-	return tv.tv_sec;
+	rtc_sh_get_time(&ts);
 }
 
 int update_persistent_clock(struct timespec now)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 5d465b207e72..bf67dcb4a44c 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -178,7 +178,7 @@ static int set_rtc_mmss(unsigned long nowtime)
 }
 
 /* not static: needed by APM */
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
 	unsigned long retval, flags;
 
@@ -186,7 +186,8 @@ unsigned long read_persistent_clock(void)
 	retval = get_wallclock();
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	return retval;
+	ts->tv_sec = retval;
+	ts->tv_nsec = 0;
 }
 
 int update_persistent_clock(struct timespec now)
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 8848120d291b..19085ff0484a 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -59,9 +59,8 @@ static struct irqaction timer_irqaction = {
 
 void __init time_init(void)
 {
-	xtime.tv_nsec = 0;
-	xtime.tv_sec = read_persistent_clock();
-
+	/* FIXME: xtime&wall_to_monotonic are set in timekeeping_init. */
+	read_persistent_clock(&xtime);
 	set_normalized_timespec(&wall_to_monotonic,
 		-xtime.tv_sec, -xtime.tv_nsec);
 
diff --git a/include/linux/time.h b/include/linux/time.h
index e7c844558884..53a3216f0d1b 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -101,7 +101,7 @@ extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
 
-extern unsigned long read_persistent_clock(void);
+extern void read_persistent_clock(struct timespec *ts);
 extern int update_persistent_clock(struct timespec now);
 extern int no_sync_cmos_clock __read_mostly;
 void timekeeping_init(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 41579e7fcf9d..f1a21ce491e6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -154,7 +154,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
  */
 struct timespec xtime __attribute__ ((aligned (16)));
 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-static unsigned long total_sleep_time;		/* seconds */
+static struct timespec total_sleep_time;
 
 /*
  * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
@@ -487,17 +487,18 @@ int timekeeping_valid_for_hres(void)
 }
 
 /**
- * read_persistent_clock -  Return time in seconds from the persistent clock.
+ * read_persistent_clock -  Return time from the persistent clock.
  *
  * Weak dummy function for arches that do not yet support it.
- * Returns seconds from epoch using the battery backed persistent clock.
- * Returns zero if unsupported.
+ * Reads the time from the battery backed persistent clock.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
  *
  *  XXX - Do be sure to remove it once all arches implement it.
  */
-unsigned long __attribute__((weak)) read_persistent_clock(void)
+void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
 {
-	return 0;
+	ts->tv_sec = 0;
+	ts->tv_nsec = 0;
 }
 
 /*
@@ -507,7 +508,9 @@ void __init timekeeping_init(void)
 {
 	struct clocksource *clock;
 	unsigned long flags;
-	unsigned long sec = read_persistent_clock();
+	struct timespec now;
+
+	read_persistent_clock(&now);
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 
@@ -518,19 +521,20 @@ void __init timekeeping_init(void)
 		clock->enable(clock);
 	timekeeper_setup_internals(clock);
 
-	xtime.tv_sec = sec;
-	xtime.tv_nsec = 0;
+	xtime.tv_sec = now.tv_sec;
+	xtime.tv_nsec = now.tv_nsec;
 	raw_time.tv_sec = 0;
 	raw_time.tv_nsec = 0;
 	set_normalized_timespec(&wall_to_monotonic,
 		-xtime.tv_sec, -xtime.tv_nsec);
 	update_xtime_cache(0);
-	total_sleep_time = 0;
+	total_sleep_time.tv_sec = 0;
+	total_sleep_time.tv_nsec = 0;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
 
 /* time in seconds when suspend began */
-static unsigned long timekeeping_suspend_time;
+static struct timespec timekeeping_suspend_time;
 
 /**
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -543,18 +547,19 @@ static unsigned long timekeeping_suspend_time;
 static int timekeeping_resume(struct sys_device *dev)
 {
 	unsigned long flags;
-	unsigned long now = read_persistent_clock();
+	struct timespec ts;
+
+	read_persistent_clock(&ts);
 
 	clocksource_resume();
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 
-	if (now && (now > timekeeping_suspend_time)) {
-		unsigned long sleep_length = now - timekeeping_suspend_time;
-
-		xtime.tv_sec += sleep_length;
-		wall_to_monotonic.tv_sec -= sleep_length;
-		total_sleep_time += sleep_length;
+	if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
+		ts = timespec_sub(ts, timekeeping_suspend_time);
+		xtime = timespec_add_safe(xtime, ts);
+		wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
+		total_sleep_time = timespec_add_safe(total_sleep_time, ts);
 	}
 	update_xtime_cache(0);
 	/* re-base the last cycle value */
@@ -577,7 +582,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
 {
 	unsigned long flags;
 
-	timekeeping_suspend_time = read_persistent_clock();
+	read_persistent_clock(&timekeeping_suspend_time);
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 	timekeeping_forward_now();
@@ -801,9 +806,10 @@ void update_wall_time(void)
  */
 void getboottime(struct timespec *ts)
 {
-	set_normalized_timespec(ts,
-		- (wall_to_monotonic.tv_sec + total_sleep_time),
-		- wall_to_monotonic.tv_nsec);
+	struct timespec boottime;
+
+	boottime = timespec_add_safe(wall_to_monotonic, total_sleep_time);
+	set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
 }
 
 /**
@@ -812,7 +818,7 @@ void getboottime(struct timespec *ts)
  */
 void monotonic_to_bootbased(struct timespec *ts)
 {
-	ts->tv_sec += total_sleep_time;
+	*ts = timespec_add_safe(*ts, total_sleep_time);
 }
 
 unsigned long get_seconds(void)
-- 
cgit v1.2.3-71-gd317


From 23970e389e9cee43c4b41023935e1417271708b2 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 14 Aug 2009 15:47:32 +0200
Subject: timekeeping: Introduce read_boot_clock

Add the new function read_boot_clock to get the exact time the system
has been started. For architectures without support for exact boot
time a new weak function is added that returns 0.  Use the exact boot
time to initialize wall_to_monotonic, or xtime if the read_boot_clock
returned 0.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Daniel Walker <dwalker@fifo99.com>
LKML-Reference: <20090814134811.296703241@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/s390/kernel/time.c   | 17 +++++------------
 include/linux/time.h      |  1 +
 kernel/time/timekeeping.c | 24 ++++++++++++++++++++++--
 3 files changed, 28 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index a94ec48587b4..6bff1a1d9060 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -187,6 +187,11 @@ void read_persistent_clock(struct timespec *ts)
 	tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts);
 }
 
+void read_boot_clock(struct timespec *ts)
+{
+	tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts);
+}
+
 static cycle_t read_tod_clock(struct clocksource *cs)
 {
 	return get_clock();
@@ -243,9 +248,6 @@ void update_vsyscall_tz(void)
  */
 void __init time_init(void)
 {
-	struct timespec ts;
-	unsigned long flags;
-
 	/* Reset time synchronization interfaces. */
 	etr_reset();
 	stp_reset();
@@ -261,15 +263,6 @@ void __init time_init(void)
 	if (clocksource_register(&clocksource_tod) != 0)
 		panic("Could not register TOD clock source");
 
-	/*
-	 * Reset wall_to_monotonic to the initial timestamp created
-	 * in head.S to get a precise value in /proc/uptime.
-	 */
-	write_seqlock_irqsave(&xtime_lock, flags);
-	tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts);
-	set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec);
-	write_sequnlock_irqrestore(&xtime_lock, flags);
-
 	/* Enable TOD clock interrupts on the boot cpu. */
 	init_cpu_timer();
 
diff --git a/include/linux/time.h b/include/linux/time.h
index 53a3216f0d1b..f505988398e6 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -102,6 +102,7 @@ extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
 
 extern void read_persistent_clock(struct timespec *ts);
+extern void read_boot_clock(struct timespec *ts);
 extern int update_persistent_clock(struct timespec now);
 extern int no_sync_cmos_clock __read_mostly;
 void timekeeping_init(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f1a21ce491e6..15e06defca55 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -501,6 +501,21 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
 	ts->tv_nsec = 0;
 }
 
+/**
+ * read_boot_clock -  Return time of the system start.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Function to read the exact time the system has been started.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ *
+ *  XXX - Do be sure to remove it once all arches implement it.
+ */
+void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+{
+	ts->tv_sec = 0;
+	ts->tv_nsec = 0;
+}
+
 /*
  * timekeeping_init - Initializes the clocksource and common timekeeping values
  */
@@ -508,9 +523,10 @@ void __init timekeeping_init(void)
 {
 	struct clocksource *clock;
 	unsigned long flags;
-	struct timespec now;
+	struct timespec now, boot;
 
 	read_persistent_clock(&now);
+	read_boot_clock(&boot);
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 
@@ -525,8 +541,12 @@ void __init timekeeping_init(void)
 	xtime.tv_nsec = now.tv_nsec;
 	raw_time.tv_sec = 0;
 	raw_time.tv_nsec = 0;
+	if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
+		boot.tv_sec = xtime.tv_sec;
+		boot.tv_nsec = xtime.tv_nsec;
+	}
 	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
+				-boot.tv_sec, -boot.tv_nsec);
 	update_xtime_cache(0);
 	total_sleep_time.tv_sec = 0;
 	total_sleep_time.tv_nsec = 0;
-- 
cgit v1.2.3-71-gd317


From 0ccff1a49def92d6b838a6da166c89004b3a4d0c Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Mon, 17 Aug 2009 22:38:04 -0400
Subject: jbd2: bitfields should be unsigned

This fixes sparse noise:
  error: dubious one-bit signed bitfield

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan Kara <jack@ucw.cz>
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d97eb652d6ca..52695d3dfd0b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -652,7 +652,7 @@ struct transaction_s
 	 * This transaction is being forced and some process is
 	 * waiting for it to finish.
 	 */
-	int t_synchronous_commit:1;
+	unsigned int t_synchronous_commit:1;
 
 	/*
 	 * For use by the filesystem to store fs-specific data
-- 
cgit v1.2.3-71-gd317


From fc69f4a6af49ee69475dc4217924d9edf77760e0 Mon Sep 17 00:00:00 2001
From: Tai-hwa Liang <avatar@sentelic.com>
Date: Sun, 10 May 2009 18:15:39 -0700
Subject: Input: add new driver for Sentelic Finger Sensing Pad

This is the driver for Sentelic Finger Sensing Pad which can be found
on MSI WIND Netbook.

Signed-off-by: Tai-hwa Liang <avatar@sentelic.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 Documentation/input/sentelic.txt   | 475 ++++++++++++++++++++
 drivers/input/mouse/Kconfig        |   8 +
 drivers/input/mouse/Makefile       |   1 +
 drivers/input/mouse/psmouse-base.c |  26 +-
 drivers/input/mouse/psmouse.h      |   1 +
 drivers/input/mouse/sentelic.c     | 867 +++++++++++++++++++++++++++++++++++++
 drivers/input/mouse/sentelic.h     |  98 +++++
 drivers/input/serio/libps2.c       |  15 +-
 include/linux/libps2.h             |   1 +
 9 files changed, 1488 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/input/sentelic.txt
 create mode 100644 drivers/input/mouse/sentelic.c
 create mode 100644 drivers/input/mouse/sentelic.h

(limited to 'include/linux')

diff --git a/Documentation/input/sentelic.txt b/Documentation/input/sentelic.txt
new file mode 100644
index 000000000000..f7160a2fb6a2
--- /dev/null
+++ b/Documentation/input/sentelic.txt
@@ -0,0 +1,475 @@
+Copyright (C) 2002-2008 Sentelic Corporation.
+Last update: Oct-31-2008
+
+==============================================================================
+* Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons)
+==============================================================================
+A) MSID 4: Scrolling wheel mode plus Forward page(4th button) and Backward
+   page (5th button)
+@1. Set sample rate to 200;
+@2. Set sample rate to 200;
+@3. Set sample rate to 80;
+@4. Issuing the "Get device ID" command (0xF2) and waits for the response;
+@5. FSP will respond 0x04.
+
+Packet 1
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |Y|X|y|x|1|M|R|L|  2  |X|X|X|X|X|X|X|X|  3 |Y|Y|Y|Y|Y|Y|Y|Y|  4 | | |B|F|W|W|W|W|
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+Byte 1: Bit7 => Y overflow
+        Bit6 => X overflow
+        Bit5 => Y sign bit
+        Bit4 => X sign bit
+        Bit3 => 1
+        Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
+        Bit1 => Right Button, 1 is pressed, 0 is not pressed.
+        Bit0 => Left Button, 1 is pressed, 0 is not pressed.
+Byte 2: X Movement(9-bit 2's complement integers)
+Byte 3: Y Movement(9-bit 2's complement integers)
+Byte 4: Bit3~Bit0 => the scrolling wheel's movement since the last data report.
+                     valid values, -8 ~ +7
+        Bit4 => 1 = 4th mouse button is pressed, Forward one page.
+                0 = 4th mouse button is not pressed.
+        Bit5 => 1 = 5th mouse button is pressed, Backward one page.
+                0 = 5th mouse button is not pressed.
+
+B) MSID 6: Horizontal and Vertical scrolling.
+@ Set bit 1 in register 0x40 to 1
+
+# FSP replaces scrolling wheel's movement as 4 bits to show horizontal and
+  vertical scrolling.
+
+Packet 1
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |Y|X|y|x|1|M|R|L|  2  |X|X|X|X|X|X|X|X|  3 |Y|Y|Y|Y|Y|Y|Y|Y|  4 | | |B|F|l|r|u|d|
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+Byte 1: Bit7 => Y overflow
+        Bit6 => X overflow
+        Bit5 => Y sign bit
+        Bit4 => X sign bit
+        Bit3 => 1
+        Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
+        Bit1 => Right Button, 1 is pressed, 0 is not pressed.
+        Bit0 => Left Button, 1 is pressed, 0 is not pressed.
+Byte 2: X Movement(9-bit 2's complement integers)
+Byte 3: Y Movement(9-bit 2's complement integers)
+Byte 4: Bit0 => the Vertical scrolling movement downward.
+	Bit1 => the Vertical scrolling movement upward.
+	Bit2 => the Vertical scrolling movement rightward.
+	Bit3 => the Vertical scrolling movement leftward.
+        Bit4 => 1 = 4th mouse button is pressed, Forward one page.
+                0 = 4th mouse button is not pressed.
+        Bit5 => 1 = 5th mouse button is pressed, Backward one page.
+                0 = 5th mouse button is not pressed.
+
+C) MSID 7:
+# FSP uses 2 packets(8 Bytes) data to represent Absolute Position
+  so we have PACKET NUMBER to identify packets.
+  If PACKET NUMBER is 0, the packet is Packet 1.
+  If PACKET NUMBER is 1, the packet is Packet 2.
+  Please count this number in program.
+
+# MSID6 special packet will be enable at the same time when enable MSID 7.
+
+==============================================================================
+* Absolute position for STL3886-G0.
+==============================================================================
+@ Set bit 2 or 3 in register 0x40 to 1
+@ Set bit 6 in register 0x40 to 1
+
+Packet 1 (ABSOLUTE POSITION)
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |0|1|V|1|1|M|R|L|  2  |X|X|X|X|X|X|X|X|  3 |Y|Y|Y|Y|Y|Y|Y|Y|  4 |r|l|d|u|X|X|Y|Y|
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+Byte 1: Bit7~Bit6 => 00, Normal data packet
+                  => 01, Absolute coordination packet
+                  => 10, Notify packet
+        Bit5 => valid bit
+        Bit4 => 1
+        Bit3 => 1
+        Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
+        Bit1 => Right Button, 1 is pressed, 0 is not pressed.
+        Bit0 => Left Button, 1 is pressed, 0 is not pressed.
+Byte 2: X coordinate (xpos[9:2])
+Byte 3: Y coordinate (ypos[9:2])
+Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
+        Bit3~Bit2 => X coordinate (ypos[1:0])
+        Bit4 => scroll up
+        Bit5 => scroll down
+        Bit6 => scroll left
+        Bit7 => scroll right
+
+Notify Packet for G0
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |1|0|0|1|1|M|R|L|  2  |C|C|C|C|C|C|C|C|  3 |M|M|M|M|M|M|M|M|  4 |0|0|0|0|0|0|0|0|
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+Byte 1: Bit7~Bit6 => 00, Normal data packet
+                  => 01, Absolute coordination packet
+                  => 10, Notify packet
+        Bit5 => 0
+        Bit4 => 1
+        Bit3 => 1
+        Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
+        Bit1 => Right Button, 1 is pressed, 0 is not pressed.
+        Bit0 => Left Button, 1 is pressed, 0 is not pressed.
+Byte 2: Message Type => 0x5A (Enable/Disable status packet)
+        Mode Type => 0xA5 (Normal/Icon mode status)
+Byte 3: Message Type => 0x00 (Disabled)
+                     => 0x01 (Enabled)
+        Mode Type    => 0x00 (Normal)
+                     => 0x01 (Icon)
+Byte 4: Bit7~Bit0 => Don't Care
+
+==============================================================================
+* Absolute position for STL3888-A0.
+==============================================================================
+Packet 1 (ABSOLUTE POSITION)
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |0|1|V|A|1|L|0|1|  2  |X|X|X|X|X|X|X|X|  3 |Y|Y|Y|Y|Y|Y|Y|Y|  4 |x|x|y|y|X|X|Y|Y|
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+Byte 1: Bit7~Bit6 => 00, Normal data packet
+                  => 01, Absolute coordination packet
+                  => 10, Notify packet
+        Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
+                When both fingers are up, the last two reports have zero valid
+                bit.
+        Bit4 => arc
+        Bit3 => 1
+        Bit2 => Left Button, 1 is pressed, 0 is released.
+        Bit1 => 0
+        Bit0 => 1
+Byte 2: X coordinate (xpos[9:2])
+Byte 3: Y coordinate (ypos[9:2])
+Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
+        Bit3~Bit2 => X coordinate (ypos[1:0])
+        Bit5~Bit4 => y1_g
+        Bit7~Bit6 => x1_g
+
+Packet 2 (ABSOLUTE POSITION)
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |0|1|V|A|1|R|1|0|  2  |X|X|X|X|X|X|X|X|  3 |Y|Y|Y|Y|Y|Y|Y|Y|  4 |x|x|y|y|X|X|Y|Y|
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+Byte 1: Bit7~Bit6 => 00, Normal data packet
+                  => 01, Absolute coordinates packet
+                  => 10, Notify packet
+        Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
+                When both fingers are up, the last two reports have zero valid
+                bit.
+        Bit4 => arc
+        Bit3 => 1
+        Bit2 => Right Button, 1 is pressed, 0 is released.
+        Bit1 => 1
+        Bit0 => 0
+Byte 2: X coordinate (xpos[9:2])
+Byte 3: Y coordinate (ypos[9:2])
+Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
+        Bit3~Bit2 => X coordinate (ypos[1:0])
+        Bit5~Bit4 => y2_g
+        Bit7~Bit6 => x2_g
+
+Notify Packet for STL3888-A0
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |1|0|1|P|1|M|R|L|  2  |C|C|C|C|C|C|C|C|  3 |0|0|F|F|0|0|0|i|  4 |r|l|d|u|0|0|0|0|
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+Byte 1: Bit7~Bit6 => 00, Normal data packet
+                  => 01, Absolute coordination packet
+                  => 10, Notify packet
+        Bit5 => 1
+        Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1):
+                0: left button is generated by the on-pad command
+                1: left button is generated by the external button
+        Bit3 => 1
+        Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
+        Bit1 => Right Button, 1 is pressed, 0 is not pressed.
+        Bit0 => Left Button, 1 is pressed, 0 is not pressed.
+Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode)
+Byte 3: Bit7~Bit6 => Don't care
+        Bit5~Bit4 => Number of fingers
+        Bit3~Bit1 => Reserved
+        Bit0 => 1: enter gesture mode; 0: leaving gesture mode
+Byte 4: Bit7 => scroll right button
+        Bit6 => scroll left button
+        Bit5 => scroll down button
+        Bit4 => scroll up button
+            * Note that if gesture and additional button (Bit4~Bit7)
+	      happen at the same time, the button information will not
+	      be sent.
+        Bit3~Bit0 => Reserved
+
+Sample sequence of Multi-finger, Multi-coordinate mode:
+
+	notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1,
+	abs pkt 2, ..., notify packet(valid bit == 0)
+
+==============================================================================
+* FSP Enable/Disable packet
+==============================================================================
+   Bit 7 6 5 4 3 2 1 0       7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0      7 6 5 4 3 2 1 0
+BYTE  |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
+  1   |Y|X|0|0|1|M|R|L|  2  |0|1|0|1|1|0|1|E|  3 | | | | | | | | |  4 | | | | | | | | |
+      |---------------|     |---------------|    |---------------|    |---------------|
+
+FSP will send out enable/disable packet when FSP receive PS/2 enable/disable
+command. Host will receive the packet which Middle, Right, Left button will
+be set. The packet only use byte 0 and byte 1 as a pattern of original packet.
+Ignore the other bytes of the packet.
+
+Byte 1: Bit7 => 0, Y overflow
+        Bit6 => 0, X overflow
+	Bit5 => 0, Y sign bit
+        Bit4 => 0, X sign bit
+	Bit3 => 1
+	Bit2 => 1, Middle Button
+        Bit1 => 1, Right Button
+        Bit0 => 1, Left Button
+Byte 2: Bit7~1 => (0101101b)
+        Bit0 => 1 = Enable
+		0 = Disable
+Byte 3: Don't care
+Byte 4: Don't care (MOUSE ID 3, 4)
+Byte 5~8: Don't care (Absolute packet)
+
+==============================================================================
+* PS/2 Command Set
+==============================================================================
+
+FSP supports basic PS/2 commanding set and modes, refer to following URL for
+details about PS/2 commands:
+
+http://www.computer-engineering.org/index.php?title=PS/2_Mouse_Interface
+
+==============================================================================
+* Programming Sequence for Determining Packet Parsing Flow
+==============================================================================
+1. Identify FSP by reading device ID(0x00) and version(0x01) register
+
+2. Determine number of buttons by reading status2 (0x0b) register
+
+	buttons = reg[0x0b] & 0x30
+
+	if buttons == 0x30 or buttons == 0x20:
+		# two/four buttons
+		Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse'
+		section A for packet parsing detail(ignore byte 4, bit ~ 7)
+	elif buttons == 0x10:
+		# 6 buttons
+		Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse'
+		section B for packet parsing detail
+	elif buttons == 0x00:
+		# 6 buttons
+		Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse'
+		section A for packet parsing detail
+
+==============================================================================
+* Programming Sequence for Register Reading/Writing
+==============================================================================
+
+Register inversion requirement:
+
+  Following values needed to be inverted(the '~' operator in C) before being
+sent to FSP:
+
+	0xe9, 0xee, 0xf2 and 0xff.
+
+Register swapping requirement:
+
+  Following values needed to have their higher 4 bits and lower 4 bits being
+swapped before being sent to FSP:
+
+	10, 20, 40, 60, 80, 100 and 200.
+
+Register reading sequence:
+
+	1. send 0xf3 PS/2 command to FSP;
+
+	2. send 0x66 PS/2 command to FSP;
+
+	3. send 0x88 PS/2 command to FSP;
+
+	4. send 0xf3 PS/2 command to FSP;
+
+	5. if the register address being to read is not required to be
+	inverted(refer to the 'Register inversion requirement' section),
+	goto step 6
+
+	5a. send 0x68 PS/2 command to FSP;
+
+	5b. send the inverted register address to FSP and goto step 8;
+
+	6. if the register address being to read is not required to be
+	swapped(refer to the 'Register swapping requirement' section),
+	goto step 7
+
+	6a. send 0xcc PS/2 command to FSP;
+
+	6b. send the swapped register address to FSP and goto step 8;
+
+	7. send 0x66 PS/2 command to FSP;
+
+	7a. send the original register address to FSP and goto step 8;
+
+	8. send 0xe9(status request) PS/2 command to FSP;
+
+	9. the response read from FSP should be the requested register value.
+
+Register writing sequence:
+
+	1. send 0xf3 PS/2 command to FSP;
+
+	2. if the register address being to write is not required to be
+	inverted(refer to the 'Register inversion requirement' section),
+	goto step 3
+
+	2a. send 0x74 PS/2 command to FSP;
+
+	2b. send the inverted register address to FSP and goto step 5;
+
+	3. if the register address being to write is not required to be
+	swapped(refer to the 'Register swapping requirement' section),
+	goto step 4
+
+	3a. send 0x77 PS/2 command to FSP;
+
+	3b. send the swapped register address to FSP and goto step 5;
+
+	4. send 0x55 PS/2 command to FSP;
+
+	4a. send the register address to FSP and goto step 5;
+
+	5. send 0xf3 PS/2 command to FSP;
+
+	6. if the register value being to write is not required to be
+	inverted(refer to the 'Register inversion requirement' section),
+	goto step 7
+
+	6a. send 0x47 PS/2 command to FSP;
+
+	6b. send the inverted register value to FSP and goto step 9;
+
+	7. if the register value being to write is not required to be
+	swapped(refer to the 'Register swapping requirement' section),
+	goto step 8
+
+	7a. send 0x44 PS/2 command to FSP;
+
+	7b. send the swapped register value to FSP and goto step 9;
+
+	8. send 0x33 PS/2 command to FSP;
+
+	8a. send the register value to FSP;
+
+	9. the register writing sequence is completed.
+
+==============================================================================
+* Register Listing
+==============================================================================
+
+offset	width		default	r/w	name
+0x00	bit7~bit0	0x01	RO	device ID
+
+0x01	bit7~bit0	0xc0	RW	version ID
+
+0x02	bit7~bit0	0x01	RO	vendor ID
+
+0x03	bit7~bit0	0x01	RO	product ID
+
+0x04	bit3~bit0	0x01	RW	revision ID
+
+0x0b				RO	test mode status 1
+	bit3		1	RO	0: rotate 180 degree, 1: no rotation
+
+	bit5~bit4		RO	number of buttons
+			11 => 2, lbtn/rbtn
+			10 => 4, lbtn/rbtn/scru/scrd
+			01 => 6, lbtn/rbtn/scru/scrd/scrl/scrr
+			00 => 6, lbtn/rbtn/scru/scrd/fbtn/bbtn
+
+0x0f				RW	register file page control
+	bit0		0	RW	1 to enable page 1 register files
+
+0x10				RW	system control 1
+	bit0		1	RW	Reserved, must be 1
+	bit1		0	RW	Reserved, must be 0
+	bit4		1	RW	Reserved, must be 0
+	bit5		0	RW	register clock gating enable
+					0: read only, 1: read/write enable
+	(Note that following registers does not require clock gating being
+	enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e
+	40 41 42 43.)
+
+0x31				RW	on-pad command detection
+	bit7		0	RW	on-pad command left button down tag
+					enable
+					0: disable, 1: enable
+
+0x34				RW	on-pad command control 5
+	bit4~bit0	0x05	RW	XLO in 0s/4/1, so 03h = 0010.1b = 2.5
+	(Note that position unit is in 0.5 scanline)
+
+	bit7		0	RW	on-pad tap zone enable
+					0: disable, 1: enable
+
+0x35				RW	on-pad command control 6
+	bit4~bit0	0x1d	RW	XHI in 0s/4/1, so 19h = 1100.1b = 12.5
+	(Note that position unit is in 0.5 scanline)
+
+0x36				RW	on-pad command control 7
+	bit4~bit0	0x04	RW	YLO in 0s/4/1, so 03h = 0010.1b = 2.5
+	(Note that position unit is in 0.5 scanline)
+
+0x37				RW	on-pad command control 8
+	bit4~bit0	0x13	RW	YHI in 0s/4/1, so 11h = 1000.1b = 8.5
+	(Note that position unit is in 0.5 scanline)
+
+0x40				RW	system control 5
+	bit1		0	RW	FSP Intellimouse mode enable
+					0: disable, 1: enable
+
+	bit2		0	RW	movement + abs. coordinate mode enable
+					0: disable, 1: enable
+	(Note that this function has the functionality of bit 1 even when
+	bit 1 is not set. However, the format is different from that of bit 1.
+	In addition, when bit 1 and bit 2 are set at the same time, bit 2 will
+	override bit 1.)
+
+	bit3		0	RW	abs. coordinate only mode enable
+					0: disable, 1: enable
+	(Note that this function has the functionality of bit 1 even when
+	bit 1 is not set. However, the format is different from that of bit 1.
+	In addition, when bit 1, bit 2 and bit 3 are set at the same time,
+	bit 3 will override bit 1 and 2.)
+
+	bit5		0	RW	auto switch enable
+					0: disable, 1: enable
+
+	bit6		0	RW	G0 abs. + notify packet format enable
+					0: disable, 1: enable
+	(Note that the absolute/relative coordinate output still depends on
+	bit 2 and 3.  That is, if any of those bit is 1, host will receive
+	absolute coordinates; otherwise, host only receives packets with
+	relative coordinate.)
+
+0x43				RW	on-pad control
+	bit0		0	RW	on-pad control enable
+					0: disable, 1: enable
+	(Note that if this bit is cleared, bit 3/5 will be ineffective)
+
+	bit3		0	RW	on-pad fix vertical scrolling enable
+					0: disable, 1: enable
+
+	bit5		0	RW	on-pad fix horizontal scrolling enable
+					0: disable, 1: enable
diff --git a/drivers/input/mouse/Kconfig b/drivers/input/mouse/Kconfig
index 90bef5d498f0..3feeb3af8abd 100644
--- a/drivers/input/mouse/Kconfig
+++ b/drivers/input/mouse/Kconfig
@@ -107,6 +107,14 @@ config MOUSE_PS2_ELANTECH
 	  entries. For further information,
 	  see <file:Documentation/input/elantech.txt>.
 
+config MOUSE_PS2_SENTELIC
+	bool "Sentelic Finger Sensing Pad PS/2 protocol extension"
+	depends on MOUSE_PS2
+	help
+	  Say Y here if you have a laptop (such as MSI WIND Netbook)
+	  with Sentelic Finger Sensing Pad touchpad.
+
+	  If unsure, say N.
 
 config MOUSE_PS2_TOUCHKIT
 	bool "eGalax TouchKit PS/2 protocol extension"
diff --git a/drivers/input/mouse/Makefile b/drivers/input/mouse/Makefile
index ea58c9a372b6..570c84a4a654 100644
--- a/drivers/input/mouse/Makefile
+++ b/drivers/input/mouse/Makefile
@@ -27,5 +27,6 @@ psmouse-$(CONFIG_MOUSE_PS2_ELANTECH)	+= elantech.o
 psmouse-$(CONFIG_MOUSE_PS2_OLPC)	+= hgpk.o
 psmouse-$(CONFIG_MOUSE_PS2_LOGIPS2PP)	+= logips2pp.o
 psmouse-$(CONFIG_MOUSE_PS2_LIFEBOOK)	+= lifebook.o
+psmouse-$(CONFIG_MOUSE_PS2_SENTELIC)	+= sentelic.o
 psmouse-$(CONFIG_MOUSE_PS2_TRACKPOINT)	+= trackpoint.o
 psmouse-$(CONFIG_MOUSE_PS2_TOUCHKIT)	+= touchkit_ps2.o
diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c
index b407b355dceb..df318887ca09 100644
--- a/drivers/input/mouse/psmouse-base.c
+++ b/drivers/input/mouse/psmouse-base.c
@@ -30,6 +30,7 @@
 #include "trackpoint.h"
 #include "touchkit_ps2.h"
 #include "elantech.h"
+#include "sentelic.h"
 
 #define DRIVER_DESC	"PS/2 mouse driver"
 
@@ -666,6 +667,20 @@ static int psmouse_extensions(struct psmouse *psmouse,
 		max_proto = PSMOUSE_IMEX;
 	}
 
+/*
+ * Try Finger Sensing Pad
+ */
+	if (max_proto > PSMOUSE_IMEX) {
+		if (fsp_detect(psmouse, set_properties) == 0) {
+			if (!set_properties || fsp_init(psmouse) == 0)
+				return PSMOUSE_FSP;
+/*
+ * Init failed, try basic relative protocols
+ */
+			max_proto = PSMOUSE_IMEX;
+		}
+	}
+
 	if (max_proto > PSMOUSE_IMEX) {
 		if (genius_detect(psmouse, set_properties) == 0)
 			return PSMOUSE_GENPS;
@@ -813,7 +828,16 @@ static const struct psmouse_protocol psmouse_protocols[] = {
 		.detect		= elantech_detect,
 		.init		= elantech_init,
 	},
- #endif
+#endif
+#ifdef CONFIG_MOUSE_PS2_SENTELIC
+	{
+		.type		= PSMOUSE_FSP,
+		.name		= "FSPPS/2",
+		.alias		= "fsp",
+		.detect		= fsp_detect,
+		.init		= fsp_init,
+	},
+#endif
 	{
 		.type		= PSMOUSE_CORTRON,
 		.name		= "CortronPS/2",
diff --git a/drivers/input/mouse/psmouse.h b/drivers/input/mouse/psmouse.h
index e3562daeb2ed..cca1744c2a08 100644
--- a/drivers/input/mouse/psmouse.h
+++ b/drivers/input/mouse/psmouse.h
@@ -91,6 +91,7 @@ enum psmouse_type {
 	PSMOUSE_CORTRON,
 	PSMOUSE_HGPK,
 	PSMOUSE_ELANTECH,
+	PSMOUSE_FSP,
 	PSMOUSE_AUTO		/* This one should always be last */
 };
 
diff --git a/drivers/input/mouse/sentelic.c b/drivers/input/mouse/sentelic.c
new file mode 100644
index 000000000000..97b1e72855a0
--- /dev/null
+++ b/drivers/input/mouse/sentelic.c
@@ -0,0 +1,867 @@
+/*-
+ * Finger Sensing Pad PS/2 mouse driver.
+ *
+ * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd.
+ * Copyright (C) 2005-2009 Tai-hwa Liang, Sentelic Corporation.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version 2
+ *   of the License, or (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/input.h>
+#include <linux/ctype.h>
+#include <linux/libps2.h>
+#include <linux/serio.h>
+#include <linux/jiffies.h>
+
+#include "psmouse.h"
+#include "sentelic.h"
+
+/*
+ * Timeout for FSP PS/2 command only (in milliseconds).
+ */
+#define	FSP_CMD_TIMEOUT		200
+#define	FSP_CMD_TIMEOUT2	30
+
+/** Driver version. */
+static const char fsp_drv_ver[] = "1.0.0-K";
+
+/*
+ * Make sure that the value being sent to FSP will not conflict with
+ * possible sample rate values.
+ */
+static unsigned char fsp_test_swap_cmd(unsigned char reg_val)
+{
+	switch (reg_val) {
+	case 10: case 20: case 40: case 60: case 80: case 100: case 200:
+		/*
+		 * The requested value being sent to FSP matched to possible
+		 * sample rates, swap the given value such that the hardware
+		 * wouldn't get confused.
+		 */
+		return (reg_val >> 4) | (reg_val << 4);
+	default:
+		return reg_val;	/* swap isn't necessary */
+	}
+}
+
+/*
+ * Make sure that the value being sent to FSP will not conflict with certain
+ * commands.
+ */
+static unsigned char fsp_test_invert_cmd(unsigned char reg_val)
+{
+	switch (reg_val) {
+	case 0xe9: case 0xee: case 0xf2: case 0xff:
+		/*
+		 * The requested value being sent to FSP matched to certain
+		 * commands, inverse the given value such that the hardware
+		 * wouldn't get confused.
+		 */
+		return ~reg_val;
+	default:
+		return reg_val;	/* inversion isn't necessary */
+	}
+}
+
+static int fsp_reg_read(struct psmouse *psmouse, int reg_addr, int *reg_val)
+{
+	struct ps2dev *ps2dev = &psmouse->ps2dev;
+	unsigned char param[3];
+	unsigned char addr;
+	int rc = -1;
+
+	/*
+	 * We need to shut off the device and switch it into command
+	 * mode so we don't confuse our protocol handler. We don't need
+	 * to do that for writes because sysfs set helper does this for
+	 * us.
+	 */
+	ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE);
+	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
+	mutex_lock(&ps2dev->cmd_mutex);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		goto out;
+
+	/* should return 0xfe(request for resending) */
+	ps2_sendbyte(ps2dev, 0x66, FSP_CMD_TIMEOUT2);
+	/* should return 0xfc(failed) */
+	ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		goto out;
+
+	if ((addr = fsp_test_invert_cmd(reg_addr)) != reg_addr) {
+		ps2_sendbyte(ps2dev, 0x68, FSP_CMD_TIMEOUT2);
+	} else if ((addr = fsp_test_swap_cmd(reg_addr)) != reg_addr) {
+		/* swapping is required */
+		ps2_sendbyte(ps2dev, 0xcc, FSP_CMD_TIMEOUT2);
+		/* expect 0xfe */
+	} else {
+		/* swapping isn't necessary */
+		ps2_sendbyte(ps2dev, 0x66, FSP_CMD_TIMEOUT2);
+		/* expect 0xfe */
+	}
+	/* should return 0xfc(failed) */
+	ps2_sendbyte(ps2dev, addr, FSP_CMD_TIMEOUT);
+
+	if (__ps2_command(ps2dev, param, PSMOUSE_CMD_GETINFO) < 0)
+		goto out;
+
+	*reg_val = param[2];
+	rc = 0;
+
+ out:
+	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE);
+	psmouse_set_state(psmouse, PSMOUSE_ACTIVATED);
+	dev_dbg(&ps2dev->serio->dev, "READ REG: 0x%02x is 0x%02x (rc = %d)\n",
+		reg_addr, *reg_val, rc);
+	return rc;
+}
+
+static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val)
+{
+	struct ps2dev *ps2dev = &psmouse->ps2dev;
+	unsigned char v;
+	int rc = -1;
+
+	mutex_lock(&ps2dev->cmd_mutex);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		goto out;
+
+	if ((v = fsp_test_invert_cmd(reg_addr)) != reg_addr) {
+		/* inversion is required */
+		ps2_sendbyte(ps2dev, 0x74, FSP_CMD_TIMEOUT2);
+	} else {
+		if ((v = fsp_test_swap_cmd(reg_addr)) != reg_addr) {
+			/* swapping is required */
+			ps2_sendbyte(ps2dev, 0x77, FSP_CMD_TIMEOUT2);
+		} else {
+			/* swapping isn't necessary */
+			ps2_sendbyte(ps2dev, 0x55, FSP_CMD_TIMEOUT2);
+		}
+	}
+	/* write the register address in correct order */
+	ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		return -1;
+
+	if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) {
+		/* inversion is required */
+		ps2_sendbyte(ps2dev, 0x47, FSP_CMD_TIMEOUT2);
+	} else if ((v = fsp_test_swap_cmd(reg_val)) != reg_val) {
+		/* swapping is required */
+		ps2_sendbyte(ps2dev, 0x44, FSP_CMD_TIMEOUT2);
+	} else {
+		/* swapping isn't necessary */
+		ps2_sendbyte(ps2dev, 0x33, FSP_CMD_TIMEOUT2);
+	}
+
+	/* write the register value in correct order */
+	ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2);
+	rc = 0;
+
+ out:
+	mutex_unlock(&ps2dev->cmd_mutex);
+	dev_dbg(&ps2dev->serio->dev, "WRITE REG: 0x%02x to 0x%02x (rc = %d)\n",
+		reg_addr, reg_val, rc);
+	return rc;
+}
+
+/* Enable register clock gating for writing certain registers */
+static int fsp_reg_write_enable(struct psmouse *psmouse, bool enable)
+{
+	int v, nv;
+
+	if (fsp_reg_read(psmouse, FSP_REG_SYSCTL1, &v) == -1)
+		return -1;
+
+	if (enable)
+		nv = v | FSP_BIT_EN_REG_CLK;
+	else
+		nv = v & ~FSP_BIT_EN_REG_CLK;
+
+	/* only write if necessary */
+	if (nv != v)
+		if (fsp_reg_write(psmouse, FSP_REG_SYSCTL1, nv) == -1)
+			return -1;
+
+	return 0;
+}
+
+static int fsp_page_reg_read(struct psmouse *psmouse, int *reg_val)
+{
+	struct ps2dev *ps2dev = &psmouse->ps2dev;
+	unsigned char param[3];
+	int rc = -1;
+
+	ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE);
+	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
+	mutex_lock(&ps2dev->cmd_mutex);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		goto out;
+
+	ps2_sendbyte(ps2dev, 0x66, FSP_CMD_TIMEOUT2);
+	ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		goto out;
+
+	ps2_sendbyte(ps2dev, 0x83, FSP_CMD_TIMEOUT2);
+	ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2);
+
+	/* get the returned result */
+	if (__ps2_command(ps2dev, param, PSMOUSE_CMD_GETINFO))
+		goto out;
+
+	*reg_val = param[2];
+	rc = 0;
+
+ out:
+	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE);
+	psmouse_set_state(psmouse, PSMOUSE_ACTIVATED);
+	dev_dbg(&ps2dev->serio->dev, "READ PAGE REG: 0x%02x (rc = %d)\n",
+		*reg_val, rc);
+	return rc;
+}
+
+static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val)
+{
+	struct ps2dev *ps2dev = &psmouse->ps2dev;
+	unsigned char v;
+	int rc = -1;
+
+	mutex_lock(&ps2dev->cmd_mutex);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		goto out;
+
+	ps2_sendbyte(ps2dev, 0x38, FSP_CMD_TIMEOUT2);
+	ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2);
+
+	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
+		return -1;
+
+	if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) {
+		ps2_sendbyte(ps2dev, 0x47, FSP_CMD_TIMEOUT2);
+	} else if ((v = fsp_test_swap_cmd(reg_val)) != reg_val) {
+		/* swapping is required */
+		ps2_sendbyte(ps2dev, 0x44, FSP_CMD_TIMEOUT2);
+	} else {
+		/* swapping isn't necessary */
+		ps2_sendbyte(ps2dev, 0x33, FSP_CMD_TIMEOUT2);
+	}
+
+	ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2);
+	rc = 0;
+
+ out:
+	mutex_unlock(&ps2dev->cmd_mutex);
+	dev_dbg(&ps2dev->serio->dev, "WRITE PAGE REG: to 0x%02x (rc = %d)\n",
+		reg_val, rc);
+	return rc;
+}
+
+static int fsp_get_version(struct psmouse *psmouse, int *version)
+{
+	if (fsp_reg_read(psmouse, FSP_REG_VERSION, version))
+		return -EIO;
+
+	return 0;
+}
+
+static int fsp_get_revision(struct psmouse *psmouse, int *rev)
+{
+	if (fsp_reg_read(psmouse, FSP_REG_REVISION, rev))
+		return -EIO;
+
+	return 0;
+}
+
+static int fsp_get_buttons(struct psmouse *psmouse, int *btn)
+{
+	static const int buttons[] = {
+		0x16, /* Left/Middle/Right/Forward/Backward & Scroll Up/Down */
+		0x06, /* Left/Middle/Right & Scroll Up/Down/Right/Left */
+		0x04, /* Left/Middle/Right & Scroll Up/Down */
+		0x02, /* Left/Middle/Right */
+	};
+	int val;
+
+	if (fsp_reg_read(psmouse, FSP_REG_TMOD_STATUS1, &val) == -1)
+		return -EIO;
+
+	*btn = buttons[(val & 0x30) >> 4];
+	return 0;
+}
+
+/* Enable on-pad command tag output */
+static int fsp_opc_tag_enable(struct psmouse *psmouse, bool enable)
+{
+	int v, nv;
+	int res = 0;
+
+	if (fsp_reg_read(psmouse, FSP_REG_OPC_QDOWN, &v) == -1) {
+		dev_err(&psmouse->ps2dev.serio->dev, "Unable get OPC state.\n");
+		return -EIO;
+	}
+
+	if (enable)
+		nv = v | FSP_BIT_EN_OPC_TAG;
+	else
+		nv = v & ~FSP_BIT_EN_OPC_TAG;
+
+	/* only write if necessary */
+	if (nv != v) {
+		fsp_reg_write_enable(psmouse, true);
+		res = fsp_reg_write(psmouse, FSP_REG_OPC_QDOWN, nv);
+		fsp_reg_write_enable(psmouse, false);
+	}
+
+	if (res != 0) {
+		dev_err(&psmouse->ps2dev.serio->dev,
+			"Unable to enable OPC tag.\n");
+		res = -EIO;
+	}
+
+	return res;
+}
+
+static int fsp_onpad_vscr(struct psmouse *psmouse, bool enable)
+{
+	struct fsp_data *pad = psmouse->private;
+	int val;
+
+	if (fsp_reg_read(psmouse, FSP_REG_ONPAD_CTL, &val))
+		return -EIO;
+
+	pad->vscroll = enable;
+
+	if (enable)
+		val |= (FSP_BIT_FIX_VSCR | FSP_BIT_ONPAD_ENABLE);
+	else
+		val &= ~FSP_BIT_FIX_VSCR;
+
+	if (fsp_reg_write(psmouse, FSP_REG_ONPAD_CTL, val))
+		return -EIO;
+
+	return 0;
+}
+
+static int fsp_onpad_hscr(struct psmouse *psmouse, bool enable)
+{
+	struct fsp_data *pad = psmouse->private;
+	int val, v2;
+
+	if (fsp_reg_read(psmouse, FSP_REG_ONPAD_CTL, &val))
+		return -EIO;
+
+	if (fsp_reg_read(psmouse, FSP_REG_SYSCTL5, &v2))
+		return -EIO;
+
+	pad->hscroll = enable;
+
+	if (enable) {
+		val |= (FSP_BIT_FIX_HSCR | FSP_BIT_ONPAD_ENABLE);
+		v2 |= FSP_BIT_EN_MSID6;
+	} else {
+		val &= ~FSP_BIT_FIX_HSCR;
+		v2 &= ~(FSP_BIT_EN_MSID6 | FSP_BIT_EN_MSID7 | FSP_BIT_EN_MSID8);
+	}
+
+	if (fsp_reg_write(psmouse, FSP_REG_ONPAD_CTL, val))
+		return -EIO;
+
+	/* reconfigure horizontal scrolling packet output */
+	if (fsp_reg_write(psmouse, FSP_REG_SYSCTL5, v2))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Write device specific initial parameters.
+ *
+ * ex: 0xab 0xcd - write oxcd into register 0xab
+ */
+static ssize_t fsp_attr_set_setreg(struct psmouse *psmouse, void *data,
+				   const char *buf, size_t count)
+{
+	unsigned long reg, val;
+	char *rest;
+	ssize_t retval;
+
+	reg = simple_strtoul(buf, &rest, 16);
+	if (rest == buf || *rest != ' ' || reg > 0xff)
+		return -EINVAL;
+
+	if (strict_strtoul(rest + 1, 16, &val) || val > 0xff)
+		return -EINVAL;
+
+	if (fsp_reg_write_enable(psmouse, true))
+		return -EIO;
+
+	retval = fsp_reg_write(psmouse, reg, val) < 0 ? -EIO : count;
+
+	fsp_reg_write_enable(psmouse, false);
+
+	return count;
+}
+
+PSMOUSE_DEFINE_WO_ATTR(setreg, S_IWUSR, NULL, fsp_attr_set_setreg);
+
+static ssize_t fsp_attr_show_getreg(struct psmouse *psmouse,
+					void *data, char *buf)
+{
+	struct fsp_data *pad = psmouse->private;
+
+	return sprintf(buf, "%02x%02x\n", pad->last_reg, pad->last_val);
+}
+
+/*
+ * Read a register from device.
+ *
+ * ex: 0xab -- read content from register 0xab
+ */
+static ssize_t fsp_attr_set_getreg(struct psmouse *psmouse, void *data,
+					const char *buf, size_t count)
+{
+	struct fsp_data *pad = psmouse->private;
+	unsigned long reg;
+	int val;
+
+	if (strict_strtoul(buf, 16, &reg) || reg > 0xff)
+		return -EINVAL;
+
+	if (fsp_reg_read(psmouse, reg, &val))
+		return -EIO;
+
+	pad->last_reg = reg;
+	pad->last_val = val;
+
+	return count;
+}
+
+PSMOUSE_DEFINE_ATTR(getreg, S_IWUSR | S_IRUGO, NULL,
+			fsp_attr_show_getreg, fsp_attr_set_getreg);
+
+static ssize_t fsp_attr_show_pagereg(struct psmouse *psmouse,
+					void *data, char *buf)
+{
+	int val = 0;
+
+	if (fsp_page_reg_read(psmouse, &val))
+		return -EIO;
+
+	return sprintf(buf, "%02x\n", val);
+}
+
+static ssize_t fsp_attr_set_pagereg(struct psmouse *psmouse, void *data,
+					const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 16, &val) || val > 0xff)
+		return -EINVAL;
+
+	if (fsp_page_reg_write(psmouse, val))
+		return -EIO;
+
+	return count;
+}
+
+PSMOUSE_DEFINE_ATTR(page, S_IWUSR | S_IRUGO, NULL,
+			fsp_attr_show_pagereg, fsp_attr_set_pagereg);
+
+static ssize_t fsp_attr_show_vscroll(struct psmouse *psmouse,
+					void *data, char *buf)
+{
+	struct fsp_data *pad = psmouse->private;
+
+	return sprintf(buf, "%d\n", pad->vscroll);
+}
+
+static ssize_t fsp_attr_set_vscroll(struct psmouse *psmouse, void *data,
+					const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val) || val > 1)
+		return -EINVAL;
+
+	fsp_onpad_vscr(psmouse, val);
+
+	return count;
+}
+
+PSMOUSE_DEFINE_ATTR(vscroll, S_IWUSR | S_IRUGO, NULL,
+			fsp_attr_show_vscroll, fsp_attr_set_vscroll);
+
+static ssize_t fsp_attr_show_hscroll(struct psmouse *psmouse,
+					void *data, char *buf)
+{
+	struct fsp_data *pad = psmouse->private;
+
+	return sprintf(buf, "%d\n", pad->hscroll);
+}
+
+static ssize_t fsp_attr_set_hscroll(struct psmouse *psmouse, void *data,
+					const char *buf, size_t count)
+{
+	unsigned long val;
+
+	if (strict_strtoul(buf, 10, &val) || val > 1)
+		return -EINVAL;
+
+	fsp_onpad_hscr(psmouse, val);
+
+	return count;
+}
+
+PSMOUSE_DEFINE_ATTR(hscroll, S_IWUSR | S_IRUGO, NULL,
+			fsp_attr_show_hscroll, fsp_attr_set_hscroll);
+
+static ssize_t fsp_attr_show_flags(struct psmouse *psmouse,
+					void *data, char *buf)
+{
+	struct fsp_data *pad = psmouse->private;
+
+	return sprintf(buf, "%c\n",
+			pad->flags & FSPDRV_FLAG_EN_OPC ? 'C' : 'c');
+}
+
+static ssize_t fsp_attr_set_flags(struct psmouse *psmouse, void *data,
+					const char *buf, size_t count)
+{
+	struct fsp_data *pad = psmouse->private;
+	size_t i;
+
+	for (i = 0; i < count; i++) {
+		switch (buf[i]) {
+		case 'C':
+			pad->flags |= FSPDRV_FLAG_EN_OPC;
+			break;
+		case 'c':
+			pad->flags &= ~FSPDRV_FLAG_EN_OPC;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return count;
+}
+
+PSMOUSE_DEFINE_ATTR(flags, S_IWUSR | S_IRUGO, NULL,
+			fsp_attr_show_flags, fsp_attr_set_flags);
+
+static ssize_t fsp_attr_show_ver(struct psmouse *psmouse,
+					void *data, char *buf)
+{
+	return sprintf(buf, "Sentelic FSP kernel module %s\n", fsp_drv_ver);
+}
+
+PSMOUSE_DEFINE_RO_ATTR(ver, S_IRUGO, NULL, fsp_attr_show_ver);
+
+static struct attribute *fsp_attributes[] = {
+	&psmouse_attr_setreg.dattr.attr,
+	&psmouse_attr_getreg.dattr.attr,
+	&psmouse_attr_page.dattr.attr,
+	&psmouse_attr_vscroll.dattr.attr,
+	&psmouse_attr_hscroll.dattr.attr,
+	&psmouse_attr_flags.dattr.attr,
+	&psmouse_attr_ver.dattr.attr,
+	NULL
+};
+
+static struct attribute_group fsp_attribute_group = {
+	.attrs = fsp_attributes,
+};
+
+#ifdef FSP_DEBUG
+static void fsp_packet_debug(unsigned char packet[])
+{
+	static unsigned int ps2_packet_cnt;
+	static unsigned int ps2_last_second;
+	unsigned int jiffies_msec;
+
+	ps2_packet_cnt++;
+	jiffies_msec = jiffies_to_msecs(jiffies);
+	printk(KERN_DEBUG "%08dms PS/2 packets: %02x, %02x, %02x, %02x\n",
+		jiffies_msec, packet[0], packet[1], packet[2], packet[3]);
+
+	if (jiffies_msec - ps2_last_second > 1000) {
+		printk(KERN_DEBUG "PS/2 packets/sec = %d\n", ps2_packet_cnt);
+		ps2_packet_cnt = 0;
+		ps2_last_second = jiffies_msec;
+	}
+}
+#else
+static void fsp_packet_debug(unsigned char packet[])
+{
+}
+#endif
+
+static psmouse_ret_t fsp_process_byte(struct psmouse *psmouse)
+{
+	struct input_dev *dev = psmouse->dev;
+	struct fsp_data *ad = psmouse->private;
+	unsigned char *packet = psmouse->packet;
+	unsigned char button_status = 0, lscroll = 0, rscroll = 0;
+	int rel_x, rel_y;
+
+	if (psmouse->pktcnt < 4)
+		return PSMOUSE_GOOD_DATA;
+
+	/*
+	 * Full packet accumulated, process it
+	 */
+
+	switch (psmouse->packet[0] >> FSP_PKT_TYPE_SHIFT) {
+	case FSP_PKT_TYPE_ABS:
+		dev_warn(&psmouse->ps2dev.serio->dev,
+			 "Unexpected absolute mode packet, ignored.\n");
+		break;
+
+	case FSP_PKT_TYPE_NORMAL_OPC:
+		/* on-pad click, filter it if necessary */
+		if ((ad->flags & FSPDRV_FLAG_EN_OPC) != FSPDRV_FLAG_EN_OPC)
+			packet[0] &= ~BIT(0);
+		/* fall through */
+
+	case FSP_PKT_TYPE_NORMAL:
+		/* normal packet */
+		/* special packet data translation from on-pad packets */
+		if (packet[3] != 0) {
+			if (packet[3] & BIT(0))
+				button_status |= 0x01;	/* wheel down */
+			if (packet[3] & BIT(1))
+				button_status |= 0x0f;	/* wheel up */
+			if (packet[3] & BIT(2))
+				button_status |= BIT(5);/* horizontal left */
+			if (packet[3] & BIT(3))
+				button_status |= BIT(4);/* horizontal right */
+			/* push back to packet queue */
+			if (button_status != 0)
+				packet[3] = button_status;
+			rscroll = (packet[3] >> 4) & 1;
+			lscroll = (packet[3] >> 5) & 1;
+		}
+		/*
+		 * Processing wheel up/down and extra button events
+		 */
+		input_report_rel(dev, REL_WHEEL,
+				 (int)(packet[3] & 8) - (int)(packet[3] & 7));
+		input_report_rel(dev, REL_HWHEEL, lscroll - rscroll);
+		input_report_key(dev, BTN_BACK, lscroll);
+		input_report_key(dev, BTN_FORWARD, rscroll);
+
+		/*
+		 * Standard PS/2 Mouse
+		 */
+		input_report_key(dev, BTN_LEFT, packet[0] & 1);
+		input_report_key(dev, BTN_MIDDLE, (packet[0] >> 2) & 1);
+		input_report_key(dev, BTN_RIGHT, (packet[0] >> 1) & 1);
+
+		rel_x = packet[1] ? (int)packet[1] - (int)((packet[0] << 4) & 0x100) : 0;
+		rel_y = packet[2] ? (int)((packet[0] << 3) & 0x100) - (int)packet[2] : 0;
+
+		input_report_rel(dev, REL_X, rel_x);
+		input_report_rel(dev, REL_Y, rel_y);
+		break;
+	}
+
+	input_sync(dev);
+
+	fsp_packet_debug(packet);
+
+	return PSMOUSE_FULL_PACKET;
+}
+
+static int fsp_activate_protocol(struct psmouse *psmouse)
+{
+	struct fsp_data *pad = psmouse->private;
+	struct ps2dev *ps2dev = &psmouse->ps2dev;
+	unsigned char param[2];
+	int val;
+
+	/*
+	 * Standard procedure to enter FSP Intellimouse mode
+	 * (scrolling wheel, 4th and 5th buttons)
+	 */
+	param[0] = 200;
+	ps2_command(ps2dev, param, PSMOUSE_CMD_SETRATE);
+	param[0] = 200;
+	ps2_command(ps2dev, param, PSMOUSE_CMD_SETRATE);
+	param[0] =  80;
+	ps2_command(ps2dev, param, PSMOUSE_CMD_SETRATE);
+
+	ps2_command(ps2dev, param, PSMOUSE_CMD_GETID);
+	if (param[0] != 0x04) {
+		dev_err(&psmouse->ps2dev.serio->dev,
+			"Unable to enable 4 bytes packet format.\n");
+		return -EIO;
+	}
+
+	if (fsp_reg_read(psmouse, FSP_REG_SYSCTL5, &val)) {
+		dev_err(&psmouse->ps2dev.serio->dev,
+			"Unable to read SYSCTL5 register.\n");
+		return -EIO;
+	}
+
+	val &= ~(FSP_BIT_EN_MSID7 | FSP_BIT_EN_MSID8 | FSP_BIT_EN_AUTO_MSID8);
+	/* Ensure we are not in absolute mode */
+	val &= ~FSP_BIT_EN_PKT_G0;
+	if (pad->buttons == 0x06) {
+		/* Left/Middle/Right & Scroll Up/Down/Right/Left */
+		val |= FSP_BIT_EN_MSID6;
+	}
+
+	if (fsp_reg_write(psmouse, FSP_REG_SYSCTL5, val)) {
+		dev_err(&psmouse->ps2dev.serio->dev,
+			"Unable to set up required mode bits.\n");
+		return -EIO;
+	}
+
+	/*
+	 * Enable OPC tags such that driver can tell the difference between
+	 * on-pad and real button click
+	 */
+	if (fsp_opc_tag_enable(psmouse, true))
+		dev_warn(&psmouse->ps2dev.serio->dev,
+			 "Failed to enable OPC tag mode.\n");
+
+	/* Enable on-pad vertical and horizontal scrolling */
+	fsp_onpad_vscr(psmouse, true);
+	fsp_onpad_hscr(psmouse, true);
+
+	return 0;
+}
+
+int fsp_detect(struct psmouse *psmouse, int set_properties)
+{
+	int id;
+
+	if (fsp_reg_read(psmouse, FSP_REG_DEVICE_ID, &id))
+		return -EIO;
+
+	if (id != 0x01)
+		return -ENODEV;
+
+	if (set_properties) {
+		psmouse->vendor = "Sentelic";
+		psmouse->name = "FingerSensingPad";
+	}
+
+	return 0;
+}
+
+static void fsp_reset(struct psmouse *psmouse)
+{
+	fsp_opc_tag_enable(psmouse, false);
+	fsp_onpad_vscr(psmouse, false);
+	fsp_onpad_hscr(psmouse, false);
+}
+
+static void fsp_disconnect(struct psmouse *psmouse)
+{
+	sysfs_remove_group(&psmouse->ps2dev.serio->dev.kobj,
+			   &fsp_attribute_group);
+
+	fsp_reset(psmouse);
+	kfree(psmouse->private);
+}
+
+static int fsp_reconnect(struct psmouse *psmouse)
+{
+	int version;
+
+	if (fsp_detect(psmouse, 0))
+		return -ENODEV;
+
+	if (fsp_get_version(psmouse, &version))
+		return -ENODEV;
+
+	if (fsp_activate_protocol(psmouse))
+		return -EIO;
+
+	return 0;
+}
+
+int fsp_init(struct psmouse *psmouse)
+{
+	struct fsp_data *priv;
+	int ver, rev, buttons;
+	int error;
+
+	if (fsp_get_version(psmouse, &ver) ||
+	    fsp_get_revision(psmouse, &rev) ||
+	    fsp_get_buttons(psmouse, &buttons)) {
+		return -ENODEV;
+	}
+
+	printk(KERN_INFO
+		"Finger Sensing Pad, hw: %d.%d.%d, sw: %s, buttons: %d\n",
+		ver >> 4, ver & 0x0F, rev, fsp_drv_ver, buttons & 7);
+
+	psmouse->private = priv = kzalloc(sizeof(struct fsp_data), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->ver = ver;
+	priv->rev = rev;
+	priv->buttons = buttons;
+
+	/* enable on-pad click by default */
+	priv->flags |= FSPDRV_FLAG_EN_OPC;
+
+	/* Set up various supported input event bits */
+	__set_bit(BTN_BACK, psmouse->dev->keybit);
+	__set_bit(BTN_FORWARD, psmouse->dev->keybit);
+	__set_bit(REL_WHEEL, psmouse->dev->relbit);
+	__set_bit(REL_HWHEEL, psmouse->dev->relbit);
+
+	psmouse->protocol_handler = fsp_process_byte;
+	psmouse->disconnect = fsp_disconnect;
+	psmouse->reconnect = fsp_reconnect;
+	psmouse->cleanup = fsp_reset;
+	psmouse->pktsize = 4;
+
+	/* set default packet output based on number of buttons we found */
+	error = fsp_activate_protocol(psmouse);
+	if (error)
+		goto err_out;
+
+	error = sysfs_create_group(&psmouse->ps2dev.serio->dev.kobj,
+				   &fsp_attribute_group);
+	if (error) {
+		dev_err(&psmouse->ps2dev.serio->dev,
+			"Failed to create sysfs attributes (%d)", error);
+		goto err_out;
+	}
+
+	return 0;
+
+ err_out:
+	kfree(psmouse->private);
+	psmouse->private = NULL;
+	return error;
+}
diff --git a/drivers/input/mouse/sentelic.h b/drivers/input/mouse/sentelic.h
new file mode 100644
index 000000000000..083559c7282b
--- /dev/null
+++ b/drivers/input/mouse/sentelic.h
@@ -0,0 +1,98 @@
+/*-
+ * Finger Sensing Pad PS/2 mouse driver.
+ *
+ * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd.
+ * Copyright (C) 2005-2009 Tai-hwa Liang, Sentelic Corporation.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version 2
+ *   of the License, or (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef	__SENTELIC_H
+#define	__SENTELIC_H
+
+/* Finger-sensing Pad information registers */
+#define	FSP_REG_DEVICE_ID	0x00
+#define	FSP_REG_VERSION		0x01
+#define	FSP_REG_REVISION	0x04
+#define	FSP_REG_TMOD_STATUS1	0x0B
+#define	FSP_BIT_NO_ROTATION	BIT(3)
+#define	FSP_REG_PAGE_CTRL	0x0F
+
+/* Finger-sensing Pad control registers */
+#define	FSP_REG_SYSCTL1		0x10
+#define	FSP_BIT_EN_REG_CLK	BIT(5)
+#define	FSP_REG_OPC_QDOWN	0x31
+#define	FSP_BIT_EN_OPC_TAG	BIT(7)
+#define	FSP_REG_OPTZ_XLO	0x34
+#define	FSP_REG_OPTZ_XHI	0x35
+#define	FSP_REG_OPTZ_YLO	0x36
+#define	FSP_REG_OPTZ_YHI	0x37
+#define	FSP_REG_SYSCTL5		0x40
+#define	FSP_BIT_90_DEGREE	BIT(0)
+#define	FSP_BIT_EN_MSID6	BIT(1)
+#define	FSP_BIT_EN_MSID7	BIT(2)
+#define	FSP_BIT_EN_MSID8	BIT(3)
+#define	FSP_BIT_EN_AUTO_MSID8	BIT(5)
+#define	FSP_BIT_EN_PKT_G0	BIT(6)
+
+#define	FSP_REG_ONPAD_CTL	0x43
+#define	FSP_BIT_ONPAD_ENABLE	BIT(0)
+#define	FSP_BIT_ONPAD_FBBB	BIT(1)
+#define	FSP_BIT_FIX_VSCR	BIT(3)
+#define	FSP_BIT_FIX_HSCR	BIT(5)
+#define	FSP_BIT_DRAG_LOCK	BIT(6)
+
+/* Finger-sensing Pad packet formating related definitions */
+
+/* absolute packet type */
+#define	FSP_PKT_TYPE_NORMAL	(0x00)
+#define	FSP_PKT_TYPE_ABS	(0x01)
+#define	FSP_PKT_TYPE_NOTIFY	(0x02)
+#define	FSP_PKT_TYPE_NORMAL_OPC	(0x03)
+#define	FSP_PKT_TYPE_SHIFT	(6)
+
+#ifdef __KERNEL__
+
+struct fsp_data {
+	unsigned char	ver;		/* hardware version */
+	unsigned char	rev;		/* hardware revison */
+	unsigned char	buttons;	/* Number of buttons */
+	unsigned int	flags;
+#define	FSPDRV_FLAG_EN_OPC	(0x001)	/* enable on-pad clicking */
+
+	bool		vscroll;	/* Vertical scroll zone enabled */
+	bool		hscroll;	/* Horizontal scroll zone enabled */
+
+	unsigned char	last_reg;	/* Last register we requested read from */
+	unsigned char	last_val;
+};
+
+#ifdef CONFIG_MOUSE_PS2_SENTELIC
+extern int fsp_detect(struct psmouse *psmouse, int set_properties);
+extern int fsp_init(struct psmouse *psmouse);
+#else
+inline int fsp_detect(struct psmouse *psmouse, int set_properties)
+{
+	return -ENOSYS;
+}
+inline int fsp_init(struct psmouse *psmouse)
+{
+	return -ENOSYS;
+}
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* !__SENTELIC_H */
diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c
index be5bbbb8ae4e..3a95b508bf27 100644
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -161,7 +161,7 @@ static int ps2_adjust_timeout(struct ps2dev *ps2dev, int command, int timeout)
  * ps2_command() can only be called from a process context
  */
 
-int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
+int __ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 {
 	int timeout;
 	int send = (command >> 12) & 0xf;
@@ -179,8 +179,6 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 		return -1;
 	}
 
-	mutex_lock(&ps2dev->cmd_mutex);
-
 	serio_pause_rx(ps2dev->serio);
 	ps2dev->flags = command == PS2_CMD_GETID ? PS2_FLAG_WAITID : 0;
 	ps2dev->cmdcnt = receive;
@@ -231,7 +229,18 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 	ps2dev->flags = 0;
 	serio_continue_rx(ps2dev->serio);
 
+	return rc;
+}
+EXPORT_SYMBOL(__ps2_command);
+
+int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
+{
+	int rc;
+
+	mutex_lock(&ps2dev->cmd_mutex);
+	rc = __ps2_command(ps2dev, param, command);
 	mutex_unlock(&ps2dev->cmd_mutex);
+
 	return rc;
 }
 EXPORT_SYMBOL(ps2_command);
diff --git a/include/linux/libps2.h b/include/linux/libps2.h
index b94534b7e266..fcf5fbe6a50c 100644
--- a/include/linux/libps2.h
+++ b/include/linux/libps2.h
@@ -44,6 +44,7 @@ struct ps2dev {
 void ps2_init(struct ps2dev *ps2dev, struct serio *serio);
 int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout);
 void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout);
+int __ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_handle_ack(struct ps2dev *ps2dev, unsigned char data);
 int ps2_handle_response(struct ps2dev *ps2dev, unsigned char data);
-- 
cgit v1.2.3-71-gd317


From 4516fc0454e7ffe2f369e80045b23c2b32155004 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Aug 2009 12:57:54 -0400
Subject: sunrpc: add routine for comparing addresses

lockd needs these sort of routines, as does the NFSv4 callback code.

Move lockd's routines into common code and rename them so that they can
be used by others.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/clntlock.c         |  2 +-
 fs/lockd/host.c             |  4 ++--
 fs/lockd/mon.c              |  2 +-
 fs/lockd/svcsubs.c          |  2 +-
 include/linux/lockd/lockd.h | 43 ----------------------------------------
 include/linux/sunrpc/clnt.h | 48 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 53 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..fc9032dc8862 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 		 */
 		if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
 			continue;
-		if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
+		if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
 			continue;
 		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
 			continue;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 7cb076ac6b45..4600c2037b8b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -111,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 	 */
 	chain = &nlm_hosts[nlm_hash_address(ni->sap)];
 	hlist_for_each_entry(host, pos, chain, h_hash) {
-		if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
+		if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
 			continue;
 
 		/* See if we have an NSM handle for this client */
@@ -125,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 		if (host->h_server != ni->server)
 			continue;
 		if (ni->server &&
-		    !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
+		    !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
 			continue;
 
 		/* Move to head of hash chain. */
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 30c933188dd7..f956651d0f65 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -209,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
 	struct nsm_handle *nsm;
 
 	list_for_each_entry(nsm, &nsm_handles, sm_link)
-		if (nlm_cmp_addr(nsm_addr(nsm), sap))
+		if (rpc_cmp_addr(nsm_addr(nsm), sap))
 			return nsm;
 	return NULL;
 }
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 9e4d6aab611b..ad478da7ca63 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
 static int
 nlmsvc_match_ip(void *datap, struct nlm_host *host)
 {
-	return nlm_cmp_addr(nlm_srcaddr(host), datap);
+	return rpc_cmp_addr(nlm_srcaddr(host), datap);
 }
 
 /**
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index c325b187966b..e7a251a988c0 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -338,49 +338,6 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp)
 	}
 }
 
-static inline int __nlm_cmp_addr4(const struct sockaddr *sap1,
-				  const struct sockaddr *sap2)
-{
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
-	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static inline int __nlm_cmp_addr6(const struct sockaddr *sap1,
-				  const struct sockaddr *sap2)
-{
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
-	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
-}
-#else	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-static inline int __nlm_cmp_addr6(const struct sockaddr *sap1,
-				  const struct sockaddr *sap2)
-{
-	return 0;
-}
-#endif	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-
-/*
- * Compare two host addresses
- *
- * Return TRUE if the addresses are the same; otherwise FALSE.
- */
-static inline int nlm_cmp_addr(const struct sockaddr *sap1,
-			       const struct sockaddr *sap2)
-{
-	if (sap1->sa_family == sap2->sa_family) {
-		switch (sap1->sa_family) {
-		case AF_INET:
-			return __nlm_cmp_addr4(sap1, sap2);
-		case AF_INET6:
-			return __nlm_cmp_addr6(sap1, sap2);
-		}
-	}
-	return 0;
-}
-
 /*
  * Compare two NLM locks.
  * When the second lock is of type F_UNLCK, this acts like a wildcard.
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index ab3f6e90caa5..b17df361be82 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -22,6 +22,7 @@
 #include <linux/sunrpc/timer.h>
 #include <asm/signal.h>
 #include <linux/path.h>
+#include <net/ipv6.h>
 
 struct rpc_inode;
 
@@ -188,5 +189,52 @@ static inline void rpc_set_port(struct sockaddr *sap,
 #define IPV6_SCOPE_DELIMITER		'%'
 #define IPV6_SCOPE_ID_LEN		sizeof("%nnnnnnnnnn")
 
+static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1,
+				   const struct sockaddr *sap2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
+
+	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
+				   const struct sockaddr *sap2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
+	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
+}
+#else	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
+static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
+				   const struct sockaddr *sap2)
+{
+	return false;
+}
+#endif	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
+
+/**
+ * rpc_cmp_addr - compare the address portion of two sockaddrs.
+ * @sap1: first sockaddr
+ * @sap2: second sockaddr
+ *
+ * Just compares the family and address portion. Ignores port, scope, etc.
+ * Returns true if the addrs are equal, false if they aren't.
+ */
+static inline bool rpc_cmp_addr(const struct sockaddr *sap1,
+				const struct sockaddr *sap2)
+{
+	if (sap1->sa_family == sap2->sa_family) {
+		switch (sap1->sa_family) {
+		case AF_INET:
+			return __rpc_cmp_addr4(sap1, sap2);
+		case AF_INET6:
+			return __rpc_cmp_addr6(sap1, sap2);
+		}
+	}
+	return false;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
-- 
cgit v1.2.3-71-gd317


From be3ad6b0b675fd1d6b48362ca30bdee75fbef6b4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Aug 2009 12:57:55 -0400
Subject: sunrpc: add common routine for copying address portion of a sockaddr

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/sunrpc/clnt.h | 50 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index b17df361be82..044f531aee70 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -198,6 +198,17 @@ static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1,
 	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
 }
 
+static inline bool __rpc_copy_addr4(struct sockaddr *dst,
+				    const struct sockaddr *src)
+{
+	const struct sockaddr_in *ssin = (struct sockaddr_in *) src;
+	struct sockaddr_in *dsin = (struct sockaddr_in *) dst;
+
+	dsin->sin_family = ssin->sin_family;
+	dsin->sin_addr.s_addr = ssin->sin_addr.s_addr;
+	return true;
+}
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
 				   const struct sockaddr *sap2)
@@ -206,12 +217,29 @@ static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
 	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
 	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
 }
+
+static inline bool __rpc_copy_addr6(struct sockaddr *dst,
+				    const struct sockaddr *src)
+{
+	const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src;
+	struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst;
+
+	dsin6->sin6_family = ssin6->sin6_family;
+	ipv6_addr_copy(&dsin6->sin6_addr, &ssin6->sin6_addr);
+	return true;
+}
 #else	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
 static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
 				   const struct sockaddr *sap2)
 {
 	return false;
 }
+
+static inline bool __rpc_copy_addr6(struct sockaddr *dst,
+				    const struct sockaddr *src)
+{
+	return false;
+}
 #endif	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
 
 /**
@@ -236,5 +264,27 @@ static inline bool rpc_cmp_addr(const struct sockaddr *sap1,
 	return false;
 }
 
+/**
+ * rpc_copy_addr - copy the address portion of one sockaddr to another
+ * @dst: destination sockaddr
+ * @src: source sockaddr
+ *
+ * Just copies the address portion and family. Ignores port, scope, etc.
+ * Caller is responsible for making certain that dst is large enough to hold
+ * the address in src. Returns true if address family is supported. Returns
+ * false otherwise.
+ */
+static inline bool rpc_copy_addr(struct sockaddr *dst,
+				 const struct sockaddr *src)
+{
+	switch (src->sa_family) {
+	case AF_INET:
+		return __rpc_copy_addr4(dst, src);
+	case AF_INET6:
+		return __rpc_copy_addr6(dst, src);
+	}
+	return false;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
-- 
cgit v1.2.3-71-gd317


From 363168b4ea8ec26aeb982ac6024a09f907ecd27e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Aug 2009 12:57:56 -0400
Subject: nfsd: make nfs4_client->cl_addr a struct sockaddr_storage

It's currently a __be32, which isn't big enough to hold an IPv6 address.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c        | 32 +++++++++++++++++++-------------
 include/linux/nfsd/state.h |  2 +-
 2 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9295c4b56bce..bfc14d879ea1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -55,6 +55,7 @@
 #include <linux/lockd/bind.h>
 #include <linux/module.h>
 #include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/clnt.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -1220,13 +1221,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	int status;
 	unsigned int		strhashval;
 	char			dname[HEXDIR_LEN];
+	char			addr_str[INET6_ADDRSTRLEN];
 	nfs4_verifier		verf = exid->verifier;
-	u32			ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+	struct sockaddr		*sa = svc_addr(rqstp);
 
+	rpc_ntop(sa, addr_str, sizeof(addr_str));
 	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
-		" ip_addr=%u flags %x, spa_how %d\n",
+		"ip_addr=%s flags %x, spa_how %d\n",
 		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
-		ip_addr, exid->flags, exid->spa_how);
+		addr_str, exid->flags, exid->spa_how);
 
 	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
 		return nfserr_inval;
@@ -1315,7 +1318,7 @@ out_new:
 
 	copy_verf(new, &verf);
 	copy_cred(&new->cl_cred, &rqstp->rq_cred);
-	new->cl_addr = ip_addr;
+	rpc_copy_addr((struct sockaddr *) &new->cl_addr, sa);
 	gen_clid(new);
 	gen_confirm(new);
 	add_to_unconfirmed(new, strhashval);
@@ -1389,7 +1392,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		     struct nfsd4_compound_state *cstate,
 		     struct nfsd4_create_session *cr_ses)
 {
-	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
 	struct nfsd4_clid_slot *cs_slot = NULL;
 	int status = 0;
@@ -1417,7 +1420,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cs_slot->sl_seqid++;
 	} else if (unconf) {
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
-		    (ip_addr != unconf->cl_addr)) {
+		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
 			status = nfserr_clid_inuse;
 			goto out;
 		}
@@ -1564,7 +1567,7 @@ __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
-	struct sockaddr_in	*sin = svc_addr_in(rqstp);
+	struct sockaddr		*sa = svc_addr(rqstp);
 	struct xdr_netobj 	clname = { 
 		.len = setclid->se_namelen,
 		.data = setclid->se_name,
@@ -1596,8 +1599,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
 		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
-			dprintk("NFSD: setclientid: string in use by client"
-				" at %pI4\n", &conf->cl_addr);
+			char addr_str[INET6_ADDRSTRLEN];
+			rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
+				 sizeof(addr_str));
+			dprintk("NFSD: setclientid: string in use by client "
+				"at %s\n", addr_str);
 			goto out;
 		}
 	}
@@ -1659,7 +1665,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		gen_clid(new);
 	}
 	copy_verf(new, &clverifier);
-	new->cl_addr = sin->sin_addr.s_addr;
+	rpc_copy_addr((struct sockaddr *) &new->cl_addr, sa);
 	new->cl_flavor = rqstp->rq_flavor;
 	princ = svc_gss_principal(rqstp);
 	if (princ) {
@@ -1693,7 +1699,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			 struct nfsd4_compound_state *cstate,
 			 struct nfsd4_setclientid_confirm *setclientid_confirm)
 {
-	struct sockaddr_in *sin = svc_addr_in(rqstp);
+	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
 	nfs4_verifier confirm = setclientid_confirm->sc_confirm; 
 	clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -1712,9 +1718,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	unconf = find_unconfirmed_client(clid);
 
 	status = nfserr_clid_inuse;
-	if (conf && conf->cl_addr != sin->sin_addr.s_addr)
+	if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
 		goto out;
-	if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
+	if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
 		goto out;
 
 	/*
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 58bb19784e12..3510ddd4be49 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -200,7 +200,7 @@ struct nfs4_client {
 	char                    cl_recdir[HEXDIR_LEN]; /* recovery dir */
 	nfs4_verifier		cl_verifier; 	/* generated by client */
 	time_t                  cl_time;        /* time of last lease renewal */
-	__be32			cl_addr; 	/* client ipaddress */
+	struct sockaddr_storage	cl_addr; 	/* client ipaddress */
 	u32			cl_flavor;	/* setclientid pseudoflavor */
 	char			*cl_principal;	/* setclientid principal name */
 	struct svc_cred		cl_cred; 	/* setclientid principal */
-- 
cgit v1.2.3-71-gd317


From aa9a4ec7707a5391cde556f3fa1b0eb4bca3bcf6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Aug 2009 12:57:57 -0400
Subject: nfsd: convert nfs4_cb_conn struct to hold address in sockaddr_storage

...rather than as a separate address and port fields. This will be
necessary for implementing callbacks over IPv6. Also, convert
gen_callback to use the standard rpcuaddr2sockaddr routine rather than
its own private one.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 11 ++-----
 fs/nfsd/nfs4state.c        | 81 ++++++----------------------------------------
 include/linux/nfsd/state.h |  4 +--
 3 files changed, 13 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3fd23f7aceca..81d1c5285dcc 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -377,7 +377,6 @@ static int max_cb_time(void)
 
 int setup_callback_client(struct nfs4_client *clp)
 {
-	struct sockaddr_in	addr;
 	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_timeout	timeparms = {
 		.to_initval	= max_cb_time(),
@@ -385,8 +384,8 @@ int setup_callback_client(struct nfs4_client *clp)
 	};
 	struct rpc_create_args args = {
 		.protocol	= IPPROTO_TCP,
-		.address	= (struct sockaddr *)&addr,
-		.addrsize	= sizeof(addr),
+		.address	= (struct sockaddr *) &cb->cb_addr,
+		.addrsize	= cb->cb_addrlen,
 		.timeout	= &timeparms,
 		.program	= &cb_program,
 		.prognumber	= cb->cb_prog,
@@ -400,12 +399,6 @@ int setup_callback_client(struct nfs4_client *clp)
 	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
 		return -EINVAL;
 
-	/* Initialize address */
-	memset(&addr, 0, sizeof(addr));
-	addr.sin_family = AF_INET;
-	addr.sin_port = htons(cb->cb_port);
-	addr.sin_addr.s_addr = htonl(cb->cb_addr);
-
 	/* Create RPC client */
 	client = rpc_create(&args);
 	if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bfc14d879ea1..96a742308cee 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -897,76 +897,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 	return NULL;
 }
 
-/* a helper function for parse_callback */
-static int
-parse_octet(unsigned int *lenp, char **addrp)
-{
-	unsigned int len = *lenp;
-	char *p = *addrp;
-	int n = -1;
-	char c;
-
-	for (;;) {
-		if (!len)
-			break;
-		len--;
-		c = *p++;
-		if (c == '.')
-			break;
-		if ((c < '0') || (c > '9')) {
-			n = -1;
-			break;
-		}
-		if (n < 0)
-			n = 0;
-		n = (n * 10) + (c - '0');
-		if (n > 255) {
-			n = -1;
-			break;
-		}
-	}
-	*lenp = len;
-	*addrp = p;
-	return n;
-}
-
-/* parse and set the setclientid ipv4 callback address */
-static int
-parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
-{
-	int temp = 0;
-	u32 cbaddr = 0;
-	u16 cbport = 0;
-	u32 addrlen = addr_len;
-	char *addr = addr_val;
-	int i, shift;
-
-	/* ipaddress */
-	shift = 24;
-	for(i = 4; i > 0  ; i--) {
-		if ((temp = parse_octet(&addrlen, &addr)) < 0) {
-			return 0;
-		}
-		cbaddr |= (temp << shift);
-		if (shift > 0)
-		shift -= 8;
-	}
-	*cbaddrp = cbaddr;
-
-	/* port */
-	shift = 8;
-	for(i = 2; i > 0  ; i--) {
-		if ((temp = parse_octet(&addrlen, &addr)) < 0) {
-			return 0;
-		}
-		cbport |= (temp << shift);
-		if (shift > 0)
-			shift -= 8;
-	}
-	*cbportp = cbport;
-	return 1;
-}
-
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 {
@@ -976,14 +906,21 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 	if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
 		goto out_err;
 
-	if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
-	                 &cb->cb_addr, &cb->cb_port)))
+	cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+					    se->se_callback_addr_len,
+					    (struct sockaddr *) &cb->cb_addr,
+					    sizeof(cb->cb_addr));
+
+	if (!cb->cb_addrlen || cb->cb_addr.ss_family != AF_INET)
 		goto out_err;
+
 	cb->cb_minorversion = 0;
 	cb->cb_prog = se->se_callback_prog;
 	cb->cb_ident = se->se_callback_ident;
 	return;
 out_err:
+	cb->cb_addr.ss_family = AF_UNSPEC;
+	cb->cb_addrlen = 0;
 	dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
 		"will not receive delegations\n",
 		clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 3510ddd4be49..fb0c404c7c5c 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -81,8 +81,8 @@ struct nfs4_delegation {
 /* client delegation callback info */
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
-	u32                     cb_addr;
-	unsigned short          cb_port;
+	struct sockaddr_storage	cb_addr;
+	size_t			cb_addrlen;
 	u32                     cb_prog;
 	u32			cb_minorversion;
 	u32                     cb_ident;	/* minorversion 0 only */
-- 
cgit v1.2.3-71-gd317


From fbf4665f41b02e757ab9d9198df65e319388e728 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Aug 2009 12:57:59 -0400
Subject: nfsd: populate sin6_scope_id on callback address with scopeid from
 rq_addr on SETCLIENTID call

When a SETCLIENTID call comes in, one of the args given is the svc_rqst.
This struct contains an rq_addr field which holds the address that sent
the call. If this is an IPv6 address, then we can use the sin6_scope_id
field in this address to populate the sin6_scope_id field in the
callback address.

AFAICT, the rq_addr.sin6_scope_id is non-zero if and only if the client
mounted the server's link-local address.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c         |  7 +++++--
 include/linux/sunrpc/clnt.h | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9ec0ca1ef4ea..d2a052480908 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -898,7 +898,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 }
 
 static void
-gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 {
 	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	unsigned short expected_family;
@@ -921,6 +921,9 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 	if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
 		goto out_err;
 
+	if (cb->cb_addr.ss_family == AF_INET6)
+		((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
+
 	cb->cb_minorversion = 0;
 	cb->cb_prog = se->se_callback_prog;
 	cb->cb_ident = se->se_callback_ident;
@@ -1621,7 +1624,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	copy_cred(&new->cl_cred, &rqstp->rq_cred);
 	gen_confirm(new);
-	gen_callback(new, setclid);
+	gen_callback(new, setclid, rpc_get_scope_id(sa));
 	add_to_unconfirmed(new, strhashval);
 	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
 	setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 044f531aee70..3d025588e56e 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -286,5 +286,20 @@ static inline bool rpc_copy_addr(struct sockaddr *dst,
 	return false;
 }
 
+/**
+ * rpc_get_scope_id - return scopeid for a given sockaddr
+ * @sa: sockaddr to get scopeid from
+ *
+ * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if
+ * not an AF_INET6 address.
+ */
+static inline u32 rpc_get_scope_id(const struct sockaddr *sa)
+{
+	if (sa->sa_family != AF_INET6)
+		return 0;
+
+	return ((struct sockaddr_in6 *) sa)->sin6_scope_id;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
-- 
cgit v1.2.3-71-gd317


From da15cfdae03351c689736f8d142618592e3cebc3 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Wed, 19 Aug 2009 19:13:34 -0700
Subject: time: Introduce CLOCK_REALTIME_COARSE

After talking with some application writers who want very fast, but not
fine-grained timestamps, I decided to try to implement new clock_ids
to clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE
which returns the time at the last tick. This is very fast as we don't
have to access any hardware (which can be very painful if you're using
something like the acpi_pm clocksource), and we can even use the vdso
clock_gettime() method to avoid the syscall. The only trade off is you
only get low-res tick grained time resolution.

This isn't a new idea, I know Ingo has a patch in the -rt tree that made
the vsyscall gettimeofday() return coarse grained time when the
vsyscall64 sysctrl was set to 2. However this affects all applications
on a system.

With this method, applications can choose the proper speed/granularity
trade-off for themselves.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: nikolag@ca.ibm.com
Cc: Darren Hart <dvhltc@us.ibm.com>
Cc: arjan@infradead.org
Cc: jonathan@jonmasters.org
LKML-Reference: <1250734414.6897.5.camel@localhost.localdomain>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/vgtod.h   |  1 +
 arch/x86/kernel/vsyscall_64.c  |  1 +
 arch/x86/vdso/vclock_gettime.c | 39 ++++++++++++++++++++++++++++++++++++---
 include/linux/time.h           |  4 ++++
 kernel/posix-timers.c          | 35 +++++++++++++++++++++++++++++++++++
 kernel/time/timekeeping.c      | 21 +++++++++++++++++++++
 6 files changed, 98 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index dc27a69e5d2a..3d61e204826f 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -21,6 +21,7 @@ struct vsyscall_gtod_data {
 		u32	shift;
 	} clock;
 	struct timespec wall_to_monotonic;
+	struct timespec wall_time_coarse;
 };
 extern struct vsyscall_gtod_data __vsyscall_gtod_data
 __section_vsyscall_gtod_data;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 25ee06a80aad..cf53a78e2dcf 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6a40b78b46aa..ee55754cc3c5 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts)
 	return 0;
 }
 
+notrace static noinline int do_realtime_coarse(struct timespec *ts)
+{
+	unsigned long seq;
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		ts->tv_sec = gtod->wall_time_coarse.tv_sec;
+		ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	return 0;
+}
+
+notrace static noinline int do_monotonic_coarse(struct timespec *ts)
+{
+	unsigned long seq, ns, secs;
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		secs = gtod->wall_time_coarse.tv_sec;
+		ns = gtod->wall_time_coarse.tv_nsec;
+		secs += gtod->wall_to_monotonic.tv_sec;
+		ns += gtod->wall_to_monotonic.tv_nsec;
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	vset_normalized_timespec(ts, secs, ns);
+	return 0;
+}
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
-	if (likely(gtod->sysctl_enabled && gtod->clock.vread))
+	if (likely(gtod->sysctl_enabled))
 		switch (clock) {
 		case CLOCK_REALTIME:
-			return do_realtime(ts);
+			if (likely(gtod->clock.vread))
+				return do_realtime(ts);
+			break;
 		case CLOCK_MONOTONIC:
-			return do_monotonic(ts);
+			if (likely(gtod->clock.vread))
+				return do_monotonic(ts);
+			break;
+		case CLOCK_REALTIME_COARSE:
+			return do_realtime_coarse(ts);
+		case CLOCK_MONOTONIC_COARSE:
+			return do_monotonic_coarse(ts);
 		}
 	return vdso_fallback_gettime(clock, ts);
 }
diff --git a/include/linux/time.h b/include/linux/time.h
index f505988398e6..256232f7e5e6 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -110,6 +110,8 @@ extern int timekeeping_suspended;
 
 unsigned long get_seconds(void);
 struct timespec current_kernel_time(void);
+struct timespec __current_kernel_time(void); /* does not hold xtime_lock */
+struct timespec get_monotonic_coarse(void);
 
 #define CURRENT_TIME		(current_kernel_time())
 #define CURRENT_TIME_SEC	((struct timespec) { get_seconds(), 0 })
@@ -243,6 +245,8 @@ struct itimerval {
 #define CLOCK_PROCESS_CPUTIME_ID	2
 #define CLOCK_THREAD_CPUTIME_ID		3
 #define CLOCK_MONOTONIC_RAW		4
+#define CLOCK_REALTIME_COARSE		5
+#define CLOCK_MONOTONIC_COARSE		6
 
 /*
  * The IDs of various hardware clocks:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+	*tp = current_kernel_time();
+	return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+						struct timespec *tp)
+{
+	*tp = get_monotonic_coarse();
+	return 0;
+}
+
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+	*tp = ktime_to_timespec(KTIME_LOW_RES);
+	return 0;
+}
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
 		.timer_create = no_timer_create,
 		.nsleep = no_nsleep,
 	};
+	struct k_clock clock_realtime_coarse = {
+		.clock_getres = posix_get_coarse_res,
+		.clock_get = posix_get_realtime_coarse,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = no_timer_create,
+		.nsleep = no_nsleep,
+	};
+	struct k_clock clock_monotonic_coarse = {
+		.clock_getres = posix_get_coarse_res,
+		.clock_get = posix_get_monotonic_coarse,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = no_timer_create,
+		.nsleep = no_nsleep,
+	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 	register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+	register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+	register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 15e06defca55..03cbeb34d141 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -847,6 +847,10 @@ unsigned long get_seconds(void)
 }
 EXPORT_SYMBOL(get_seconds);
 
+struct timespec __current_kernel_time(void)
+{
+	return xtime_cache;
+}
 
 struct timespec current_kernel_time(void)
 {
@@ -862,3 +866,20 @@ struct timespec current_kernel_time(void)
 	return now;
 }
 EXPORT_SYMBOL(current_kernel_time);
+
+struct timespec get_monotonic_coarse(void)
+{
+	struct timespec now, mono;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		now = xtime_cache;
+		mono = wall_to_monotonic;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+				now.tv_nsec + mono.tv_nsec);
+	return now;
+}
-- 
cgit v1.2.3-71-gd317


From 5e928f77a09a07f9dd595bb8a489965d69a83458 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 18 Aug 2009 23:38:32 +0200
Subject: PM: Introduce core framework for run-time PM of I/O devices (rev. 17)

Introduce a core framework for run-time power management of I/O
devices.  Add device run-time PM fields to 'struct dev_pm_info'
and device run-time PM callbacks to 'struct dev_pm_ops'.  Introduce
a run-time PM workqueue and define some device run-time PM helper
functions at the core level.  Document all these things.

Special thanks to Alan Stern for his help with the design and
multiple detailed reviews of the pereceding versions of this patch
and to Magnus Damm for testing feedback.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Magnus Damm <damm@igel.co.jp>
---
 Documentation/power/runtime_pm.txt |  378 ++++++++++++++
 drivers/base/dd.c                  |   11 +
 drivers/base/power/Makefile        |    1 +
 drivers/base/power/main.c          |   22 +-
 drivers/base/power/power.h         |   31 +-
 drivers/base/power/runtime.c       | 1011 ++++++++++++++++++++++++++++++++++++
 include/linux/pm.h                 |  101 +++-
 include/linux/pm_runtime.h         |  114 ++++
 kernel/power/Kconfig               |   14 +
 kernel/power/main.c                |   17 +
 10 files changed, 1689 insertions(+), 11 deletions(-)
 create mode 100644 Documentation/power/runtime_pm.txt
 create mode 100644 drivers/base/power/runtime.c
 create mode 100644 include/linux/pm_runtime.h

(limited to 'include/linux')

diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
new file mode 100644
index 000000000000..f49a33b704d2
--- /dev/null
+++ b/Documentation/power/runtime_pm.txt
@@ -0,0 +1,378 @@
+Run-time Power Management Framework for I/O Devices
+
+(C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+1. Introduction
+
+Support for run-time power management (run-time PM) of I/O devices is provided
+at the power management core (PM core) level by means of:
+
+* The power management workqueue pm_wq in which bus types and device drivers can
+  put their PM-related work items.  It is strongly recommended that pm_wq be
+  used for queuing all work items related to run-time PM, because this allows
+  them to be synchronized with system-wide power transitions (suspend to RAM,
+  hibernation and resume from system sleep states).  pm_wq is declared in
+  include/linux/pm_runtime.h and defined in kernel/power/main.c.
+
+* A number of run-time PM fields in the 'power' member of 'struct device' (which
+  is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
+  be used for synchronizing run-time PM operations with one another.
+
+* Three device run-time PM callbacks in 'struct dev_pm_ops' (defined in
+  include/linux/pm.h).
+
+* A set of helper functions defined in drivers/base/power/runtime.c that can be
+  used for carrying out run-time PM operations in such a way that the
+  synchronization between them is taken care of by the PM core.  Bus types and
+  device drivers are encouraged to use these functions.
+
+The run-time PM callbacks present in 'struct dev_pm_ops', the device run-time PM
+fields of 'struct dev_pm_info' and the core helper functions provided for
+run-time PM are described below.
+
+2. Device Run-time PM Callbacks
+
+There are three device run-time PM callbacks defined in 'struct dev_pm_ops':
+
+struct dev_pm_ops {
+	...
+	int (*runtime_suspend)(struct device *dev);
+	int (*runtime_resume)(struct device *dev);
+	void (*runtime_idle)(struct device *dev);
+	...
+};
+
+The ->runtime_suspend() callback is executed by the PM core for the bus type of
+the device being suspended.  The bus type's callback is then _entirely_
+_responsible_ for handling the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_suspend() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_suspend()
+callback in a device driver as long as the bus type's ->runtime_suspend() knows
+what to do to handle the device).
+
+  * Once the bus type's ->runtime_suspend() callback has completed successfully
+    for given device, the PM core regards the device as suspended, which need
+    not mean that the device has been put into a low power state.  It is
+    supposed to mean, however, that the device will not process data and will
+    not communicate with the CPU(s) and RAM until its bus type's
+    ->runtime_resume() callback is executed for it.  The run-time PM status of
+    a device after successful execution of its bus type's ->runtime_suspend()
+    callback is 'suspended'.
+
+  * If the bus type's ->runtime_suspend() callback returns -EBUSY or -EAGAIN,
+    the device's run-time PM status is supposed to be 'active', which means that
+    the device _must_ be fully operational afterwards.
+
+  * If the bus type's ->runtime_suspend() callback returns an error code
+    different from -EBUSY or -EAGAIN, the PM core regards this as a fatal
+    error and will refuse to run the helper functions described in Section 4
+    for the device, until the status of it is directly set either to 'active'
+    or to 'suspended' (the PM core provides special helper functions for this
+    purpose).
+
+In particular, if the driver requires remote wakeup capability for proper
+functioning and device_may_wakeup() returns 'false' for the device, then
+->runtime_suspend() should return -EBUSY.  On the other hand, if
+device_may_wakeup() returns 'true' for the device and the device is put
+into a low power state during the execution of its bus type's
+->runtime_suspend(), it is expected that remote wake-up (i.e. hardware mechanism
+allowing the device to request a change of its power state, such as PCI PME)
+will be enabled for the device.  Generally, remote wake-up should be enabled
+for all input devices put into a low power state at run time.
+
+The ->runtime_resume() callback is executed by the PM core for the bus type of
+the device being woken up.  The bus type's callback is then _entirely_
+_responsible_ for handling the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_resume() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_resume()
+callback in a device driver as long as the bus type's ->runtime_resume() knows
+what to do to handle the device).
+
+  * Once the bus type's ->runtime_resume() callback has completed successfully,
+    the PM core regards the device as fully operational, which means that the
+    device _must_ be able to complete I/O operations as needed.  The run-time
+    PM status of the device is then 'active'.
+
+  * If the bus type's ->runtime_resume() callback returns an error code, the PM
+    core regards this as a fatal error and will refuse to run the helper
+    functions described in Section 4 for the device, until its status is
+    directly set either to 'active' or to 'suspended' (the PM core provides
+    special helper functions for this purpose).
+
+The ->runtime_idle() callback is executed by the PM core for the bus type of
+given device whenever the device appears to be idle, which is indicated to the
+PM core by two counters, the device's usage counter and the counter of 'active'
+children of the device.
+
+  * If any of these counters is decreased using a helper function provided by
+    the PM core and it turns out to be equal to zero, the other counter is
+    checked.  If that counter also is equal to zero, the PM core executes the
+    device bus type's ->runtime_idle() callback (with the device as an
+    argument).
+
+The action performed by a bus type's ->runtime_idle() callback is totally
+dependent on the bus type in question, but the expected and recommended action
+is to check if the device can be suspended (i.e. if all of the conditions
+necessary for suspending the device are satisfied) and to queue up a suspend
+request for the device in that case.
+
+The helper functions provided by the PM core, described in Section 4, guarantee
+that the following constraints are met with respect to the bus type's run-time
+PM callbacks:
+
+(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
+    ->runtime_suspend() in parallel with ->runtime_resume() or with another
+    instance of ->runtime_suspend() for the same device) with the exception that
+    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
+    ->runtime_idle() (although ->runtime_idle() will not be started while any
+    of the other callbacks is being executed for the same device).
+
+(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
+    devices (i.e. the PM core will only execute ->runtime_idle() or
+    ->runtime_suspend() for the devices the run-time PM status of which is
+    'active').
+
+(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
+    the usage counter of which is equal to zero _and_ either the counter of
+    'active' children of which is equal to zero, or the 'power.ignore_children'
+    flag of which is set.
+
+(4) ->runtime_resume() can only be executed for 'suspended' devices  (i.e. the
+    PM core will only execute ->runtime_resume() for the devices the run-time
+    PM status of which is 'suspended').
+
+Additionally, the helper functions provided by the PM core obey the following
+rules:
+
+  * If ->runtime_suspend() is about to be executed or there's a pending request
+    to execute it, ->runtime_idle() will not be executed for the same device.
+
+  * A request to execute or to schedule the execution of ->runtime_suspend()
+    will cancel any pending requests to execute ->runtime_idle() for the same
+    device.
+
+  * If ->runtime_resume() is about to be executed or there's a pending request
+    to execute it, the other callbacks will not be executed for the same device.
+
+  * A request to execute ->runtime_resume() will cancel any pending or
+    scheduled requests to execute the other callbacks for the same device.
+
+3. Run-time PM Device Fields
+
+The following device run-time PM fields are present in 'struct dev_pm_info', as
+defined in include/linux/pm.h:
+
+  struct timer_list suspend_timer;
+    - timer used for scheduling (delayed) suspend request
+
+  unsigned long timer_expires;
+    - timer expiration time, in jiffies (if this is different from zero, the
+      timer is running and will expire at that time, otherwise the timer is not
+      running)
+
+  struct work_struct work;
+    - work structure used for queuing up requests (i.e. work items in pm_wq)
+
+  wait_queue_head_t wait_queue;
+    - wait queue used if any of the helper functions needs to wait for another
+      one to complete
+
+  spinlock_t lock;
+    - lock used for synchronisation
+
+  atomic_t usage_count;
+    - the usage counter of the device
+
+  atomic_t child_count;
+    - the count of 'active' children of the device
+
+  unsigned int ignore_children;
+    - if set, the value of child_count is ignored (but still updated)
+
+  unsigned int disable_depth;
+    - used for disabling the helper funcions (they work normally if this is
+      equal to zero); the initial value of it is 1 (i.e. run-time PM is
+      initially disabled for all devices)
+
+  unsigned int runtime_error;
+    - if set, there was a fatal error (one of the callbacks returned error code
+      as described in Section 2), so the helper funtions will not work until
+      this flag is cleared; this is the error code returned by the failing
+      callback
+
+  unsigned int idle_notification;
+    - if set, ->runtime_idle() is being executed
+
+  unsigned int request_pending;
+    - if set, there's a pending request (i.e. a work item queued up into pm_wq)
+
+  enum rpm_request request;
+    - type of request that's pending (valid if request_pending is set)
+
+  unsigned int deferred_resume;
+    - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
+      being executed for that device and it is not practical to wait for the
+      suspend to complete; means "start a resume as soon as you've suspended"
+
+  enum rpm_status runtime_status;
+    - the run-time PM status of the device; this field's initial value is
+      RPM_SUSPENDED, which means that each device is initially regarded by the
+      PM core as 'suspended', regardless of its real hardware status
+
+All of the above fields are members of the 'power' member of 'struct device'.
+
+4. Run-time PM Device Helper Functions
+
+The following run-time PM helper functions are defined in
+drivers/base/power/runtime.c and include/linux/pm_runtime.h:
+
+  void pm_runtime_init(struct device *dev);
+    - initialize the device run-time PM fields in 'struct dev_pm_info'
+
+  void pm_runtime_remove(struct device *dev);
+    - make sure that the run-time PM of the device will be disabled after
+      removing the device from device hierarchy
+
+  int pm_runtime_idle(struct device *dev);
+    - execute ->runtime_idle() for the device's bus type; returns 0 on success
+      or error code on failure, where -EINPROGRESS means that ->runtime_idle()
+      is already being executed
+
+  int pm_runtime_suspend(struct device *dev);
+    - execute ->runtime_suspend() for the device's bus type; returns 0 on
+      success, 1 if the device's run-time PM status was already 'suspended', or
+      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
+      to suspend the device again in future
+
+  int pm_runtime_resume(struct device *dev);
+    - execute ->runtime_resume() for the device's bus type; returns 0 on
+      success, 1 if the device's run-time PM status was already 'active' or
+      error code on failure, where -EAGAIN means it may be safe to attempt to
+      resume the device again in future, but 'power.runtime_error' should be
+      checked additionally
+
+  int pm_request_idle(struct device *dev);
+    - submit a request to execute ->runtime_idle() for the device's bus type
+      (the request is represented by a work item in pm_wq); returns 0 on success
+      or error code if the request has not been queued up
+
+  int pm_schedule_suspend(struct device *dev, unsigned int delay);
+    - schedule the execution of ->runtime_suspend() for the device's bus type
+      in future, where 'delay' is the time to wait before queuing up a suspend
+      work item in pm_wq, in milliseconds (if 'delay' is zero, the work item is
+      queued up immediately); returns 0 on success, 1 if the device's PM
+      run-time status was already 'suspended', or error code if the request
+      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
+      ->runtime_suspend() is already scheduled and not yet expired, the new
+      value of 'delay' will be used as the time to wait
+
+  int pm_request_resume(struct device *dev);
+    - submit a request to execute ->runtime_resume() for the device's bus type
+      (the request is represented by a work item in pm_wq); returns 0 on
+      success, 1 if the device's run-time PM status was already 'active', or
+      error code if the request hasn't been queued up
+
+  void pm_runtime_get_noresume(struct device *dev);
+    - increment the device's usage counter
+
+  int pm_runtime_get(struct device *dev);
+    - increment the device's usage counter, run pm_request_resume(dev) and
+      return its result
+
+  int pm_runtime_get_sync(struct device *dev);
+    - increment the device's usage counter, run pm_runtime_resume(dev) and
+      return its result
+
+  void pm_runtime_put_noidle(struct device *dev);
+    - decrement the device's usage counter
+
+  int pm_runtime_put(struct device *dev);
+    - decrement the device's usage counter, run pm_request_idle(dev) and return
+      its result
+
+  int pm_runtime_put_sync(struct device *dev);
+    - decrement the device's usage counter, run pm_runtime_idle(dev) and return
+      its result
+
+  void pm_runtime_enable(struct device *dev);
+    - enable the run-time PM helper functions to run the device bus type's
+      run-time PM callbacks described in Section 2
+
+  int pm_runtime_disable(struct device *dev);
+    - prevent the run-time PM helper functions from running the device bus
+      type's run-time PM callbacks, make sure that all of the pending run-time
+      PM operations on the device are either completed or canceled; returns
+      1 if there was a resume request pending and it was necessary to execute
+      ->runtime_resume() for the device's bus type to satisfy that request,
+      otherwise 0 is returned
+
+  void pm_suspend_ignore_children(struct device *dev, bool enable);
+    - set/unset the power.ignore_children flag of the device
+
+  int pm_runtime_set_active(struct device *dev);
+    - clear the device's 'power.runtime_error' flag, set the device's run-time
+      PM status to 'active' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero); it will fail and return error code if the device has a parent
+      which is not active and the 'power.ignore_children' flag of which is unset
+
+  void pm_runtime_set_suspended(struct device *dev);
+    - clear the device's 'power.runtime_error' flag, set the device's run-time
+      PM status to 'suspended' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero)
+
+It is safe to execute the following helper functions from interrupt context:
+
+pm_request_idle()
+pm_schedule_suspend()
+pm_request_resume()
+pm_runtime_get_noresume()
+pm_runtime_get()
+pm_runtime_put_noidle()
+pm_runtime_put()
+pm_suspend_ignore_children()
+pm_runtime_set_active()
+pm_runtime_set_suspended()
+pm_runtime_enable()
+
+5. Run-time PM Initialization, Device Probing and Removal
+
+Initially, the run-time PM is disabled for all devices, which means that the
+majority of the run-time PM helper funtions described in Section 4 will return
+-EAGAIN until pm_runtime_enable() is called for the device.
+
+In addition to that, the initial run-time PM status of all devices is
+'suspended', but it need not reflect the actual physical state of the device.
+Thus, if the device is initially active (i.e. it is able to process I/O), its
+run-time PM status must be changed to 'active', with the help of
+pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
+
+However, if the device has a parent and the parent's run-time PM is enabled,
+calling pm_runtime_set_active() for the device will affect the parent, unless
+the parent's 'power.ignore_children' flag is set.  Namely, in that case the
+parent won't be able to suspend at run time, using the PM core's helper
+functions, as long as the child's status is 'active', even if the child's
+run-time PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
+the child yet or pm_runtime_disable() has been called for it).  For this reason,
+once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
+should be called for it too as soon as reasonably possible or its run-time PM
+status should be changed back to 'suspended' with the help of
+pm_runtime_set_suspended().
+
+If the default initial run-time PM status of the device (i.e. 'suspended')
+reflects the actual state of the device, its bus type's or its driver's
+->probe() callback will likely need to wake it up using one of the PM core's
+helper functions described in Section 4.  In that case, pm_runtime_resume()
+should be used.  Of course, for this purpose the device's run-time PM has to be
+enabled earlier by calling pm_runtime_enable().
+
+If the device bus type's or driver's ->probe() or ->remove() callback runs
+pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts,
+they will fail returning -EAGAIN, because the device's usage counter is
+incremented by the core before executing ->probe() and ->remove().  Still, it
+may be desirable to suspend the device as soon as ->probe() or ->remove() has
+finished, so the PM core uses pm_runtime_idle_sync() to invoke the device bus
+type's ->runtime_idle() callback at that time.
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index f0106875f01d..7b34b3a48f67 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -23,6 +23,7 @@
 #include <linux/kthread.h>
 #include <linux/wait.h>
 #include <linux/async.h>
+#include <linux/pm_runtime.h>
 
 #include "base.h"
 #include "power/power.h"
@@ -202,7 +203,10 @@ int driver_probe_device(struct device_driver *drv, struct device *dev)
 	pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
 		 drv->bus->name, __func__, dev_name(dev), drv->name);
 
+	pm_runtime_get_noresume(dev);
+	pm_runtime_barrier(dev);
 	ret = really_probe(dev, drv);
+	pm_runtime_put_sync(dev);
 
 	return ret;
 }
@@ -245,7 +249,9 @@ int device_attach(struct device *dev)
 			ret = 0;
 		}
 	} else {
+		pm_runtime_get_noresume(dev);
 		ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach);
+		pm_runtime_put_sync(dev);
 	}
 	up(&dev->sem);
 	return ret;
@@ -306,6 +312,9 @@ static void __device_release_driver(struct device *dev)
 
 	drv = dev->driver;
 	if (drv) {
+		pm_runtime_get_noresume(dev);
+		pm_runtime_barrier(dev);
+
 		driver_sysfs_remove(dev);
 
 		if (dev->bus)
@@ -324,6 +333,8 @@ static void __device_release_driver(struct device *dev)
 			blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
 						     BUS_NOTIFY_UNBOUND_DRIVER,
 						     dev);
+
+		pm_runtime_put_sync(dev);
 	}
 }
 
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 911208b89259..3ce3519e8f30 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_PM)	+= sysfs.o
 obj-$(CONFIG_PM_SLEEP)	+= main.o
+obj-$(CONFIG_PM_RUNTIME)	+= runtime.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
 
 ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 1b1a786b7dec..86990011277b 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -21,6 +21,7 @@
 #include <linux/kallsyms.h>
 #include <linux/mutex.h>
 #include <linux/pm.h>
+#include <linux/pm_runtime.h>
 #include <linux/resume-trace.h>
 #include <linux/rwsem.h>
 #include <linux/interrupt.h>
@@ -48,6 +49,16 @@ static DEFINE_MUTEX(dpm_list_mtx);
  */
 static bool transition_started;
 
+/**
+ * device_pm_init - Initialize the PM-related part of a device object
+ * @dev: Device object being initialized.
+ */
+void device_pm_init(struct device *dev)
+{
+	dev->power.status = DPM_ON;
+	pm_runtime_init(dev);
+}
+
 /**
  *	device_pm_lock - lock the list of active devices used by the PM core
  */
@@ -105,6 +116,7 @@ void device_pm_remove(struct device *dev)
 	mutex_lock(&dpm_list_mtx);
 	list_del_init(&dev->power.entry);
 	mutex_unlock(&dpm_list_mtx);
+	pm_runtime_remove(dev);
 }
 
 /**
@@ -512,6 +524,7 @@ static void dpm_complete(pm_message_t state)
 			mutex_unlock(&dpm_list_mtx);
 
 			device_complete(dev, state);
+			pm_runtime_put_noidle(dev);
 
 			mutex_lock(&dpm_list_mtx);
 		}
@@ -757,7 +770,14 @@ static int dpm_prepare(pm_message_t state)
 		dev->power.status = DPM_PREPARING;
 		mutex_unlock(&dpm_list_mtx);
 
-		error = device_prepare(dev, state);
+		pm_runtime_get_noresume(dev);
+		if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) {
+			/* Wake-up requested during system sleep transition. */
+			pm_runtime_put_noidle(dev);
+			error = -EBUSY;
+		} else {
+			error = device_prepare(dev, state);
+		}
 
 		mutex_lock(&dpm_list_mtx);
 		if (error) {
diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index c7cb4fc3735c..b8fa1aa5225a 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -1,7 +1,14 @@
-static inline void device_pm_init(struct device *dev)
-{
-	dev->power.status = DPM_ON;
-}
+#ifdef CONFIG_PM_RUNTIME
+
+extern void pm_runtime_init(struct device *dev);
+extern void pm_runtime_remove(struct device *dev);
+
+#else /* !CONFIG_PM_RUNTIME */
+
+static inline void pm_runtime_init(struct device *dev) {}
+static inline void pm_runtime_remove(struct device *dev) {}
+
+#endif /* !CONFIG_PM_RUNTIME */
 
 #ifdef CONFIG_PM_SLEEP
 
@@ -16,23 +23,33 @@ static inline struct device *to_device(struct list_head *entry)
 	return container_of(entry, struct device, power.entry);
 }
 
+extern void device_pm_init(struct device *dev);
 extern void device_pm_add(struct device *);
 extern void device_pm_remove(struct device *);
 extern void device_pm_move_before(struct device *, struct device *);
 extern void device_pm_move_after(struct device *, struct device *);
 extern void device_pm_move_last(struct device *);
 
-#else /* CONFIG_PM_SLEEP */
+#else /* !CONFIG_PM_SLEEP */
+
+static inline void device_pm_init(struct device *dev)
+{
+	pm_runtime_init(dev);
+}
+
+static inline void device_pm_remove(struct device *dev)
+{
+	pm_runtime_remove(dev);
+}
 
 static inline void device_pm_add(struct device *dev) {}
-static inline void device_pm_remove(struct device *dev) {}
 static inline void device_pm_move_before(struct device *deva,
 					 struct device *devb) {}
 static inline void device_pm_move_after(struct device *deva,
 					struct device *devb) {}
 static inline void device_pm_move_last(struct device *dev) {}
 
-#endif
+#endif /* !CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_PM
 
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
new file mode 100644
index 000000000000..38556f6cc22d
--- /dev/null
+++ b/drivers/base/power/runtime.c
@@ -0,0 +1,1011 @@
+/*
+ * drivers/base/power/runtime.c - Helper functions for device run-time PM
+ *
+ * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/sched.h>
+#include <linux/pm_runtime.h>
+#include <linux/jiffies.h>
+
+static int __pm_runtime_resume(struct device *dev, bool from_wq);
+static int __pm_request_idle(struct device *dev);
+static int __pm_request_resume(struct device *dev);
+
+/**
+ * pm_runtime_deactivate_timer - Deactivate given device's suspend timer.
+ * @dev: Device to handle.
+ */
+static void pm_runtime_deactivate_timer(struct device *dev)
+{
+	if (dev->power.timer_expires > 0) {
+		del_timer(&dev->power.suspend_timer);
+		dev->power.timer_expires = 0;
+	}
+}
+
+/**
+ * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests.
+ * @dev: Device to handle.
+ */
+static void pm_runtime_cancel_pending(struct device *dev)
+{
+	pm_runtime_deactivate_timer(dev);
+	/*
+	 * In case there's a request pending, make sure its work function will
+	 * return without doing anything.
+	 */
+	dev->power.request = RPM_REQ_NONE;
+}
+
+/**
+ * __pm_runtime_idle - Notify device bus type if the device can be suspended.
+ * @dev: Device to notify the bus type about.
+ *
+ * This function must be called under dev->power.lock with interrupts disabled.
+ */
+static int __pm_runtime_idle(struct device *dev)
+	__releases(&dev->power.lock) __acquires(&dev->power.lock)
+{
+	int retval = 0;
+
+	dev_dbg(dev, "__pm_runtime_idle()!\n");
+
+	if (dev->power.runtime_error)
+		retval = -EINVAL;
+	else if (dev->power.idle_notification)
+		retval = -EINPROGRESS;
+	else if (atomic_read(&dev->power.usage_count) > 0
+	    || dev->power.disable_depth > 0
+	    || dev->power.runtime_status != RPM_ACTIVE)
+		retval = -EAGAIN;
+	else if (!pm_children_suspended(dev))
+		retval = -EBUSY;
+	if (retval)
+		goto out;
+
+	if (dev->power.request_pending) {
+		/*
+		 * If an idle notification request is pending, cancel it.  Any
+		 * other pending request takes precedence over us.
+		 */
+		if (dev->power.request == RPM_REQ_IDLE) {
+			dev->power.request = RPM_REQ_NONE;
+		} else if (dev->power.request != RPM_REQ_NONE) {
+			retval = -EAGAIN;
+			goto out;
+		}
+	}
+
+	dev->power.idle_notification = true;
+
+	if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_idle) {
+		spin_unlock_irq(&dev->power.lock);
+
+		dev->bus->pm->runtime_idle(dev);
+
+		spin_lock_irq(&dev->power.lock);
+	}
+
+	dev->power.idle_notification = false;
+	wake_up_all(&dev->power.wait_queue);
+
+ out:
+	dev_dbg(dev, "__pm_runtime_idle() returns %d!\n", retval);
+
+	return retval;
+}
+
+/**
+ * pm_runtime_idle - Notify device bus type if the device can be suspended.
+ * @dev: Device to notify the bus type about.
+ */
+int pm_runtime_idle(struct device *dev)
+{
+	int retval;
+
+	spin_lock_irq(&dev->power.lock);
+	retval = __pm_runtime_idle(dev);
+	spin_unlock_irq(&dev->power.lock);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_runtime_idle);
+
+/**
+ * __pm_runtime_suspend - Carry out run-time suspend of given device.
+ * @dev: Device to suspend.
+ * @from_wq: If set, the function has been called via pm_wq.
+ *
+ * Check if the device can be suspended and run the ->runtime_suspend() callback
+ * provided by its bus type.  If another suspend has been started earlier, wait
+ * for it to finish.  If an idle notification or suspend request is pending or
+ * scheduled, cancel it.
+ *
+ * This function must be called under dev->power.lock with interrupts disabled.
+ */
+int __pm_runtime_suspend(struct device *dev, bool from_wq)
+	__releases(&dev->power.lock) __acquires(&dev->power.lock)
+{
+	struct device *parent = NULL;
+	bool notify = false;
+	int retval = 0;
+
+	dev_dbg(dev, "__pm_runtime_suspend()%s!\n",
+		from_wq ? " from workqueue" : "");
+
+ repeat:
+	if (dev->power.runtime_error) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	/* Pending resume requests take precedence over us. */
+	if (dev->power.request_pending
+	    && dev->power.request == RPM_REQ_RESUME) {
+		retval = -EAGAIN;
+		goto out;
+	}
+
+	/* Other scheduled or pending requests need to be canceled. */
+	pm_runtime_cancel_pending(dev);
+
+	if (dev->power.runtime_status == RPM_SUSPENDED)
+		retval = 1;
+	else if (dev->power.runtime_status == RPM_RESUMING
+	    || dev->power.disable_depth > 0
+	    || atomic_read(&dev->power.usage_count) > 0)
+		retval = -EAGAIN;
+	else if (!pm_children_suspended(dev))
+		retval = -EBUSY;
+	if (retval)
+		goto out;
+
+	if (dev->power.runtime_status == RPM_SUSPENDING) {
+		DEFINE_WAIT(wait);
+
+		if (from_wq) {
+			retval = -EINPROGRESS;
+			goto out;
+		}
+
+		/* Wait for the other suspend running in parallel with us. */
+		for (;;) {
+			prepare_to_wait(&dev->power.wait_queue, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (dev->power.runtime_status != RPM_SUSPENDING)
+				break;
+
+			spin_unlock_irq(&dev->power.lock);
+
+			schedule();
+
+			spin_lock_irq(&dev->power.lock);
+		}
+		finish_wait(&dev->power.wait_queue, &wait);
+		goto repeat;
+	}
+
+	dev->power.runtime_status = RPM_SUSPENDING;
+
+	if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) {
+		spin_unlock_irq(&dev->power.lock);
+
+		retval = dev->bus->pm->runtime_suspend(dev);
+
+		spin_lock_irq(&dev->power.lock);
+		dev->power.runtime_error = retval;
+	} else {
+		retval = -ENOSYS;
+	}
+
+	if (retval) {
+		dev->power.runtime_status = RPM_ACTIVE;
+		pm_runtime_cancel_pending(dev);
+		dev->power.deferred_resume = false;
+
+		if (retval == -EAGAIN || retval == -EBUSY) {
+			notify = true;
+			dev->power.runtime_error = 0;
+		}
+	} else {
+		dev->power.runtime_status = RPM_SUSPENDED;
+
+		if (dev->parent) {
+			parent = dev->parent;
+			atomic_add_unless(&parent->power.child_count, -1, 0);
+		}
+	}
+	wake_up_all(&dev->power.wait_queue);
+
+	if (dev->power.deferred_resume) {
+		dev->power.deferred_resume = false;
+		__pm_runtime_resume(dev, false);
+		retval = -EAGAIN;
+		goto out;
+	}
+
+	if (notify)
+		__pm_runtime_idle(dev);
+
+	if (parent && !parent->power.ignore_children) {
+		spin_unlock_irq(&dev->power.lock);
+
+		pm_request_idle(parent);
+
+		spin_lock_irq(&dev->power.lock);
+	}
+
+ out:
+	dev_dbg(dev, "__pm_runtime_suspend() returns %d!\n", retval);
+
+	return retval;
+}
+
+/**
+ * pm_runtime_suspend - Carry out run-time suspend of given device.
+ * @dev: Device to suspend.
+ */
+int pm_runtime_suspend(struct device *dev)
+{
+	int retval;
+
+	spin_lock_irq(&dev->power.lock);
+	retval = __pm_runtime_suspend(dev, false);
+	spin_unlock_irq(&dev->power.lock);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_runtime_suspend);
+
+/**
+ * __pm_runtime_resume - Carry out run-time resume of given device.
+ * @dev: Device to resume.
+ * @from_wq: If set, the function has been called via pm_wq.
+ *
+ * Check if the device can be woken up and run the ->runtime_resume() callback
+ * provided by its bus type.  If another resume has been started earlier, wait
+ * for it to finish.  If there's a suspend running in parallel with this
+ * function, wait for it to finish and resume the device.  Cancel any scheduled
+ * or pending requests.
+ *
+ * This function must be called under dev->power.lock with interrupts disabled.
+ */
+int __pm_runtime_resume(struct device *dev, bool from_wq)
+	__releases(&dev->power.lock) __acquires(&dev->power.lock)
+{
+	struct device *parent = NULL;
+	int retval = 0;
+
+	dev_dbg(dev, "__pm_runtime_resume()%s!\n",
+		from_wq ? " from workqueue" : "");
+
+ repeat:
+	if (dev->power.runtime_error) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	pm_runtime_cancel_pending(dev);
+
+	if (dev->power.runtime_status == RPM_ACTIVE)
+		retval = 1;
+	else if (dev->power.disable_depth > 0)
+		retval = -EAGAIN;
+	if (retval)
+		goto out;
+
+	if (dev->power.runtime_status == RPM_RESUMING
+	    || dev->power.runtime_status == RPM_SUSPENDING) {
+		DEFINE_WAIT(wait);
+
+		if (from_wq) {
+			if (dev->power.runtime_status == RPM_SUSPENDING)
+				dev->power.deferred_resume = true;
+			retval = -EINPROGRESS;
+			goto out;
+		}
+
+		/* Wait for the operation carried out in parallel with us. */
+		for (;;) {
+			prepare_to_wait(&dev->power.wait_queue, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (dev->power.runtime_status != RPM_RESUMING
+			    && dev->power.runtime_status != RPM_SUSPENDING)
+				break;
+
+			spin_unlock_irq(&dev->power.lock);
+
+			schedule();
+
+			spin_lock_irq(&dev->power.lock);
+		}
+		finish_wait(&dev->power.wait_queue, &wait);
+		goto repeat;
+	}
+
+	if (!parent && dev->parent) {
+		/*
+		 * Increment the parent's resume counter and resume it if
+		 * necessary.
+		 */
+		parent = dev->parent;
+		spin_unlock_irq(&dev->power.lock);
+
+		pm_runtime_get_noresume(parent);
+
+		spin_lock_irq(&parent->power.lock);
+		/*
+		 * We can resume if the parent's run-time PM is disabled or it
+		 * is set to ignore children.
+		 */
+		if (!parent->power.disable_depth
+		    && !parent->power.ignore_children) {
+			__pm_runtime_resume(parent, false);
+			if (parent->power.runtime_status != RPM_ACTIVE)
+				retval = -EBUSY;
+		}
+		spin_unlock_irq(&parent->power.lock);
+
+		spin_lock_irq(&dev->power.lock);
+		if (retval)
+			goto out;
+		goto repeat;
+	}
+
+	dev->power.runtime_status = RPM_RESUMING;
+
+	if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) {
+		spin_unlock_irq(&dev->power.lock);
+
+		retval = dev->bus->pm->runtime_resume(dev);
+
+		spin_lock_irq(&dev->power.lock);
+		dev->power.runtime_error = retval;
+	} else {
+		retval = -ENOSYS;
+	}
+
+	if (retval) {
+		dev->power.runtime_status = RPM_SUSPENDED;
+		pm_runtime_cancel_pending(dev);
+	} else {
+		dev->power.runtime_status = RPM_ACTIVE;
+		if (parent)
+			atomic_inc(&parent->power.child_count);
+	}
+	wake_up_all(&dev->power.wait_queue);
+
+	if (!retval)
+		__pm_request_idle(dev);
+
+ out:
+	if (parent) {
+		spin_unlock_irq(&dev->power.lock);
+
+		pm_runtime_put(parent);
+
+		spin_lock_irq(&dev->power.lock);
+	}
+
+	dev_dbg(dev, "__pm_runtime_resume() returns %d!\n", retval);
+
+	return retval;
+}
+
+/**
+ * pm_runtime_resume - Carry out run-time resume of given device.
+ * @dev: Device to suspend.
+ */
+int pm_runtime_resume(struct device *dev)
+{
+	int retval;
+
+	spin_lock_irq(&dev->power.lock);
+	retval = __pm_runtime_resume(dev, false);
+	spin_unlock_irq(&dev->power.lock);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_runtime_resume);
+
+/**
+ * pm_runtime_work - Universal run-time PM work function.
+ * @work: Work structure used for scheduling the execution of this function.
+ *
+ * Use @work to get the device object the work is to be done for, determine what
+ * is to be done and execute the appropriate run-time PM function.
+ */
+static void pm_runtime_work(struct work_struct *work)
+{
+	struct device *dev = container_of(work, struct device, power.work);
+	enum rpm_request req;
+
+	spin_lock_irq(&dev->power.lock);
+
+	if (!dev->power.request_pending)
+		goto out;
+
+	req = dev->power.request;
+	dev->power.request = RPM_REQ_NONE;
+	dev->power.request_pending = false;
+
+	switch (req) {
+	case RPM_REQ_NONE:
+		break;
+	case RPM_REQ_IDLE:
+		__pm_runtime_idle(dev);
+		break;
+	case RPM_REQ_SUSPEND:
+		__pm_runtime_suspend(dev, true);
+		break;
+	case RPM_REQ_RESUME:
+		__pm_runtime_resume(dev, true);
+		break;
+	}
+
+ out:
+	spin_unlock_irq(&dev->power.lock);
+}
+
+/**
+ * __pm_request_idle - Submit an idle notification request for given device.
+ * @dev: Device to handle.
+ *
+ * Check if the device's run-time PM status is correct for suspending the device
+ * and queue up a request to run __pm_runtime_idle() for it.
+ *
+ * This function must be called under dev->power.lock with interrupts disabled.
+ */
+static int __pm_request_idle(struct device *dev)
+{
+	int retval = 0;
+
+	if (dev->power.runtime_error)
+		retval = -EINVAL;
+	else if (atomic_read(&dev->power.usage_count) > 0
+	    || dev->power.disable_depth > 0
+	    || dev->power.runtime_status == RPM_SUSPENDED
+	    || dev->power.runtime_status == RPM_SUSPENDING)
+		retval = -EAGAIN;
+	else if (!pm_children_suspended(dev))
+		retval = -EBUSY;
+	if (retval)
+		return retval;
+
+	if (dev->power.request_pending) {
+		/* Any requests other then RPM_REQ_IDLE take precedence. */
+		if (dev->power.request == RPM_REQ_NONE)
+			dev->power.request = RPM_REQ_IDLE;
+		else if (dev->power.request != RPM_REQ_IDLE)
+			retval = -EAGAIN;
+		return retval;
+	}
+
+	dev->power.request = RPM_REQ_IDLE;
+	dev->power.request_pending = true;
+	queue_work(pm_wq, &dev->power.work);
+
+	return retval;
+}
+
+/**
+ * pm_request_idle - Submit an idle notification request for given device.
+ * @dev: Device to handle.
+ */
+int pm_request_idle(struct device *dev)
+{
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+	retval = __pm_request_idle(dev);
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_request_idle);
+
+/**
+ * __pm_request_suspend - Submit a suspend request for given device.
+ * @dev: Device to suspend.
+ *
+ * This function must be called under dev->power.lock with interrupts disabled.
+ */
+static int __pm_request_suspend(struct device *dev)
+{
+	int retval = 0;
+
+	if (dev->power.runtime_error)
+		return -EINVAL;
+
+	if (dev->power.runtime_status == RPM_SUSPENDED)
+		retval = 1;
+	else if (atomic_read(&dev->power.usage_count) > 0
+	    || dev->power.disable_depth > 0)
+		retval = -EAGAIN;
+	else if (dev->power.runtime_status == RPM_SUSPENDING)
+		retval = -EINPROGRESS;
+	else if (!pm_children_suspended(dev))
+		retval = -EBUSY;
+	if (retval < 0)
+		return retval;
+
+	pm_runtime_deactivate_timer(dev);
+
+	if (dev->power.request_pending) {
+		/*
+		 * Pending resume requests take precedence over us, but we can
+		 * overtake any other pending request.
+		 */
+		if (dev->power.request == RPM_REQ_RESUME)
+			retval = -EAGAIN;
+		else if (dev->power.request != RPM_REQ_SUSPEND)
+			dev->power.request = retval ?
+						RPM_REQ_NONE : RPM_REQ_SUSPEND;
+		return retval;
+	} else if (retval) {
+		return retval;
+	}
+
+	dev->power.request = RPM_REQ_SUSPEND;
+	dev->power.request_pending = true;
+	queue_work(pm_wq, &dev->power.work);
+
+	return 0;
+}
+
+/**
+ * pm_suspend_timer_fn - Timer function for pm_schedule_suspend().
+ * @data: Device pointer passed by pm_schedule_suspend().
+ *
+ * Check if the time is right and execute __pm_request_suspend() in that case.
+ */
+static void pm_suspend_timer_fn(unsigned long data)
+{
+	struct device *dev = (struct device *)data;
+	unsigned long flags;
+	unsigned long expires;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+
+	expires = dev->power.timer_expires;
+	/* If 'expire' is after 'jiffies' we've been called too early. */
+	if (expires > 0 && !time_after(expires, jiffies)) {
+		dev->power.timer_expires = 0;
+		__pm_request_suspend(dev);
+	}
+
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+}
+
+/**
+ * pm_schedule_suspend - Set up a timer to submit a suspend request in future.
+ * @dev: Device to suspend.
+ * @delay: Time to wait before submitting a suspend request, in milliseconds.
+ */
+int pm_schedule_suspend(struct device *dev, unsigned int delay)
+{
+	unsigned long flags;
+	int retval = 0;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+
+	if (dev->power.runtime_error) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (!delay) {
+		retval = __pm_request_suspend(dev);
+		goto out;
+	}
+
+	pm_runtime_deactivate_timer(dev);
+
+	if (dev->power.request_pending) {
+		/*
+		 * Pending resume requests take precedence over us, but any
+		 * other pending requests have to be canceled.
+		 */
+		if (dev->power.request == RPM_REQ_RESUME) {
+			retval = -EAGAIN;
+			goto out;
+		}
+		dev->power.request = RPM_REQ_NONE;
+	}
+
+	if (dev->power.runtime_status == RPM_SUSPENDED)
+		retval = 1;
+	else if (dev->power.runtime_status == RPM_SUSPENDING)
+		retval = -EINPROGRESS;
+	else if (atomic_read(&dev->power.usage_count) > 0
+	    || dev->power.disable_depth > 0)
+		retval = -EAGAIN;
+	else if (!pm_children_suspended(dev))
+		retval = -EBUSY;
+	if (retval)
+		goto out;
+
+	dev->power.timer_expires = jiffies + msecs_to_jiffies(delay);
+	mod_timer(&dev->power.suspend_timer, dev->power.timer_expires);
+
+ out:
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_schedule_suspend);
+
+/**
+ * pm_request_resume - Submit a resume request for given device.
+ * @dev: Device to resume.
+ *
+ * This function must be called under dev->power.lock with interrupts disabled.
+ */
+static int __pm_request_resume(struct device *dev)
+{
+	int retval = 0;
+
+	if (dev->power.runtime_error)
+		return -EINVAL;
+
+	if (dev->power.runtime_status == RPM_ACTIVE)
+		retval = 1;
+	else if (dev->power.runtime_status == RPM_RESUMING)
+		retval = -EINPROGRESS;
+	else if (dev->power.disable_depth > 0)
+		retval = -EAGAIN;
+	if (retval < 0)
+		return retval;
+
+	pm_runtime_deactivate_timer(dev);
+
+	if (dev->power.request_pending) {
+		/* If non-resume request is pending, we can overtake it. */
+		dev->power.request = retval ? RPM_REQ_NONE : RPM_REQ_RESUME;
+		return retval;
+	} else if (retval) {
+		return retval;
+	}
+
+	dev->power.request = RPM_REQ_RESUME;
+	dev->power.request_pending = true;
+	queue_work(pm_wq, &dev->power.work);
+
+	return retval;
+}
+
+/**
+ * pm_request_resume - Submit a resume request for given device.
+ * @dev: Device to resume.
+ */
+int pm_request_resume(struct device *dev)
+{
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+	retval = __pm_request_resume(dev);
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_request_resume);
+
+/**
+ * __pm_runtime_get - Reference count a device and wake it up, if necessary.
+ * @dev: Device to handle.
+ * @sync: If set and the device is suspended, resume it synchronously.
+ *
+ * Increment the usage count of the device and if it was zero previously,
+ * resume it or submit a resume request for it, depending on the value of @sync.
+ */
+int __pm_runtime_get(struct device *dev, bool sync)
+{
+	int retval = 1;
+
+	if (atomic_add_return(1, &dev->power.usage_count) == 1)
+		retval = sync ? pm_runtime_resume(dev) : pm_request_resume(dev);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(__pm_runtime_get);
+
+/**
+ * __pm_runtime_put - Decrement the device's usage counter and notify its bus.
+ * @dev: Device to handle.
+ * @sync: If the device's bus type is to be notified, do that synchronously.
+ *
+ * Decrement the usage count of the device and if it reaches zero, carry out a
+ * synchronous idle notification or submit an idle notification request for it,
+ * depending on the value of @sync.
+ */
+int __pm_runtime_put(struct device *dev, bool sync)
+{
+	int retval = 0;
+
+	if (atomic_dec_and_test(&dev->power.usage_count))
+		retval = sync ? pm_runtime_idle(dev) : pm_request_idle(dev);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(__pm_runtime_put);
+
+/**
+ * __pm_runtime_set_status - Set run-time PM status of a device.
+ * @dev: Device to handle.
+ * @status: New run-time PM status of the device.
+ *
+ * If run-time PM of the device is disabled or its power.runtime_error field is
+ * different from zero, the status may be changed either to RPM_ACTIVE, or to
+ * RPM_SUSPENDED, as long as that reflects the actual state of the device.
+ * However, if the device has a parent and the parent is not active, and the
+ * parent's power.ignore_children flag is unset, the device's status cannot be
+ * set to RPM_ACTIVE, so -EBUSY is returned in that case.
+ *
+ * If successful, __pm_runtime_set_status() clears the power.runtime_error field
+ * and the device parent's counter of unsuspended children is modified to
+ * reflect the new status.  If the new status is RPM_SUSPENDED, an idle
+ * notification request for the parent is submitted.
+ */
+int __pm_runtime_set_status(struct device *dev, unsigned int status)
+{
+	struct device *parent = dev->parent;
+	unsigned long flags;
+	bool notify_parent = false;
+	int error = 0;
+
+	if (status != RPM_ACTIVE && status != RPM_SUSPENDED)
+		return -EINVAL;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+
+	if (!dev->power.runtime_error && !dev->power.disable_depth) {
+		error = -EAGAIN;
+		goto out;
+	}
+
+	if (dev->power.runtime_status == status)
+		goto out_set;
+
+	if (status == RPM_SUSPENDED) {
+		/* It always is possible to set the status to 'suspended'. */
+		if (parent) {
+			atomic_add_unless(&parent->power.child_count, -1, 0);
+			notify_parent = !parent->power.ignore_children;
+		}
+		goto out_set;
+	}
+
+	if (parent) {
+		spin_lock_irq(&parent->power.lock);
+
+		/*
+		 * It is invalid to put an active child under a parent that is
+		 * not active, has run-time PM enabled and the
+		 * 'power.ignore_children' flag unset.
+		 */
+		if (!parent->power.disable_depth
+		    && !parent->power.ignore_children
+		    && parent->power.runtime_status != RPM_ACTIVE) {
+			error = -EBUSY;
+		} else {
+			if (dev->power.runtime_status == RPM_SUSPENDED)
+				atomic_inc(&parent->power.child_count);
+		}
+
+		spin_unlock_irq(&parent->power.lock);
+
+		if (error)
+			goto out;
+	}
+
+ out_set:
+	dev->power.runtime_status = status;
+	dev->power.runtime_error = 0;
+ out:
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+
+	if (notify_parent)
+		pm_request_idle(parent);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(__pm_runtime_set_status);
+
+/**
+ * __pm_runtime_barrier - Cancel pending requests and wait for completions.
+ * @dev: Device to handle.
+ *
+ * Flush all pending requests for the device from pm_wq and wait for all
+ * run-time PM operations involving the device in progress to complete.
+ *
+ * Should be called under dev->power.lock with interrupts disabled.
+ */
+static void __pm_runtime_barrier(struct device *dev)
+{
+	pm_runtime_deactivate_timer(dev);
+
+	if (dev->power.request_pending) {
+		dev->power.request = RPM_REQ_NONE;
+		spin_unlock_irq(&dev->power.lock);
+
+		cancel_work_sync(&dev->power.work);
+
+		spin_lock_irq(&dev->power.lock);
+		dev->power.request_pending = false;
+	}
+
+	if (dev->power.runtime_status == RPM_SUSPENDING
+	    || dev->power.runtime_status == RPM_RESUMING
+	    || dev->power.idle_notification) {
+		DEFINE_WAIT(wait);
+
+		/* Suspend, wake-up or idle notification in progress. */
+		for (;;) {
+			prepare_to_wait(&dev->power.wait_queue, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (dev->power.runtime_status != RPM_SUSPENDING
+			    && dev->power.runtime_status != RPM_RESUMING
+			    && !dev->power.idle_notification)
+				break;
+			spin_unlock_irq(&dev->power.lock);
+
+			schedule();
+
+			spin_lock_irq(&dev->power.lock);
+		}
+		finish_wait(&dev->power.wait_queue, &wait);
+	}
+}
+
+/**
+ * pm_runtime_barrier - Flush pending requests and wait for completions.
+ * @dev: Device to handle.
+ *
+ * Prevent the device from being suspended by incrementing its usage counter and
+ * if there's a pending resume request for the device, wake the device up.
+ * Next, make sure that all pending requests for the device have been flushed
+ * from pm_wq and wait for all run-time PM operations involving the device in
+ * progress to complete.
+ *
+ * Return value:
+ * 1, if there was a resume request pending and the device had to be woken up,
+ * 0, otherwise
+ */
+int pm_runtime_barrier(struct device *dev)
+{
+	int retval = 0;
+
+	pm_runtime_get_noresume(dev);
+	spin_lock_irq(&dev->power.lock);
+
+	if (dev->power.request_pending
+	    && dev->power.request == RPM_REQ_RESUME) {
+		__pm_runtime_resume(dev, false);
+		retval = 1;
+	}
+
+	__pm_runtime_barrier(dev);
+
+	spin_unlock_irq(&dev->power.lock);
+	pm_runtime_put_noidle(dev);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_runtime_barrier);
+
+/**
+ * __pm_runtime_disable - Disable run-time PM of a device.
+ * @dev: Device to handle.
+ * @check_resume: If set, check if there's a resume request for the device.
+ *
+ * Increment power.disable_depth for the device and if was zero previously,
+ * cancel all pending run-time PM requests for the device and wait for all
+ * operations in progress to complete.  The device can be either active or
+ * suspended after its run-time PM has been disabled.
+ *
+ * If @check_resume is set and there's a resume request pending when
+ * __pm_runtime_disable() is called and power.disable_depth is zero, the
+ * function will wake up the device before disabling its run-time PM.
+ */
+void __pm_runtime_disable(struct device *dev, bool check_resume)
+{
+	spin_lock_irq(&dev->power.lock);
+
+	if (dev->power.disable_depth > 0) {
+		dev->power.disable_depth++;
+		goto out;
+	}
+
+	/*
+	 * Wake up the device if there's a resume request pending, because that
+	 * means there probably is some I/O to process and disabling run-time PM
+	 * shouldn't prevent the device from processing the I/O.
+	 */
+	if (check_resume && dev->power.request_pending
+	    && dev->power.request == RPM_REQ_RESUME) {
+		/*
+		 * Prevent suspends and idle notifications from being carried
+		 * out after we have woken up the device.
+		 */
+		pm_runtime_get_noresume(dev);
+
+		__pm_runtime_resume(dev, false);
+
+		pm_runtime_put_noidle(dev);
+	}
+
+	if (!dev->power.disable_depth++)
+		__pm_runtime_barrier(dev);
+
+ out:
+	spin_unlock_irq(&dev->power.lock);
+}
+EXPORT_SYMBOL_GPL(__pm_runtime_disable);
+
+/**
+ * pm_runtime_enable - Enable run-time PM of a device.
+ * @dev: Device to handle.
+ */
+void pm_runtime_enable(struct device *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+
+	if (dev->power.disable_depth > 0)
+		dev->power.disable_depth--;
+	else
+		dev_warn(dev, "Unbalanced %s!\n", __func__);
+
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+}
+EXPORT_SYMBOL_GPL(pm_runtime_enable);
+
+/**
+ * pm_runtime_init - Initialize run-time PM fields in given device object.
+ * @dev: Device object to initialize.
+ */
+void pm_runtime_init(struct device *dev)
+{
+	spin_lock_init(&dev->power.lock);
+
+	dev->power.runtime_status = RPM_SUSPENDED;
+	dev->power.idle_notification = false;
+
+	dev->power.disable_depth = 1;
+	atomic_set(&dev->power.usage_count, 0);
+
+	dev->power.runtime_error = 0;
+
+	atomic_set(&dev->power.child_count, 0);
+	pm_suspend_ignore_children(dev, false);
+
+	dev->power.request_pending = false;
+	dev->power.request = RPM_REQ_NONE;
+	dev->power.deferred_resume = false;
+	INIT_WORK(&dev->power.work, pm_runtime_work);
+
+	dev->power.timer_expires = 0;
+	setup_timer(&dev->power.suspend_timer, pm_suspend_timer_fn,
+			(unsigned long)dev);
+
+	init_waitqueue_head(&dev->power.wait_queue);
+}
+
+/**
+ * pm_runtime_remove - Prepare for removing a device from device hierarchy.
+ * @dev: Device object being removed from device hierarchy.
+ */
+void pm_runtime_remove(struct device *dev)
+{
+	__pm_runtime_disable(dev, false);
+
+	/* Change the status back to 'suspended' to match the initial status. */
+	if (dev->power.runtime_status == RPM_ACTIVE)
+		pm_runtime_set_suspended(dev);
+}
diff --git a/include/linux/pm.h b/include/linux/pm.h
index b3f74764a586..2b6e20df0e52 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -22,6 +22,10 @@
 #define _LINUX_PM_H
 
 #include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
 
 /*
  * Callbacks for platform drivers to implement.
@@ -165,6 +169,28 @@ typedef struct pm_message {
  * It is allowed to unregister devices while the above callbacks are being
  * executed.  However, it is not allowed to unregister a device from within any
  * of its own callbacks.
+ *
+ * There also are the following callbacks related to run-time power management
+ * of devices:
+ *
+ * @runtime_suspend: Prepare the device for a condition in which it won't be
+ *	able to communicate with the CPU(s) and RAM due to power management.
+ *	This need not mean that the device should be put into a low power state.
+ *	For example, if the device is behind a link which is about to be turned
+ *	off, the device may remain at full power.  If the device does go to low
+ *	power and if device_may_wakeup(dev) is true, remote wake-up (i.e., a
+ *	hardware mechanism allowing the device to request a change of its power
+ *	state, such as PCI PME) should be enabled for it.
+ *
+ * @runtime_resume: Put the device into the fully active state in response to a
+ *	wake-up event generated by hardware or at the request of software.  If
+ *	necessary, put the device into the full power state and restore its
+ *	registers, so that it is fully operational.
+ *
+ * @runtime_idle: Device appears to be inactive and it might be put into a low
+ *	power state if all of the necessary conditions are satisfied.  Check
+ *	these conditions and handle the device as appropriate, possibly queueing
+ *	a suspend request for it.  The return value is ignored by the PM core.
  */
 
 struct dev_pm_ops {
@@ -182,6 +208,9 @@ struct dev_pm_ops {
 	int (*thaw_noirq)(struct device *dev);
 	int (*poweroff_noirq)(struct device *dev);
 	int (*restore_noirq)(struct device *dev);
+	int (*runtime_suspend)(struct device *dev);
+	int (*runtime_resume)(struct device *dev);
+	int (*runtime_idle)(struct device *dev);
 };
 
 /**
@@ -315,14 +344,80 @@ enum dpm_state {
 	DPM_OFF_IRQ,
 };
 
+/**
+ * Device run-time power management status.
+ *
+ * These status labels are used internally by the PM core to indicate the
+ * current status of a device with respect to the PM core operations.  They do
+ * not reflect the actual power state of the device or its status as seen by the
+ * driver.
+ *
+ * RPM_ACTIVE		Device is fully operational.  Indicates that the device
+ *			bus type's ->runtime_resume() callback has completed
+ *			successfully.
+ *
+ * RPM_SUSPENDED	Device bus type's ->runtime_suspend() callback has
+ *			completed successfully.  The device is regarded as
+ *			suspended.
+ *
+ * RPM_RESUMING		Device bus type's ->runtime_resume() callback is being
+ *			executed.
+ *
+ * RPM_SUSPENDING	Device bus type's ->runtime_suspend() callback is being
+ *			executed.
+ */
+
+enum rpm_status {
+	RPM_ACTIVE = 0,
+	RPM_RESUMING,
+	RPM_SUSPENDED,
+	RPM_SUSPENDING,
+};
+
+/**
+ * Device run-time power management request types.
+ *
+ * RPM_REQ_NONE		Do nothing.
+ *
+ * RPM_REQ_IDLE		Run the device bus type's ->runtime_idle() callback
+ *
+ * RPM_REQ_SUSPEND	Run the device bus type's ->runtime_suspend() callback
+ *
+ * RPM_REQ_RESUME	Run the device bus type's ->runtime_resume() callback
+ */
+
+enum rpm_request {
+	RPM_REQ_NONE = 0,
+	RPM_REQ_IDLE,
+	RPM_REQ_SUSPEND,
+	RPM_REQ_RESUME,
+};
+
 struct dev_pm_info {
 	pm_message_t		power_state;
-	unsigned		can_wakeup:1;
-	unsigned		should_wakeup:1;
+	unsigned int		can_wakeup:1;
+	unsigned int		should_wakeup:1;
 	enum dpm_state		status;		/* Owned by the PM core */
-#ifdef	CONFIG_PM_SLEEP
+#ifdef CONFIG_PM_SLEEP
 	struct list_head	entry;
 #endif
+#ifdef CONFIG_PM_RUNTIME
+	struct timer_list	suspend_timer;
+	unsigned long		timer_expires;
+	struct work_struct	work;
+	wait_queue_head_t	wait_queue;
+	spinlock_t		lock;
+	atomic_t		usage_count;
+	atomic_t		child_count;
+	unsigned int		disable_depth:3;
+	unsigned int		ignore_children:1;
+	unsigned int		idle_notification:1;
+	unsigned int		request_pending:1;
+	unsigned int		deferred_resume:1;
+	enum rpm_request	request;
+	enum rpm_status		runtime_status;
+	int			runtime_error;
+#endif
 };
 
 /*
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
new file mode 100644
index 000000000000..44087044910f
--- /dev/null
+++ b/include/linux/pm_runtime.h
@@ -0,0 +1,114 @@
+/*
+ * pm_runtime.h - Device run-time power management helper functions.
+ *
+ * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef _LINUX_PM_RUNTIME_H
+#define _LINUX_PM_RUNTIME_H
+
+#include <linux/device.h>
+#include <linux/pm.h>
+
+#ifdef CONFIG_PM_RUNTIME
+
+extern struct workqueue_struct *pm_wq;
+
+extern int pm_runtime_idle(struct device *dev);
+extern int pm_runtime_suspend(struct device *dev);
+extern int pm_runtime_resume(struct device *dev);
+extern int pm_request_idle(struct device *dev);
+extern int pm_schedule_suspend(struct device *dev, unsigned int delay);
+extern int pm_request_resume(struct device *dev);
+extern int __pm_runtime_get(struct device *dev, bool sync);
+extern int __pm_runtime_put(struct device *dev, bool sync);
+extern int __pm_runtime_set_status(struct device *dev, unsigned int status);
+extern int pm_runtime_barrier(struct device *dev);
+extern void pm_runtime_enable(struct device *dev);
+extern void __pm_runtime_disable(struct device *dev, bool check_resume);
+
+static inline bool pm_children_suspended(struct device *dev)
+{
+	return dev->power.ignore_children
+		|| !atomic_read(&dev->power.child_count);
+}
+
+static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
+{
+	dev->power.ignore_children = enable;
+}
+
+static inline void pm_runtime_get_noresume(struct device *dev)
+{
+	atomic_inc(&dev->power.usage_count);
+}
+
+static inline void pm_runtime_put_noidle(struct device *dev)
+{
+	atomic_add_unless(&dev->power.usage_count, -1, 0);
+}
+
+#else /* !CONFIG_PM_RUNTIME */
+
+static inline int pm_runtime_idle(struct device *dev) { return -ENOSYS; }
+static inline int pm_runtime_suspend(struct device *dev) { return -ENOSYS; }
+static inline int pm_runtime_resume(struct device *dev) { return 0; }
+static inline int pm_request_idle(struct device *dev) { return -ENOSYS; }
+static inline int pm_schedule_suspend(struct device *dev, unsigned int delay)
+{
+	return -ENOSYS;
+}
+static inline int pm_request_resume(struct device *dev) { return 0; }
+static inline int __pm_runtime_get(struct device *dev, bool sync) { return 1; }
+static inline int __pm_runtime_put(struct device *dev, bool sync) { return 0; }
+static inline int __pm_runtime_set_status(struct device *dev,
+					    unsigned int status) { return 0; }
+static inline int pm_runtime_barrier(struct device *dev) { return 0; }
+static inline void pm_runtime_enable(struct device *dev) {}
+static inline void __pm_runtime_disable(struct device *dev, bool c) {}
+
+static inline bool pm_children_suspended(struct device *dev) { return false; }
+static inline void pm_suspend_ignore_children(struct device *dev, bool en) {}
+static inline void pm_runtime_get_noresume(struct device *dev) {}
+static inline void pm_runtime_put_noidle(struct device *dev) {}
+
+#endif /* !CONFIG_PM_RUNTIME */
+
+static inline int pm_runtime_get(struct device *dev)
+{
+	return __pm_runtime_get(dev, false);
+}
+
+static inline int pm_runtime_get_sync(struct device *dev)
+{
+	return __pm_runtime_get(dev, true);
+}
+
+static inline int pm_runtime_put(struct device *dev)
+{
+	return __pm_runtime_put(dev, false);
+}
+
+static inline int pm_runtime_put_sync(struct device *dev)
+{
+	return __pm_runtime_put(dev, true);
+}
+
+static inline int pm_runtime_set_active(struct device *dev)
+{
+	return __pm_runtime_set_status(dev, RPM_ACTIVE);
+}
+
+static inline void pm_runtime_set_suspended(struct device *dev)
+{
+	__pm_runtime_set_status(dev, RPM_SUSPENDED);
+}
+
+static inline void pm_runtime_disable(struct device *dev)
+{
+	__pm_runtime_disable(dev, true);
+}
+
+#endif
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
 	  random kernel OOPSes or reboots that don't seem to be related to
 	  anything, try disabling/enabling this option (or disabling/enabling
 	  APM in your BIOS).
+
+config PM_RUNTIME
+	bool "Run-time PM core functionality"
+	depends on PM
+	---help---
+	  Enable functionality allowing I/O devices to be put into energy-saving
+	  (low power) states at run time (or autosuspended) after a specified
+	  period of inactivity and woken up in response to a hardware-generated
+	  wake-up event or a driver's request.
+
+	  Hardware support is generally required for this functionality to work
+	  and the bus type drivers of the buses the devices are on are
+	  responsible for the actual handling of the autosuspend requests and
+	  wake-up events.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
 #include <linux/kobject.h>
 #include <linux/string.h>
 #include <linux/resume-trace.h>
+#include <linux/workqueue.h>
 
 #include "power.h"
 
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
 	.attrs = g,
 };
 
+#ifdef CONFIG_PM_RUNTIME
+struct workqueue_struct *pm_wq;
+
+static int __init pm_start_workqueue(void)
+{
+	pm_wq = create_freezeable_workqueue("pm");
+
+	return pm_wq ? 0 : -ENOMEM;
+}
+#else
+static inline int pm_start_workqueue(void) { return 0; }
+#endif
+
 static int __init pm_init(void)
 {
+	int error = pm_start_workqueue();
+	if (error)
+		return error;
 	power_kobj = kobject_create_and_add("power", NULL);
 	if (!power_kobj)
 		return -ENOMEM;
-- 
cgit v1.2.3-71-gd317


From 163f52b6cf3a639df6a72c7937e0eb88b20f1ef3 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sat, 1 Aug 2009 00:39:36 +0000
Subject: [SCSI] ses: fix hotplug with multiple devices and expanders

In a situation either with expanders or with multiple enclosure
devices, hot add doesn't always work.  This is because we try to find
a single enclosure device attached to the host.  Fix this by looping
over all enclosure devices attached to the host and also by making the
find loop recognise that the enclosure devices may be expander remote
(i.e. not parented by the host).

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: James Bottomley <James.Bottomley@suse.de>
---
 drivers/misc/enclosure.c  | 44 ++++++++++++++++++++++++++++++++------------
 drivers/scsi/ses.c        | 10 ++++++----
 include/linux/enclosure.h |  3 ++-
 3 files changed, 40 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c
index 348443bdb23b..789d12128c24 100644
--- a/drivers/misc/enclosure.c
+++ b/drivers/misc/enclosure.c
@@ -33,24 +33,44 @@ static DEFINE_MUTEX(container_list_lock);
 static struct class enclosure_class;
 
 /**
- * enclosure_find - find an enclosure given a device
- * @dev:	the device to find for
+ * enclosure_find - find an enclosure given a parent device
+ * @dev:	the parent to match against
+ * @start:	Optional enclosure device to start from (NULL if none)
  *
- * Looks through the list of registered enclosures to see
- * if it can find a match for a device.  Returns NULL if no
- * enclosure is found. Obtains a reference to the enclosure class
- * device which must be released with device_put().
+ * Looks through the list of registered enclosures to find all those
+ * with @dev as a parent.  Returns NULL if no enclosure is
+ * found. @start can be used as a starting point to obtain multiple
+ * enclosures per parent (should begin with NULL and then be set to
+ * each returned enclosure device). Obtains a reference to the
+ * enclosure class device which must be released with device_put().
+ * If @start is not NULL, a reference must be taken on it which is
+ * released before returning (this allows a loop through all
+ * enclosures to exit with only the reference on the enclosure of
+ * interest held).  Note that the @dev may correspond to the actual
+ * device housing the enclosure, in which case no iteration via @start
+ * is required.
  */
-struct enclosure_device *enclosure_find(struct device *dev)
+struct enclosure_device *enclosure_find(struct device *dev,
+					struct enclosure_device *start)
 {
 	struct enclosure_device *edev;
 
 	mutex_lock(&container_list_lock);
-	list_for_each_entry(edev, &container_list, node) {
-		if (edev->edev.parent == dev) {
-			get_device(&edev->edev);
-			mutex_unlock(&container_list_lock);
-			return edev;
+	edev = list_prepare_entry(start, &container_list, node);
+	if (start)
+		put_device(&start->edev);
+
+	list_for_each_entry_continue(edev, &container_list, node) {
+		struct device *parent = edev->edev.parent;
+		/* parent might not be immediate, so iterate up to
+		 * the root of the tree if necessary */
+		while (parent) {
+			if (parent == dev) {
+				get_device(&edev->edev);
+				mutex_unlock(&container_list_lock);
+				return edev;
+			}
+			parent = parent->parent;
 		}
 	}
 	mutex_unlock(&container_list_lock);
diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c
index 4f618f487356..e1b8c828f03a 100644
--- a/drivers/scsi/ses.c
+++ b/drivers/scsi/ses.c
@@ -413,10 +413,11 @@ static int ses_intf_add(struct device *cdev,
 
 	if (!scsi_device_enclosure(sdev)) {
 		/* not an enclosure, but might be in one */
-		edev = enclosure_find(&sdev->host->shost_gendev);
-		if (edev) {
+		struct enclosure_device *prev = NULL;
+
+		while ((edev = enclosure_find(&sdev->host->shost_gendev, prev)) != NULL) {
 			ses_match_to_enclosure(edev, sdev);
-			put_device(&edev->edev);
+			prev = edev;
 		}
 		return -ENODEV;
 	}
@@ -625,7 +626,8 @@ static void ses_intf_remove(struct device *cdev,
 	if (!scsi_device_enclosure(sdev))
 		return;
 
-	edev = enclosure_find(cdev->parent);
+	/*  exact match to this enclosure */
+	edev = enclosure_find(cdev->parent, NULL);
 	if (!edev)
 		return;
 
diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h
index 4332442b1b57..d77811e9ed84 100644
--- a/include/linux/enclosure.h
+++ b/include/linux/enclosure.h
@@ -123,7 +123,8 @@ enclosure_component_register(struct enclosure_device *, unsigned int,
 int enclosure_add_device(struct enclosure_device *enclosure, int component,
 			 struct device *dev);
 int enclosure_remove_device(struct enclosure_device *enclosure, int component);
-struct enclosure_device *enclosure_find(struct device *dev);
+struct enclosure_device *enclosure_find(struct device *dev,
+					struct enclosure_device *start);
 int enclosure_for_each_device(int (*fn)(struct enclosure_device *, void *),
 			      void *data);
 
-- 
cgit v1.2.3-71-gd317


From 43d8eb9cfd0aea93be32181c64e18191b69c211c Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sat, 1 Aug 2009 00:41:22 +0000
Subject: [SCSI] ses: add support for enclosure component hot removal

Right at the moment, hot removal of a device within an enclosure does
nothing (because the intf_remove only copes with enclosure removal not
with component removal). Fix this by adding a function to remove the
component.  Also needed to fix the prototype of
enclosure_remove_device, since we know the device we've removed but
not the internal component number

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: James Bottomley <James.Bottomley@suse.de>
---
 drivers/misc/enclosure.c  | 22 ++++++++++++++--------
 drivers/scsi/ses.c        | 33 ++++++++++++++++++++++++++-------
 include/linux/enclosure.h |  2 +-
 3 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c
index 789d12128c24..850706a5e553 100644
--- a/drivers/misc/enclosure.c
+++ b/drivers/misc/enclosure.c
@@ -332,19 +332,25 @@ EXPORT_SYMBOL_GPL(enclosure_add_device);
  * Returns zero on success or an error.
  *
  */
-int enclosure_remove_device(struct enclosure_device *edev, int component)
+int enclosure_remove_device(struct enclosure_device *edev, struct device *dev)
 {
 	struct enclosure_component *cdev;
+	int i;
 
-	if (!edev || component >= edev->components)
+	if (!edev || !dev)
 		return -EINVAL;
 
-	cdev = &edev->component[component];
-
-	device_del(&cdev->cdev);
-	put_device(cdev->dev);
-	cdev->dev = NULL;
-	return device_add(&cdev->cdev);
+	for (i = 0; i < edev->components; i++) {
+		cdev = &edev->component[i];
+		if (cdev->dev == dev) {
+			enclosure_remove_links(cdev);
+			device_del(&cdev->cdev);
+			put_device(dev);
+			cdev->dev = NULL;
+			return device_add(&cdev->cdev);
+		}
+	}
+	return -ENODEV;
 }
 EXPORT_SYMBOL_GPL(enclosure_remove_device);
 
diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c
index e1b8c828f03a..be593c8525b5 100644
--- a/drivers/scsi/ses.c
+++ b/drivers/scsi/ses.c
@@ -616,18 +616,26 @@ static int ses_remove(struct device *dev)
 	return 0;
 }
 
-static void ses_intf_remove(struct device *cdev,
-			    struct class_interface *intf)
+static void ses_intf_remove_component(struct scsi_device *sdev)
+{
+	struct enclosure_device *edev, *prev = NULL;
+
+	while ((edev = enclosure_find(&sdev->host->shost_gendev, prev)) != NULL) {
+		prev = edev;
+		if (!enclosure_remove_device(edev, &sdev->sdev_gendev))
+			break;
+	}
+	if (edev)
+		put_device(&edev->edev);
+}
+
+static void ses_intf_remove_enclosure(struct scsi_device *sdev)
 {
-	struct scsi_device *sdev = to_scsi_device(cdev->parent);
 	struct enclosure_device *edev;
 	struct ses_device *ses_dev;
 
-	if (!scsi_device_enclosure(sdev))
-		return;
-
 	/*  exact match to this enclosure */
-	edev = enclosure_find(cdev->parent, NULL);
+	edev = enclosure_find(&sdev->sdev_gendev, NULL);
 	if (!edev)
 		return;
 
@@ -645,6 +653,17 @@ static void ses_intf_remove(struct device *cdev,
 	enclosure_unregister(edev);
 }
 
+static void ses_intf_remove(struct device *cdev,
+			    struct class_interface *intf)
+{
+	struct scsi_device *sdev = to_scsi_device(cdev->parent);
+
+	if (!scsi_device_enclosure(sdev))
+		ses_intf_remove_component(sdev);
+	else
+		ses_intf_remove_enclosure(sdev);
+}
+
 static struct class_interface ses_interface = {
 	.add_dev	= ses_intf_add,
 	.remove_dev	= ses_intf_remove,
diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h
index d77811e9ed84..90d1c2184112 100644
--- a/include/linux/enclosure.h
+++ b/include/linux/enclosure.h
@@ -122,7 +122,7 @@ enclosure_component_register(struct enclosure_device *, unsigned int,
 				 enum enclosure_component_type, const char *);
 int enclosure_add_device(struct enclosure_device *enclosure, int component,
 			 struct device *dev);
-int enclosure_remove_device(struct enclosure_device *enclosure, int component);
+int enclosure_remove_device(struct enclosure_device *, struct device *);
 struct enclosure_device *enclosure_find(struct device *dev,
 					struct enclosure_device *start);
 int enclosure_for_each_device(int (*fn)(struct enclosure_device *, void *),
-- 
cgit v1.2.3-71-gd317


From 05ecd5a1f76c183cca381705b3adb7d77c9a0439 Mon Sep 17 00:00:00 2001
From: Pawel Moll <pawel.moll@st.com>
Date: Mon, 24 Aug 2009 19:52:38 +0900
Subject: sh: Simplify "multi-evt" interrupt handling.

This patch changes the way in which "multi-evt" interrups are handled.
The intc_evt2irq_table and related intc_evt2irq() have been removed and
the "redirecting" handler is installed for the coupled interrupts.

Thanks to that the do_IRQ() function don't have to use another level
of indirection for all the interrupts...

Signed-off-by: Pawel Moll <pawel.moll@st.com>
Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/irq.c    |  2 +-
 drivers/sh/intc.c       | 54 ++++++++++++++++---------------------------------
 include/linux/sh_intc.h |  1 -
 3 files changed, 18 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 278c68c60488..d1053392e287 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -114,7 +114,7 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs)
 #endif
 
 	irq_enter();
-	irq = irq_demux(intc_evt2irq(irq));
+	irq = irq_demux(evt2irq(irq));
 
 #ifdef CONFIG_IRQSTACKS
 	curctx = (union irq_ctx *)current_thread_info();
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index 4b1ca9d28353..a9174ec72853 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -663,16 +663,9 @@ static unsigned int __init save_reg(struct intc_desc_int *d,
 	return 0;
 }
 
-static unsigned char *intc_evt2irq_table;
-
-unsigned int intc_evt2irq(unsigned int vector)
+static void intc_redirect_irq(unsigned int irq, struct irq_desc *desc)
 {
-	unsigned int irq = evt2irq(vector);
-
-	if (intc_evt2irq_table && intc_evt2irq_table[irq])
-		irq = intc_evt2irq_table[irq];
-
-	return irq;
+	generic_handle_irq((unsigned int)get_irq_data(irq));
 }
 
 void __init register_intc_controller(struct intc_desc *desc)
@@ -745,34 +738,6 @@ void __init register_intc_controller(struct intc_desc *desc)
 
 	BUG_ON(k > 256); /* _INTC_ADDR_E() and _INTC_ADDR_D() are 8 bits */
 
-	/* keep the first vector only if same enum is used multiple times */
-	for (i = 0; i < desc->nr_vectors; i++) {
-		struct intc_vect *vect = desc->vectors + i;
-		int first_irq = evt2irq(vect->vect);
-
-		if (!vect->enum_id)
-			continue;
-
-		for (k = i + 1; k < desc->nr_vectors; k++) {
-			struct intc_vect *vect2 = desc->vectors + k;
-
-			if (vect->enum_id != vect2->enum_id)
-				continue;
-
-			vect2->enum_id = 0;
-
-			if (!intc_evt2irq_table)
-				intc_evt2irq_table = kzalloc(NR_IRQS, GFP_NOWAIT);
-
-			if (!intc_evt2irq_table) {
-				pr_warning("intc: cannot allocate evt2irq!\n");
-				continue;
-			}
-
-			intc_evt2irq_table[evt2irq(vect2->vect)] = first_irq;
-		}
-	}
-
 	/* register the vectors one by one */
 	for (i = 0; i < desc->nr_vectors; i++) {
 		struct intc_vect *vect = desc->vectors + i;
@@ -789,6 +754,21 @@ void __init register_intc_controller(struct intc_desc *desc)
 		}
 
 		intc_register_irq(desc, d, vect->enum_id, irq);
+
+		for (k = i + 1; k < desc->nr_vectors; k++) {
+			struct intc_vect *vect2 = desc->vectors + k;
+			unsigned int irq2 = evt2irq(vect2->vect);
+
+			if (vect->enum_id != vect2->enum_id)
+				continue;
+
+			vect2->enum_id = 0;
+
+			/* redirect this interrupts to the first one */
+			set_irq_chip_and_handler_name(irq2, &d->chip,
+					intc_redirect_irq, "redirect");
+			set_irq_data(irq2, (void *)irq);
+		}
 	}
 }
 
diff --git a/include/linux/sh_intc.h b/include/linux/sh_intc.h
index eb1423a0078d..68e212ff9dde 100644
--- a/include/linux/sh_intc.h
+++ b/include/linux/sh_intc.h
@@ -85,7 +85,6 @@ struct intc_desc symbol __initdata = {					\
 }
 #endif
 
-unsigned int intc_evt2irq(unsigned int vector);
 void __init register_intc_controller(struct intc_desc *desc);
 int intc_set_priority(unsigned int irq, unsigned int prio);
 
-- 
cgit v1.2.3-71-gd317


From 0396c215f301e92677d1e9a064b405e31501dc1d Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Tue, 25 Aug 2009 16:41:06 +0100
Subject: uwb: avoid radio controller reset loops

If a radio controller reset attempt occurs while a probe() or remove()
is in progress it fails and is retried endlessly, potentially preventing
the probe() or remove() from completing.

If a reset fails, sleep for a bit before retrying the reset.  This
allows the probe()/remove() to complete.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/hwa-rc.c  |  3 +--
 drivers/uwb/reset.c   | 21 +++++++++++----------
 drivers/uwb/umc-bus.c |  2 +-
 drivers/uwb/whc-rc.c  |  3 +--
 include/linux/uwb.h   |  2 +-
 5 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c
index 9052bcb4f528..e7eeb63fab23 100644
--- a/drivers/uwb/hwa-rc.c
+++ b/drivers/uwb/hwa-rc.c
@@ -887,8 +887,7 @@ static int hwarc_post_reset(struct usb_interface *iface)
 	struct hwarc *hwarc = usb_get_intfdata(iface);
 	struct uwb_rc *uwb_rc = hwarc->uwb_rc;
 
-	uwb_rc_post_reset(uwb_rc);
-	return 0;
+	return uwb_rc_post_reset(uwb_rc);
 }
 
 /** USB device ID's that we handle */
diff --git a/drivers/uwb/reset.c b/drivers/uwb/reset.c
index 70f8050221ff..7f0512e43d9d 100644
--- a/drivers/uwb/reset.c
+++ b/drivers/uwb/reset.c
@@ -30,6 +30,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/err.h>
+#include <linux/delay.h>
 
 #include "uwb-internal.h"
 
@@ -323,13 +324,15 @@ int uwbd_msg_handle_reset(struct uwb_event *evt)
 
 	dev_info(&rc->uwb_dev.dev, "resetting radio controller\n");
 	ret = rc->reset(rc);
-	if (ret) {
+	if (ret < 0) {
 		dev_err(&rc->uwb_dev.dev, "failed to reset hardware: %d\n", ret);
 		goto error;
 	}
 	return 0;
 error:
-	/* Nothing can be done except try the reset again. */
+	/* Nothing can be done except try the reset again. Wait a bit
+	   to avoid reset loops during probe() or remove(). */
+	msleep(1000);
 	uwb_rc_reset_all(rc);
 	return ret;
 }
@@ -368,22 +371,20 @@ void uwb_rc_pre_reset(struct uwb_rc *rc)
 }
 EXPORT_SYMBOL_GPL(uwb_rc_pre_reset);
 
-void uwb_rc_post_reset(struct uwb_rc *rc)
+int uwb_rc_post_reset(struct uwb_rc *rc)
 {
 	int ret;
 
 	ret = rc->start(rc);
 	if (ret)
-		goto error;
+		goto out;
 	ret = uwb_rc_mac_addr_set(rc, &rc->uwb_dev.mac_addr);
 	if (ret)
-		goto error;
+		goto out;
 	ret = uwb_rc_dev_addr_set(rc, &rc->uwb_dev.dev_addr);
 	if (ret)
-		goto error;
-	return;
-error:
-	/* Nothing can be done except try the reset again. */
-	uwb_rc_reset_all(rc);
+		goto out;
+out:
+	return ret;
 }
 EXPORT_SYMBOL_GPL(uwb_rc_post_reset);
diff --git a/drivers/uwb/umc-bus.c b/drivers/uwb/umc-bus.c
index 5ad36164c13b..cdd6c8efc9f8 100644
--- a/drivers/uwb/umc-bus.c
+++ b/drivers/uwb/umc-bus.c
@@ -66,7 +66,7 @@ int umc_controller_reset(struct umc_dev *umc)
 		return -EAGAIN;
 	ret = device_for_each_child(parent, parent, umc_bus_pre_reset_helper);
 	if (ret >= 0)
-		device_for_each_child(parent, parent, umc_bus_post_reset_helper);
+		ret = device_for_each_child(parent, parent, umc_bus_post_reset_helper);
 	up(&parent->sem);
 
 	return ret;
diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c
index 19a1dd129212..1d9a6f54658e 100644
--- a/drivers/uwb/whc-rc.c
+++ b/drivers/uwb/whc-rc.c
@@ -443,8 +443,7 @@ static int whcrc_post_reset(struct umc_dev *umc)
 	struct whcrc *whcrc = umc_get_drvdata(umc);
 	struct uwb_rc *uwb_rc = whcrc->uwb_rc;
 
-	uwb_rc_post_reset(uwb_rc);
-	return 0;
+	return uwb_rc_post_reset(uwb_rc);
 }
 
 /* PCI device ID's that we handle [so it gets loaded] */
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index c02128991ff7..7fc9746f22cd 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -597,7 +597,7 @@ void uwb_rc_neh_grok(struct uwb_rc *, void *, size_t);
 void uwb_rc_neh_error(struct uwb_rc *, int);
 void uwb_rc_reset_all(struct uwb_rc *rc);
 void uwb_rc_pre_reset(struct uwb_rc *rc);
-void uwb_rc_post_reset(struct uwb_rc *rc);
+int uwb_rc_post_reset(struct uwb_rc *rc);
 
 /**
  * uwb_rsv_is_owner - is the owner of this reservation the RC?
-- 
cgit v1.2.3-71-gd317


From 9e36fda0b359d2a6ae039c3d7e71a04502a77898 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Fri, 10 Jul 2009 09:57:35 -0700
Subject: x86, pat: Add PAT reserve free to io_mapping* APIs

io_mapping_* interfaces were added, mainly for graphics drivers.
Make this interface go through the PAT reserve/free, instead of
hardcoding WC mapping. This makes sure that there are no
aliases due to unconditional WC setting.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/iomap.h |  9 ++++++---
 arch/x86/mm/iomap_32.c       | 27 +++++++++++++++++++++++++--
 include/linux/io-mapping.h   | 17 ++++++++++++-----
 3 files changed, 43 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 0e9fe1d9d971..f35eb45d6576 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -26,13 +26,16 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
-int
-is_io_mapping_possible(resource_size_t base, unsigned long size);
-
 void *
 iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 
 void
 iounmap_atomic(void *kvaddr, enum km_type type);
 
+int
+iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
+
+void
+iomap_free(resource_size_t base, unsigned long size);
+
 #endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index fe6f84ca121e..84e236ce76ba 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -21,7 +21,7 @@
 #include <linux/module.h>
 #include <linux/highmem.h>
 
-int is_io_mapping_possible(resource_size_t base, unsigned long size)
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
 {
 #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
 	/* There is no way to map greater than 1 << 32 address without PAE */
@@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
 #endif
 	return 1;
 }
-EXPORT_SYMBOL_GPL(is_io_mapping_possible);
+
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+	unsigned long flag = _PAGE_CACHE_WC;
+	int ret;
+
+	if (!is_io_mapping_possible(base, size))
+		return -EINVAL;
+
+	ret = io_reserve_memtype(base, base + size, &flag);
+	if (ret)
+		return ret;
+
+	*prot = __pgprot(__PAGE_KERNEL | flag);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+void
+iomap_free(resource_size_t base, unsigned long size)
+{
+	io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
 
 void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 {
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 0adb0f91568c..97eb928b4924 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -49,23 +49,30 @@ static inline struct io_mapping *
 io_mapping_create_wc(resource_size_t base, unsigned long size)
 {
 	struct io_mapping *iomap;
-
-	if (!is_io_mapping_possible(base, size))
-		return NULL;
+	pgprot_t prot;
 
 	iomap = kmalloc(sizeof(*iomap), GFP_KERNEL);
 	if (!iomap)
-		return NULL;
+		goto out_err;
+
+	if (iomap_create_wc(base, size, &prot))
+		goto out_free;
 
 	iomap->base = base;
 	iomap->size = size;
-	iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL));
+	iomap->prot = prot;
 	return iomap;
+
+out_free:
+	kfree(iomap);
+out_err:
+	return NULL;
 }
 
 static inline void
 io_mapping_free(struct io_mapping *mapping)
 {
+	iomap_free(mapping->base, mapping->size);
 	kfree(mapping);
 }
 
-- 
cgit v1.2.3-71-gd317


From 46cf98cdaef5471926010b5bddf84c44ec177fdd Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Fri, 10 Jul 2009 09:57:37 -0700
Subject: x86, pat: Generalize the use of page flag PG_uncached

Only IA64 was using PG_uncached as of now. We now intend to use this bit
in x86 as well, to keep track of memory type of those addresses that
have page struct for them. So, generalize the use of that bit across
ia64 and x86.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/ia64/Kconfig          | 4 ++++
 arch/x86/Kconfig           | 4 ++++
 include/linux/page-flags.h | 4 ++--
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 170042b420d4..e6246119932a 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -112,6 +112,10 @@ config IA64_UNCACHED_ALLOCATOR
 	bool
 	select GENERIC_ALLOCATOR
 
+config ARCH_USES_PG_UNCACHED
+	def_bool y
+	depends on IA64_UNCACHED_ALLOCATOR
+
 config AUDIT_ARCH
 	bool
 	default y
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f72205909..8e1595382196 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1414,6 +1414,10 @@ config X86_PAT
 
 	  If unsure, say Y.
 
+config ARCH_USES_PG_UNCACHED
+	def_bool y
+	depends on X86_PAT
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e2e5ce543595..2b87acfc5f87 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -99,7 +99,7 @@ enum pageflags {
 #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 	PG_mlocked,		/* Page is vma mlocked */
 #endif
-#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
 	__NR_PAGEFLAGS,
@@ -257,7 +257,7 @@ PAGEFLAG_FALSE(Mlocked)
 	SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
 #endif
 
-#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
 PAGEFLAG(Uncached, uncached)
 #else
 PAGEFLAG_FALSE(Uncached)
-- 
cgit v1.2.3-71-gd317


From f726f30e32305a34a203ff975e60885aa7556c6a Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 4 Aug 2009 19:08:24 +0000
Subject: dma: Add set_dma_mask hook to struct dma_map_ops

POWERPC needs this hook. SPARC could use it too.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 include/linux/dma-mapping.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index c0f6c3cd788c..91b761846061 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -58,6 +58,7 @@ struct dma_map_ops {
 				   enum dma_data_direction dir);
 	int (*mapping_error)(struct device *dev, dma_addr_t dma_addr);
 	int (*dma_supported)(struct device *dev, u64 mask);
+	int (*set_dma_mask)(struct device *dev, u64 mask);
 	int is_phys;
 };
 
-- 
cgit v1.2.3-71-gd317


From 77a53fd21870c726b670c0d8179294ac1ea33468 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Tue, 25 Aug 2009 19:24:13 -0700
Subject: Input: matrix-keypad - add function to build device keymap

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/matrix_keypad.c  | 15 +++------------
 drivers/input/keyboard/w90p910_keypad.c | 16 ++--------------
 include/linux/input/matrix_keypad.h     | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c
index 541b981ff075..91cfe5170265 100644
--- a/drivers/input/keyboard/matrix_keypad.c
+++ b/drivers/input/keyboard/matrix_keypad.c
@@ -319,7 +319,6 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev)
 	struct input_dev *input_dev;
 	unsigned short *keycodes;
 	unsigned int row_shift;
-	int i;
 	int err;
 
 	pdata = pdev->dev.platform_data;
@@ -363,18 +362,10 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev)
 
 	input_dev->keycode	= keycodes;
 	input_dev->keycodesize	= sizeof(*keycodes);
-	input_dev->keycodemax	= pdata->num_row_gpios << keypad->row_shift;
-
-	for (i = 0; i < keymap_data->keymap_size; i++) {
-		unsigned int key = keymap_data->keymap[i];
-		unsigned int row = KEY_ROW(key);
-		unsigned int col = KEY_COL(key);
-		unsigned short code = KEY_VAL(key);
+	input_dev->keycodemax	= pdata->num_row_gpios << row_shift;
 
-		keycodes[MATRIX_SCAN_CODE(row, col, row_shift)] = code;
-		__set_bit(code, input_dev->keybit);
-	}
-	__clear_bit(KEY_RESERVED, input_dev->keybit);
+	matrix_keypad_build_keymap(keymap_data, row_shift,
+				   input_dev->keycode, input_dev->keybit);
 
 	input_set_capability(input_dev, EV_MSC, MSC_SCAN);
 	input_set_drvdata(input_dev, keypad);
diff --git a/drivers/input/keyboard/w90p910_keypad.c b/drivers/input/keyboard/w90p910_keypad.c
index b8598ae124ee..2d03dd0f9e07 100644
--- a/drivers/input/keyboard/w90p910_keypad.c
+++ b/drivers/input/keyboard/w90p910_keypad.c
@@ -126,7 +126,6 @@ static int __devinit w90p910_keypad_probe(struct platform_device *pdev)
 	struct resource *res;
 	int irq;
 	int error;
-	int i;
 
 	if (!pdata) {
 		dev_err(&pdev->dev, "no platform data defined\n");
@@ -197,19 +196,8 @@ static int __devinit w90p910_keypad_probe(struct platform_device *pdev)
 	input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP);
 	input_set_capability(input_dev, EV_MSC, MSC_SCAN);
 
-	for (i = 0; i < keymap_data->keymap_size; i++) {
-		unsigned int key = keymap_data->keymap[i];
-		unsigned int row = KEY_ROW(key);
-		unsigned int col = KEY_COL(key);
-		unsigned short keycode = KEY_VAL(key);
-		unsigned int scancode = MATRIX_SCAN_CODE(row, col,
-							 W90P910_ROW_SHIFT);
-
-		keypad->keymap[scancode] = keycode;
-		__set_bit(keycode, input_dev->keybit);
-	}
-	__clear_bit(KEY_RESERVED, input_dev->keybit);
-
+	matrix_keypad_build_keymap(keymap_data, W90P910_ROW_SHIFT,
+				   input_dev->keycode, input_dev->keybit);
 
 	error = request_irq(keypad->irq, w90p910_keypad_irq_handler,
 			    IRQF_DISABLED, pdev->name, keypad);
diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h
index 15d5903af2dd..b3cd42d50e16 100644
--- a/include/linux/input/matrix_keypad.h
+++ b/include/linux/input/matrix_keypad.h
@@ -63,4 +63,36 @@ struct matrix_keypad_platform_data {
 	bool		wakeup;
 };
 
+/**
+ * matrix_keypad_build_keymap - convert platform keymap into matrix keymap
+ * @keymap_data: keymap supplied by the platform code
+ * @row_shift: number of bits to shift row value by to advance to the next
+ * line in the keymap
+ * @keymap: expanded version of keymap that is suitable for use by
+ * matrix keyboad driver
+ * @keybit: pointer to bitmap of keys supported by input device
+ *
+ * This function converts platform keymap (encoded with KEY() macro) into
+ * an array of keycodes that is suitable for using in a standard matrix
+ * keyboard driver that uses row and col as indices.
+ */
+static inline void
+matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data,
+			   unsigned int row_shift,
+			   unsigned short *keymap, unsigned long *keybit)
+{
+	int i;
+
+	for (i = 0; i < keymap_data->keymap_size; i++) {
+		unsigned int key = keymap_data->keymap[i];
+		unsigned int row = KEY_ROW(key);
+		unsigned int col = KEY_COL(key);
+		unsigned short code = KEY_VAL(key);
+
+		keymap[MATRIX_SCAN_CODE(row, col, row_shift)] = code;
+		__set_bit(code, keybit);
+	}
+	__clear_bit(KEY_RESERVED, keybit);
+}
+
 #endif /* _MATRIX_KEYPAD_H */
-- 
cgit v1.2.3-71-gd317


From 9d8340687c524ce61e3c9c76758c4c81303acfc0 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Tue, 25 Aug 2009 19:24:14 -0700
Subject: Input: add twl4030_keypad driver

Add a driver for the keypad controller on TWL4030 family chips.
These support up to an 8x8 key matrix.  The TWL4030 multifunction
chips are mostly used on OMAP3 (or OMAP 2430) based boards.

[dtor@mail.ru: switch to matrix-keypad framework, fix changing
keymap from userspace]
Reviewed-by: Trilok Soni <soni.trilok@gmail.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/Kconfig          |  11 +
 drivers/input/keyboard/Makefile         |   1 +
 drivers/input/keyboard/twl4030_keypad.c | 480 ++++++++++++++++++++++++++++++++
 include/linux/i2c/twl4030.h             |  19 +-
 4 files changed, 505 insertions(+), 6 deletions(-)
 create mode 100644 drivers/input/keyboard/twl4030_keypad.c

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index 50e407de8a78..3525c19be428 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -330,6 +330,17 @@ config KEYBOARD_OMAP
 	  To compile this driver as a module, choose M here: the
 	  module will be called omap-keypad.
 
+config KEYBOARD_TWL4030
+	tristate "TI TWL4030/TWL5030/TPS659x0 keypad support"
+	depends on TWL4030_CORE
+	help
+	  Say Y here if your board use the keypad controller on
+	  TWL4030 family chips.  It's safe to say enable this
+	  even on boards that don't use the keypad controller.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called twl4030_keypad.
+
 config KEYBOARD_TOSA
 	tristate "Tosa keyboard"
 	depends on MACH_TOSA
diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
index 152303029203..8a7a22b30266 100644
--- a/drivers/input/keyboard/Makefile
+++ b/drivers/input/keyboard/Makefile
@@ -30,5 +30,6 @@ obj-$(CONFIG_KEYBOARD_SPITZ)		+= spitzkbd.o
 obj-$(CONFIG_KEYBOARD_STOWAWAY)		+= stowaway.o
 obj-$(CONFIG_KEYBOARD_SUNKBD)		+= sunkbd.o
 obj-$(CONFIG_KEYBOARD_TOSA)		+= tosakbd.o
+obj-$(CONFIG_KEYBOARD_TWL4030)		+= twl4030_keypad.o
 obj-$(CONFIG_KEYBOARD_XTKBD)		+= xtkbd.o
 obj-$(CONFIG_KEYBOARD_W90P910)		+= w90p910_keypad.o
diff --git a/drivers/input/keyboard/twl4030_keypad.c b/drivers/input/keyboard/twl4030_keypad.c
new file mode 100644
index 000000000000..9a2977c21696
--- /dev/null
+++ b/drivers/input/keyboard/twl4030_keypad.c
@@ -0,0 +1,480 @@
+/*
+ * twl4030_keypad.c - driver for 8x8 keypad controller in twl4030 chips
+ *
+ * Copyright (C) 2007 Texas Instruments, Inc.
+ * Copyright (C) 2008 Nokia Corporation
+ *
+ * Code re-written for 2430SDP by:
+ * Syed Mohammed Khasim <x0khasim@ti.com>
+ *
+ * Initial Code:
+ * Manjunatha G K <manjugk@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/platform_device.h>
+#include <linux/i2c/twl4030.h>
+
+
+/*
+ * The TWL4030 family chips include a keypad controller that supports
+ * up to an 8x8 switch matrix.  The controller can issue system wakeup
+ * events, since it uses only the always-on 32KiHz oscillator, and has
+ * an internal state machine that decodes pressed keys, including
+ * multi-key combinations.
+ *
+ * This driver lets boards define what keycodes they wish to report for
+ * which scancodes, as part of the "struct twl4030_keypad_data" used in
+ * the probe() routine.
+ *
+ * See the TPS65950 documentation; that's the general availability
+ * version of the TWL5030 second generation part.
+ */
+#define TWL4030_MAX_ROWS	8	/* TWL4030 hard limit */
+#define TWL4030_MAX_COLS	8
+#define TWL4030_ROW_SHIFT	3
+#define TWL4030_KEYMAP_SIZE	(TWL4030_MAX_ROWS * TWL4030_MAX_COLS)
+
+struct twl4030_keypad {
+	unsigned short	keymap[TWL4030_KEYMAP_SIZE];
+	u16		kp_state[TWL4030_MAX_ROWS];
+	unsigned	n_rows;
+	unsigned	n_cols;
+	unsigned	irq;
+
+	struct device *dbg_dev;
+	struct input_dev *input;
+};
+
+/*----------------------------------------------------------------------*/
+
+/* arbitrary prescaler value 0..7 */
+#define PTV_PRESCALER			4
+
+/* Register Offsets */
+#define KEYP_CTRL			0x00
+#define KEYP_DEB			0x01
+#define KEYP_LONG_KEY			0x02
+#define KEYP_LK_PTV			0x03
+#define KEYP_TIMEOUT_L			0x04
+#define KEYP_TIMEOUT_H			0x05
+#define KEYP_KBC			0x06
+#define KEYP_KBR			0x07
+#define KEYP_SMS			0x08
+#define KEYP_FULL_CODE_7_0		0x09	/* row 0 column status */
+#define KEYP_FULL_CODE_15_8		0x0a	/* ... row 1 ... */
+#define KEYP_FULL_CODE_23_16		0x0b
+#define KEYP_FULL_CODE_31_24		0x0c
+#define KEYP_FULL_CODE_39_32		0x0d
+#define KEYP_FULL_CODE_47_40		0x0e
+#define KEYP_FULL_CODE_55_48		0x0f
+#define KEYP_FULL_CODE_63_56		0x10
+#define KEYP_ISR1			0x11
+#define KEYP_IMR1			0x12
+#define KEYP_ISR2			0x13
+#define KEYP_IMR2			0x14
+#define KEYP_SIR			0x15
+#define KEYP_EDR			0x16	/* edge triggers */
+#define KEYP_SIH_CTRL			0x17
+
+/* KEYP_CTRL_REG Fields */
+#define KEYP_CTRL_SOFT_NRST		BIT(0)
+#define KEYP_CTRL_SOFTMODEN		BIT(1)
+#define KEYP_CTRL_LK_EN			BIT(2)
+#define KEYP_CTRL_TOE_EN		BIT(3)
+#define KEYP_CTRL_TOLE_EN		BIT(4)
+#define KEYP_CTRL_RP_EN			BIT(5)
+#define KEYP_CTRL_KBD_ON		BIT(6)
+
+/* KEYP_DEB, KEYP_LONG_KEY, KEYP_TIMEOUT_x*/
+#define KEYP_PERIOD_US(t, prescale)	((t) / (31 << (prescale + 1)) - 1)
+
+/* KEYP_LK_PTV_REG Fields */
+#define KEYP_LK_PTV_PTV_SHIFT		5
+
+/* KEYP_{IMR,ISR,SIR} Fields */
+#define KEYP_IMR1_MIS			BIT(3)
+#define KEYP_IMR1_TO			BIT(2)
+#define KEYP_IMR1_LK			BIT(1)
+#define KEYP_IMR1_KP			BIT(0)
+
+/* KEYP_EDR Fields */
+#define KEYP_EDR_KP_FALLING		0x01
+#define KEYP_EDR_KP_RISING		0x02
+#define KEYP_EDR_KP_BOTH		0x03
+#define KEYP_EDR_LK_FALLING		0x04
+#define KEYP_EDR_LK_RISING		0x08
+#define KEYP_EDR_TO_FALLING		0x10
+#define KEYP_EDR_TO_RISING		0x20
+#define KEYP_EDR_MIS_FALLING		0x40
+#define KEYP_EDR_MIS_RISING		0x80
+
+
+/*----------------------------------------------------------------------*/
+
+static int twl4030_kpread(struct twl4030_keypad *kp,
+		u8 *data, u32 reg, u8 num_bytes)
+{
+	int ret = twl4030_i2c_read(TWL4030_MODULE_KEYPAD, data, reg, num_bytes);
+
+	if (ret < 0)
+		dev_warn(kp->dbg_dev,
+			"Couldn't read TWL4030: %X - ret %d[%x]\n",
+			 reg, ret, ret);
+
+	return ret;
+}
+
+static int twl4030_kpwrite_u8(struct twl4030_keypad *kp, u8 data, u32 reg)
+{
+	int ret = twl4030_i2c_write_u8(TWL4030_MODULE_KEYPAD, data, reg);
+
+	if (ret < 0)
+		dev_warn(kp->dbg_dev,
+			"Could not write TWL4030: %X - ret %d[%x]\n",
+			 reg, ret, ret);
+
+	return ret;
+}
+
+static inline u16 twl4030_col_xlate(struct twl4030_keypad *kp, u8 col)
+{
+	/* If all bits in a row are active for all coloumns then
+	 * we have that row line connected to gnd. Mark this
+	 * key on as if it was on matrix position n_cols (ie
+	 * one higher than the size of the matrix).
+	 */
+	if (col == 0xFF)
+		return 1 << kp->n_cols;
+	else
+		return col & ((1 << kp->n_cols) - 1);
+}
+
+static int twl4030_read_kp_matrix_state(struct twl4030_keypad *kp, u16 *state)
+{
+	u8 new_state[TWL4030_MAX_ROWS];
+	int row;
+	int ret = twl4030_kpread(kp, new_state,
+				 KEYP_FULL_CODE_7_0, kp->n_rows);
+	if (ret >= 0)
+		for (row = 0; row < kp->n_rows; row++)
+			state[row] = twl4030_col_xlate(kp, new_state[row]);
+
+	return ret;
+}
+
+static int twl4030_is_in_ghost_state(struct twl4030_keypad *kp, u16 *key_state)
+{
+	int i;
+	u16 check = 0;
+
+	for (i = 0; i < kp->n_rows; i++) {
+		u16 col = key_state[i];
+
+		if ((col & check) && hweight16(col) > 1)
+			return 1;
+
+		check |= col;
+	}
+
+	return 0;
+}
+
+static void twl4030_kp_scan(struct twl4030_keypad *kp, bool release_all)
+{
+	struct input_dev *input = kp->input;
+	u16 new_state[TWL4030_MAX_ROWS];
+	int col, row;
+
+	if (release_all)
+		memset(new_state, 0, sizeof(new_state));
+	else {
+		/* check for any changes */
+		int ret = twl4030_read_kp_matrix_state(kp, new_state);
+
+		if (ret < 0)	/* panic ... */
+			return;
+
+		if (twl4030_is_in_ghost_state(kp, new_state))
+			return;
+	}
+
+	/* check for changes and print those */
+	for (row = 0; row < kp->n_rows; row++) {
+		int changed = new_state[row] ^ kp->kp_state[row];
+
+		if (!changed)
+			continue;
+
+		for (col = 0; col < kp->n_cols; col++) {
+			int code;
+
+			if (!(changed & (1 << col)))
+				continue;
+
+			dev_dbg(kp->dbg_dev, "key [%d:%d] %s\n", row, col,
+				(new_state[row] & (1 << col)) ?
+				"press" : "release");
+
+			code = MATRIX_SCAN_CODE(row, col, TWL4030_ROW_SHIFT);
+			input_event(input, EV_MSC, MSC_SCAN, code);
+			input_report_key(input, kp->keymap[code],
+					 new_state[row] & (1 << col));
+		}
+		kp->kp_state[row] = new_state[row];
+	}
+	input_sync(input);
+}
+
+/*
+ * Keypad interrupt handler
+ */
+static irqreturn_t do_kp_irq(int irq, void *_kp)
+{
+	struct twl4030_keypad *kp = _kp;
+	u8 reg;
+	int ret;
+
+#ifdef CONFIG_LOCKDEP
+	/* WORKAROUND for lockdep forcing IRQF_DISABLED on us, which
+	 * we don't want and can't tolerate.  Although it might be
+	 * friendlier not to borrow this thread context...
+	 */
+	local_irq_enable();
+#endif
+
+	/* Read & Clear TWL4030 pending interrupt */
+	ret = twl4030_kpread(kp, &reg, KEYP_ISR1, 1);
+
+	/* Release all keys if I2C has gone bad or
+	 * the KEYP has gone to idle state */
+	if (ret >= 0 && (reg & KEYP_IMR1_KP))
+		twl4030_kp_scan(kp, false);
+	else
+		twl4030_kp_scan(kp, true);
+
+	return IRQ_HANDLED;
+}
+
+static int __devinit twl4030_kp_program(struct twl4030_keypad *kp)
+{
+	u8 reg;
+	int i;
+
+	/* Enable controller, with hardware decoding but not autorepeat */
+	reg = KEYP_CTRL_SOFT_NRST | KEYP_CTRL_SOFTMODEN
+		| KEYP_CTRL_TOE_EN | KEYP_CTRL_KBD_ON;
+	if (twl4030_kpwrite_u8(kp, reg, KEYP_CTRL) < 0)
+		return -EIO;
+
+	/* NOTE:  we could use sih_setup() here to package keypad
+	 * event sources as four different IRQs ... but we don't.
+	 */
+
+	/* Enable TO rising and KP rising and falling edge detection */
+	reg = KEYP_EDR_KP_BOTH | KEYP_EDR_TO_RISING;
+	if (twl4030_kpwrite_u8(kp, reg, KEYP_EDR) < 0)
+		return -EIO;
+
+	/* Set PTV prescaler Field */
+	reg = (PTV_PRESCALER << KEYP_LK_PTV_PTV_SHIFT);
+	if (twl4030_kpwrite_u8(kp, reg, KEYP_LK_PTV) < 0)
+		return -EIO;
+
+	/* Set key debounce time to 20 ms */
+	i = KEYP_PERIOD_US(20000, PTV_PRESCALER);
+	if (twl4030_kpwrite_u8(kp, i, KEYP_DEB) < 0)
+		return -EIO;
+
+	/* Set timeout period to 100 ms */
+	i = KEYP_PERIOD_US(200000, PTV_PRESCALER);
+	if (twl4030_kpwrite_u8(kp, (i & 0xFF), KEYP_TIMEOUT_L) < 0)
+		return -EIO;
+
+	if (twl4030_kpwrite_u8(kp, (i >> 8), KEYP_TIMEOUT_H) < 0)
+		return -EIO;
+
+	/*
+	 * Enable Clear-on-Read; disable remembering events that fire
+	 * after the IRQ but before our handler acks (reads) them,
+	 */
+	reg = TWL4030_SIH_CTRL_COR_MASK | TWL4030_SIH_CTRL_PENDDIS_MASK;
+	if (twl4030_kpwrite_u8(kp, reg, KEYP_SIH_CTRL) < 0)
+		return -EIO;
+
+	/* initialize key state; irqs update it from here on */
+	if (twl4030_read_kp_matrix_state(kp, kp->kp_state) < 0)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Registers keypad device with input subsystem
+ * and configures TWL4030 keypad registers
+ */
+static int __devinit twl4030_kp_probe(struct platform_device *pdev)
+{
+	struct twl4030_keypad_data *pdata = pdev->dev.platform_data;
+	const struct matrix_keymap_data *keymap_data = pdata->keymap_data;
+	struct twl4030_keypad *kp;
+	struct input_dev *input;
+	u8 reg;
+	int error;
+
+	if (!pdata || !pdata->rows || !pdata->cols ||
+	    pdata->rows > TWL4030_MAX_ROWS || pdata->cols > TWL4030_MAX_COLS) {
+		dev_err(&pdev->dev, "Invalid platform_data\n");
+		return -EINVAL;
+	}
+
+	kp = kzalloc(sizeof(*kp), GFP_KERNEL);
+	input = input_allocate_device();
+	if (!kp || !input) {
+		error = -ENOMEM;
+		goto err1;
+	}
+
+	/* Get the debug Device */
+	kp->dbg_dev = &pdev->dev;
+	kp->input = input;
+
+	kp->n_rows = pdata->rows;
+	kp->n_cols = pdata->cols;
+	kp->irq = platform_get_irq(pdev, 0);
+
+	/* setup input device */
+	__set_bit(EV_KEY, input->evbit);
+
+	/* Enable auto repeat feature of Linux input subsystem */
+	if (pdata->rep)
+		__set_bit(EV_REP, input->evbit);
+
+	input_set_capability(input, EV_MSC, MSC_SCAN);
+
+	input->name		= "TWL4030 Keypad";
+	input->phys		= "twl4030_keypad/input0";
+	input->dev.parent	= &pdev->dev;
+
+	input->id.bustype	= BUS_HOST;
+	input->id.vendor	= 0x0001;
+	input->id.product	= 0x0001;
+	input->id.version	= 0x0003;
+
+	input->keycode		= kp->keymap;
+	input->keycodesize	= sizeof(kp->keymap[0]);
+	input->keycodemax	= ARRAY_SIZE(kp->keymap);
+
+	matrix_keypad_build_keymap(keymap_data, TWL4030_ROW_SHIFT,
+				   input->keycode, input->keybit);
+
+	error = input_register_device(input);
+	if (error) {
+		dev_err(kp->dbg_dev,
+			"Unable to register twl4030 keypad device\n");
+		goto err1;
+	}
+
+	error = twl4030_kp_program(kp);
+	if (error)
+		goto err2;
+
+	/*
+	 * This ISR will always execute in kernel thread context because of
+	 * the need to access the TWL4030 over the I2C bus.
+	 *
+	 * NOTE:  we assume this host is wired to TWL4040 INT1, not INT2 ...
+	 */
+	error = request_irq(kp->irq, do_kp_irq, 0, pdev->name, kp);
+	if (error) {
+		dev_info(kp->dbg_dev, "request_irq failed for irq no=%d\n",
+			kp->irq);
+		goto err3;
+	}
+
+	/* Enable KP and TO interrupts now. */
+	reg = (u8) ~(KEYP_IMR1_KP | KEYP_IMR1_TO);
+	if (twl4030_kpwrite_u8(kp, reg, KEYP_IMR1)) {
+		error = -EIO;
+		goto err4;
+	}
+
+	platform_set_drvdata(pdev, kp);
+	return 0;
+
+err4:
+	/* mask all events - we don't care about the result */
+	(void) twl4030_kpwrite_u8(kp, 0xff, KEYP_IMR1);
+err3:
+	free_irq(kp->irq, NULL);
+err2:
+	input_unregister_device(input);
+	input = NULL;
+err1:
+	input_free_device(input);
+	kfree(kp);
+	return error;
+}
+
+static int __devexit twl4030_kp_remove(struct platform_device *pdev)
+{
+	struct twl4030_keypad *kp = platform_get_drvdata(pdev);
+
+	free_irq(kp->irq, kp);
+	input_unregister_device(kp->input);
+	platform_set_drvdata(pdev, NULL);
+	kfree(kp);
+
+	return 0;
+}
+
+/*
+ * NOTE: twl4030 are multi-function devices connected via I2C.
+ * So this device is a child of an I2C parent, thus it needs to
+ * support unplug/replug (which most platform devices don't).
+ */
+
+static struct platform_driver twl4030_kp_driver = {
+	.probe		= twl4030_kp_probe,
+	.remove		= __devexit_p(twl4030_kp_remove),
+	.driver		= {
+		.name	= "twl4030_keypad",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init twl4030_kp_init(void)
+{
+	return platform_driver_register(&twl4030_kp_driver);
+}
+module_init(twl4030_kp_init);
+
+static void __exit twl4030_kp_exit(void)
+{
+	platform_driver_unregister(&twl4030_kp_driver);
+}
+module_exit(twl4030_kp_exit);
+
+MODULE_AUTHOR("Texas Instruments");
+MODULE_DESCRIPTION("TWL4030 Keypad Driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:twl4030_keypad");
+
diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
index 0dc80ef24975..3fd21d7cb6bf 100644
--- a/include/linux/i2c/twl4030.h
+++ b/include/linux/i2c/twl4030.h
@@ -25,6 +25,9 @@
 #ifndef __TWL4030_H_
 #define __TWL4030_H_
 
+#include <linux/types.h>
+#include <linux/input/matrix_keypad.h>
+
 /*
  * Using the twl4030 core we address registers using a pair
  *	{ module id, relative register offset }
@@ -302,13 +305,17 @@ struct twl4030_madc_platform_data {
 	int		irq_line;
 };
 
+/* Boards have uniqe mappings of {col, row} --> keycode.
+ * Column and row are 4 bits, but range only from 0..7.
+ * a PERSISTENT_KEY is "always on" and never reported.
+ */
+#define PERSISTENT_KEY(c, r)	KEY((c), (r), KEY_RESERVED)
+
 struct twl4030_keypad_data {
-	int rows;
-	int cols;
-	int *keymap;
-	int irq;
-	unsigned int keymapsize;
-	unsigned int rep:1;
+	const struct matrix_keymap_data *keymap_data;
+	unsigned rows;
+	unsigned cols;
+	bool rep;
 };
 
 enum twl4030_usb_mode {
-- 
cgit v1.2.3-71-gd317


From 468de9e54a900559b55aa939a4daeaea1915e572 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 27 Aug 2009 12:07:40 -0400
Subject: nfsd41: expand solo sequence check

Compounds consisting of only a sequence operation don't need any
additional caching beyond the sequence information we store in the slot
entry.  Fix nfsd4_is_solo_sequence to identify this case correctly.

The additional check for a failed sequence in nfsd4_store_cache_entry()
is redundant, since the nfsd4_is_solo_sequence call lower down catches
this case.

The final ce_cachethis set in nfsd4_sequence is also redundant.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c       | 9 ---------
 include/linux/nfsd/xdr4.h | 2 +-
 2 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5f634d24861c..b44a2cfde6f1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -991,16 +991,10 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
 {
 	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
 	struct svc_rqst *rqstp = resp->rqstp;
-	struct nfsd4_compoundargs *args = rqstp->rq_argp;
-	struct nfsd4_op *op = &args->ops[resp->opcnt];
 	struct kvec *resv = &rqstp->rq_res.head[0];
 
 	dprintk("--> %s entry %p\n", __func__, entry);
 
-	/* Don't cache a failed OP_SEQUENCE. */
-	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
-		return;
-
 	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
 	entry->ce_opcnt = resp->opcnt;
 	entry->ce_status = resp->cstate.status;
@@ -1490,9 +1484,6 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	slot->sl_inuse = true;
 	slot->sl_seqid = seq->seqid;
 	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
-	/* Always set the cache entry cachethis for solo sequence */
-	if (nfsd4_is_solo_sequence(resp))
-		slot->sl_cache_entry.ce_cachethis = 1;
 
 	cstate->slot = slot;
 	cstate->session = session;
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index 5e4beb0deb80..3f716607c86d 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -467,7 +467,7 @@ struct nfsd4_compoundres {
 static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
 {
 	struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
-	return args->opcnt == 1;
+	return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE;
 }
 
 static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
-- 
cgit v1.2.3-71-gd317


From 7285dd7fd375763bfb8ab1ac9cf3f1206f503c16 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 28 Aug 2009 20:25:24 +0200
Subject: clocksource: Resolve cpu hotplug dead lock with TSC unstable

Martin Schwidefsky analyzed it:
To register a clocksource the clocksource_mutex is acquired and if
necessary timekeeping_notify is called to install the clocksource as
the timekeeper clock. timekeeping_notify uses stop_machine which needs
to take cpu_add_remove_lock mutex.
Starting a new cpu is done with the cpu_add_remove_lock mutex held.
native_cpu_up checks the tsc of the new cpu and if the tsc is no good
clocksource_change_rating is called. Which needs the clocksource_mutex
and the deadlock is complete.

The solution is to replace the TSC via the clocksource watchdog
mechanism. Mark the TSC as unstable and schedule the watchdog work so
it gets removed in the watchdog thread context.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
---
 arch/x86/kernel/tsc.c       |  8 +++++---
 include/linux/clocksource.h |  1 +
 kernel/time/clocksource.c   | 33 ++++++++++++++++++++++++++++++---
 3 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 968425422c46..fc3672a303d6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -767,12 +767,14 @@ void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
-		printk("Marking TSC unstable due to %s\n", reason);
+		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
 		if (clocksource_tsc.mult)
-			clocksource_change_rating(&clocksource_tsc, 0);
-		else
+			clocksource_mark_unstable(&clocksource_tsc);
+		else {
+			clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
 			clocksource_tsc.rating = 0;
+		}
 	}
 }
 
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 9ea40ff26f0e..83d2fbd81b93 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -277,6 +277,7 @@ extern struct clocksource* clocksource_get_next(void);
 extern void clocksource_change_rating(struct clocksource *cs, int rating);
 extern void clocksource_resume(void);
 extern struct clocksource * __init __weak clocksource_default_clock(void);
+extern void clocksource_mark_unstable(struct clocksource *cs);
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
 extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0c86ad6e9fb..a0af4ffcb6e5 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -149,15 +149,42 @@ static void clocksource_watchdog_work(struct work_struct *work)
 	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
 }
 
-static void clocksource_unstable(struct clocksource *cs, int64_t delta)
+static void __clocksource_unstable(struct clocksource *cs)
 {
-	printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
-	       cs->name, delta);
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
 	schedule_work(&watchdog_work);
 }
 
+static void clocksource_unstable(struct clocksource *cs, int64_t delta)
+{
+	printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
+	       cs->name, delta);
+	__clocksource_unstable(cs);
+}
+
+/**
+ * clocksource_mark_unstable - mark clocksource unstable via watchdog
+ * @cs:		clocksource to be marked unstable
+ *
+ * This function is called instead of clocksource_change_rating from
+ * cpu hotplug code to avoid a deadlock between the clocksource mutex
+ * and the cpu hotplug mutex. It defers the update of the clocksource
+ * to the watchdog thread.
+ */
+void clocksource_mark_unstable(struct clocksource *cs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&watchdog_lock, flags);
+	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
+		if (list_empty(&cs->wd_list))
+			list_add(&cs->wd_list, &watchdog_list);
+		__clocksource_unstable(cs);
+	}
+	spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
 static void clocksource_watchdog(unsigned long data)
 {
 	struct clocksource *cs;
-- 
cgit v1.2.3-71-gd317


From e55a5999ffcf72dc4d43d73618957964cb87065a Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Tue, 28 Jul 2009 17:41:53 +0800
Subject: ACPI: Handle CONFIG_ACPI=n better from linux/acpi.h

linux/acpi.h is the top level header for interfacing
with the ACPI sub-system, so acpi_disabled should be
up there instead of down in asm/acpi.h -- particularly
since asm/acpi.h doesn't exist for all architectures.

Same story for acpi_table_parse(), which is a top-level
API to Linux/ACPI.

This is necessary for building some code that
used to always depend on CONFIG_ACPI=y, but will soon
also need to build with CONFIG_ACPI=n.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/asm/acpi.h |  1 -
 include/linux/acpi.h        | 11 ++++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 20d1465a2ab0..4518dc500903 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
 
 #else /* !CONFIG_ACPI */
 
-#define acpi_disabled 1
 #define acpi_lapic 0
 #define acpi_ioapic 0
 static inline void acpi_noirq_set(void) { }
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 34321cfffeab..3fce811bf9ac 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -292,7 +292,10 @@ void __init acpi_s4_no_nvs(void);
 extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags);
 extern void acpi_early_init(void);
 
-#else	/* CONFIG_ACPI */
+#else	/* !CONFIG_ACPI */
+
+#define acpi_disabled 1
+
 static inline void acpi_early_init(void) { }
 
 static inline int early_acpi_boot_init(void)
@@ -331,5 +334,11 @@ static inline int acpi_check_mem_region(resource_size_t start,
 	return 0;
 }
 
+struct acpi_table_header;
+static inline int acpi_table_parse(char *id,
+				int (*handler)(struct acpi_table_header *))
+{
+	return -1;
+}
 #endif	/* !CONFIG_ACPI */
 #endif	/*_LINUX_ACPI_H*/
-- 
cgit v1.2.3-71-gd317


From 117a9ac777f8034d4675b821172d2ff71f6ec47a Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Fri, 14 Aug 2009 15:10:24 -0400
Subject: SFI: create linux/sfi.h

include/linux/include/sfi.h defines everything that customers
of SFI need to know in order to use the SFI suport in the kernel.

The primary API is sfi_table_parse(), where a driver or another part
of the kernel can supply a handler to parse the named table.

sfi.h also includes the currently defined table signatures and table
formats.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/sfi.h | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100644 include/linux/sfi.h

(limited to 'include/linux')

diff --git a/include/linux/sfi.h b/include/linux/sfi.h
new file mode 100644
index 000000000000..9a6f7607174e
--- /dev/null
+++ b/include/linux/sfi.h
@@ -0,0 +1,206 @@
+/* sfi.h Simple Firmware Interface */
+
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2009 Intel Corporation. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+  The full GNU General Public License is included in this distribution
+  in the file called LICENSE.GPL.
+
+  BSD LICENSE
+
+  Copyright(c) 2009 Intel Corporation. All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _LINUX_SFI_H
+#define _LINUX_SFI_H
+
+/* Table signatures reserved by the SFI specification */
+#define SFI_SIG_SYST		"SYST"
+#define SFI_SIG_FREQ		"FREQ"
+#define SFI_SIG_IDLE		"IDLE"
+#define SFI_SIG_CPUS		"CPUS"
+#define SFI_SIG_MTMR		"MTMR"
+#define SFI_SIG_MRTC		"MRTC"
+#define SFI_SIG_MMAP		"MMAP"
+#define SFI_SIG_APIC		"APIC"
+#define SFI_SIG_XSDT		"XSDT"
+#define SFI_SIG_WAKE		"WAKE"
+#define SFI_SIG_SPIB		"SPIB"
+#define SFI_SIG_I2CB		"I2CB"
+#define SFI_SIG_GPEM		"GPEM"
+
+#define SFI_SIGNATURE_SIZE	4
+#define SFI_OEM_ID_SIZE		6
+#define SFI_OEM_TABLE_ID_SIZE	8
+
+#define SFI_SYST_SEARCH_BEGIN		0x000E0000
+#define SFI_SYST_SEARCH_END		0x000FFFFF
+
+#define SFI_GET_NUM_ENTRIES(ptable, entry_type) \
+	((ptable->header.len - sizeof(struct sfi_table_header)) / \
+	(sizeof(entry_type)))
+/*
+ * Table structures must be byte-packed to match the SFI specification,
+ * as they are provided by the BIOS.
+ */
+struct sfi_table_header {
+	char	sig[SFI_SIGNATURE_SIZE];
+	u32	len;
+	u8	rev;
+	u8	csum;
+	char	oem_id[SFI_OEM_ID_SIZE];
+	char	oem_table_id[SFI_OEM_TABLE_ID_SIZE];
+} __packed;
+
+struct sfi_table_simple {
+	struct sfi_table_header		header;
+	u64				pentry[1];
+} __packed;
+
+/* Comply with UEFI spec 2.1 */
+struct sfi_mem_entry {
+	u32	type;
+	u64	phys_start;
+	u64	virt_start;
+	u64	pages;
+	u64	attrib;
+} __packed;
+
+struct sfi_cpu_table_entry {
+	u32	apic_id;
+} __packed;
+
+struct sfi_cstate_table_entry {
+	u32	hint;		/* MWAIT hint */
+	u32	latency;	/* latency in ms */
+} __packed;
+
+struct sfi_apic_table_entry {
+	u64	phys_addr;	/* phy base addr for APIC reg */
+} __packed;
+
+struct sfi_freq_table_entry {
+	u32	freq_mhz;	/* in MHZ */
+	u32	latency;	/* transition latency in ms */
+	u32	ctrl_val;	/* value to write to PERF_CTL */
+} __packed;
+
+struct sfi_wake_table_entry {
+	u64	phys_addr;	/* pointer to where the wake vector locates */
+} __packed;
+
+struct sfi_timer_table_entry {
+	u64	phys_addr;	/* phy base addr for the timer */
+	u32	freq_hz;	/* in HZ */
+	u32	irq;
+} __packed;
+
+struct sfi_rtc_table_entry {
+	u64	phys_addr;	/* phy base addr for the RTC */
+	u32	irq;
+} __packed;
+
+struct sfi_spi_table_entry {
+	u16	host_num;	/* attached to host 0, 1...*/
+	u16	cs;		/* chip select */
+	u16	irq_info;
+	char	name[16];
+	u8	dev_info[10];
+} __packed;
+
+struct sfi_i2c_table_entry {
+	u16	host_num;
+	u16	addr;		/* slave addr */
+	u16	irq_info;
+	char	name[16];
+	u8	dev_info[10];
+} __packed;
+
+struct sfi_gpe_table_entry {
+	u16	logical_id;	/* logical id */
+	u16	phys_id;	/* physical GPE id */
+} __packed;
+
+
+typedef int (*sfi_table_handler) (struct sfi_table_header *table);
+
+#ifdef CONFIG_SFI
+extern void __init sfi_init(void);
+extern int __init sfi_platform_init(void);
+extern void __init sfi_init_late(void);
+extern int sfi_table_parse(char *signature, char *oem_id, char *oem_table_id,
+				sfi_table_handler handler);
+
+extern int sfi_disabled;
+static inline void disable_sfi(void)
+{
+	sfi_disabled = 1;
+}
+
+#else /* !CONFIG_SFI */
+
+static inline void sfi_init(void)
+{
+}
+
+static inline void sfi_init_late(void)
+{
+}
+
+#define sfi_disabled	0
+
+static inline int sfi_table_parse(char *signature, char *oem_id,
+					char *oem_table_id,
+					sfi_table_handler handler)
+{
+	return -1;
+}
+
+#endif /* !CONFIG_SFI */
+
+#endif /*_LINUX_SFI_H*/
-- 
cgit v1.2.3-71-gd317


From 13e82d023c4c3f13ab1e665cbb917a7ebba8935c Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Fri, 14 Aug 2009 15:17:53 -0400
Subject: SFI: add capability to parse ACPI tables

Extend SFI to access standard ACPI tables.
(eg. the PCI MCFG) using sfi_acpi_table_parse().

Note that this is _not_ a hybrid ACPI + SFI mode.
The platform boots in either ACPI mode or SFI mode.

SFI runs only with acpi_disabled=1, which can be set
at build-time via CONFIG_ACPI=n, or at boot time by
the failure to find ACPI platform support.

So this extension simply allows SFI-platforms to
re-use existing standard table formats that happen to
be defined to live in ACPI envelopes.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/sfi/sfi_acpi.c   | 175 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sfi_acpi.h |  93 +++++++++++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 drivers/sfi/sfi_acpi.c
 create mode 100644 include/linux/sfi_acpi.h

(limited to 'include/linux')

diff --git a/drivers/sfi/sfi_acpi.c b/drivers/sfi/sfi_acpi.c
new file mode 100644
index 000000000000..34aba30eb84b
--- /dev/null
+++ b/drivers/sfi/sfi_acpi.c
@@ -0,0 +1,175 @@
+/* sfi_acpi.c Simple Firmware Interface - ACPI extensions */
+
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2009 Intel Corporation. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+  The full GNU General Public License is included in this distribution
+  in the file called LICENSE.GPL.
+
+  BSD LICENSE
+
+  Copyright(c) 2009 Intel Corporation. All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#define KMSG_COMPONENT "SFI"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <acpi/acpi.h>
+
+#include <linux/sfi.h>
+#include "sfi_core.h"
+
+/*
+ * SFI can access ACPI-defined tables via an optional ACPI XSDT.
+ *
+ * This allows re-use, and avoids re-definition, of standard tables.
+ * For example, the "MCFG" table is defined by PCI, reserved by ACPI,
+ * and is expected to be present many SFI-only systems.
+ */
+
+static struct acpi_table_xsdt *xsdt_va __read_mostly;
+
+#define XSDT_GET_NUM_ENTRIES(ptable, entry_type) \
+	((ptable->header.length - sizeof(struct acpi_table_header)) / \
+	(sizeof(entry_type)))
+
+static inline struct sfi_table_header *acpi_to_sfi_th(
+				struct acpi_table_header *th)
+{
+	return (struct sfi_table_header *)th;
+}
+
+static inline struct acpi_table_header *sfi_to_acpi_th(
+				struct sfi_table_header *th)
+{
+	return (struct acpi_table_header *)th;
+}
+
+/*
+ * sfi_acpi_parse_xsdt()
+ *
+ * Parse the ACPI XSDT for later access by sfi_acpi_table_parse().
+ */
+static int __init sfi_acpi_parse_xsdt(struct sfi_table_header *th)
+{
+	struct sfi_table_key key = SFI_ANY_KEY;
+	int tbl_cnt, i;
+	void *ret;
+
+	xsdt_va = (struct acpi_table_xsdt *)th;
+	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
+	for (i = 0; i < tbl_cnt; i++) {
+		ret = sfi_check_table(xsdt_va->table_offset_entry[i], &key);
+		if (IS_ERR(ret)) {
+			disable_sfi();
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int __init sfi_acpi_init(void)
+{
+	struct sfi_table_key xsdt_key = { .sig = SFI_SIG_XSDT };
+
+	sfi_table_parse(SFI_SIG_XSDT, NULL, NULL, sfi_acpi_parse_xsdt);
+
+	/* Only call the get_table to keep the table mapped */
+	xsdt_va = (struct acpi_table_xsdt *)sfi_get_table(&xsdt_key);
+	return 0;
+}
+
+static struct acpi_table_header *sfi_acpi_get_table(struct sfi_table_key *key)
+{
+	u32 tbl_cnt, i;
+	void *ret;
+
+	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
+	for (i = 0; i < tbl_cnt; i++) {
+		ret = sfi_check_table(xsdt_va->table_offset_entry[i], key);
+		if (!IS_ERR(ret) && ret)
+			return sfi_to_acpi_th(ret);
+	}
+
+	return NULL;
+}
+
+static void sfi_acpi_put_table(struct acpi_table_header *table)
+{
+	sfi_put_table(acpi_to_sfi_th(table));
+}
+
+/*
+ * sfi_acpi_table_parse()
+ *
+ * Find specified table in XSDT, run handler on it and return its return value
+ */
+int sfi_acpi_table_parse(char *signature, char *oem_id, char *oem_table_id,
+			int(*handler)(struct acpi_table_header *))
+{
+	struct acpi_table_header *table = NULL;
+	struct sfi_table_key key;
+	int ret = 0;
+
+	if (sfi_disabled)
+		return -1;
+
+	key.sig = signature;
+	key.oem_id = oem_id;
+	key.oem_table_id = oem_table_id;
+
+	table = sfi_acpi_get_table(&key);
+	if (!table)
+		return -EINVAL;
+
+	ret = handler(table);
+	sfi_acpi_put_table(table);
+	return ret;
+}
diff --git a/include/linux/sfi_acpi.h b/include/linux/sfi_acpi.h
new file mode 100644
index 000000000000..c4a5a8cd4469
--- /dev/null
+++ b/include/linux/sfi_acpi.h
@@ -0,0 +1,93 @@
+/* sfi.h Simple Firmware Interface */
+
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2009 Intel Corporation. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+  The full GNU General Public License is included in this distribution
+  in the file called LICENSE.GPL.
+
+  BSD LICENSE
+
+  Copyright(c) 2009 Intel Corporation. All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _LINUX_SFI_ACPI_H
+#define _LINUX_SFI_ACPI_H
+
+#ifdef CONFIG_SFI
+#include <acpi/acpi.h>		/* struct acpi_table_header */
+
+extern int sfi_acpi_table_parse(char *signature, char *oem_id,
+				char *oem_table_id,
+				int (*handler)(struct acpi_table_header *));
+
+static inline int acpi_sfi_table_parse(char *signature,
+				int (*handler)(struct acpi_table_header *))
+{
+	if (!acpi_table_parse(signature, handler))
+		return 0;
+
+	return sfi_acpi_table_parse(signature, NULL, NULL, handler);
+}
+#else /* !CONFIG_SFI */
+
+static inline int sfi_acpi_table_parse(char *signature, char *oem_id,
+				char *oem_table_id,
+				int (*handler)(struct acpi_table_header *))
+{
+	return -1;
+}
+
+static inline int acpi_sfi_table_parse(char *signature,
+				int (*handler)(struct acpi_table_header *))
+{
+	return acpi_table_parse(signature, handler);
+}
+#endif /* !CONFIG_SFI */
+
+#endif /*_LINUX_SFI_ACPI_H*/
-- 
cgit v1.2.3-71-gd317


From ad283ea4a3ce82cda2efe33163748a397b31b1eb Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 29 Aug 2009 19:09:26 -0700
Subject: async_tx: add sum check flags

Replace the flat zero_sum_result with a collection of flags to contain
the P (xor) zero-sum result, and the soon to be utilized Q (raid6 reed
solomon syndrome) zero-sum result.  Use the SUM_CHECK_ namespace instead
of DMA_ since these flags will be used on non-dma-zero-sum enabled
platforms.

Reviewed-by: Andre Noll <maan@systemlinux.org>
Acked-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/arm/include/asm/hardware/iop3xx-adma.h |  5 +++--
 arch/arm/mach-iop13xx/include/mach/adma.h   | 12 +++++++-----
 crypto/async_tx/async_xor.c                 |  4 ++--
 drivers/md/raid5.c                          |  2 +-
 drivers/md/raid5.h                          |  5 +++--
 include/linux/async_tx.h                    |  2 +-
 include/linux/dmaengine.h                   | 21 ++++++++++++++++++++-
 7 files changed, 37 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/hardware/iop3xx-adma.h b/arch/arm/include/asm/hardware/iop3xx-adma.h
index 83e6ba338e2c..26eefea02314 100644
--- a/arch/arm/include/asm/hardware/iop3xx-adma.h
+++ b/arch/arm/include/asm/hardware/iop3xx-adma.h
@@ -756,13 +756,14 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
 	hw_desc->src[0] = val;
 }
 
-static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline enum sum_check_flags
+iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
 	struct iop3xx_desc_aau *hw_desc = desc->hw_desc;
 	struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
 
 	iop_paranoia(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en));
-	return desc_ctrl.zero_result_err;
+	return desc_ctrl.zero_result_err << SUM_CHECK_P;
 }
 
 static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/arch/arm/mach-iop13xx/include/mach/adma.h b/arch/arm/mach-iop13xx/include/mach/adma.h
index 5722e86f2174..1cd31df8924d 100644
--- a/arch/arm/mach-iop13xx/include/mach/adma.h
+++ b/arch/arm/mach-iop13xx/include/mach/adma.h
@@ -428,18 +428,20 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
 	hw_desc->block_fill_data = val;
 }
 
-static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline enum sum_check_flags
+iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
 	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
 	struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
 	struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field;
+	enum sum_check_flags flags;
 
 	BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result));
 
-	if (desc_ctrl.pq_xfer_en)
-		return byte_count.zero_result_err_q;
-	else
-		return byte_count.zero_result_err;
+	flags = byte_count.zero_result_err_q << SUM_CHECK_Q;
+	flags |= byte_count.zero_result_err << SUM_CHECK_P;
+
+	return flags;
 }
 
 static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 1e96c4df7061..78fb7780272a 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -246,7 +246,7 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len)
  */
 struct dma_async_tx_descriptor *
 async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
-	      int src_cnt, size_t len, u32 *result,
+	      int src_cnt, size_t len, enum sum_check_flags *result,
 	      struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL,
@@ -304,7 +304,7 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 
 		async_tx_quiesce(&tx);
 
-		*result = page_is_zero(dest, offset, len) ? 0 : 1;
+		*result = !page_is_zero(dest, offset, len) << SUM_CHECK_P;
 
 		async_tx_sync_epilog(submit);
 		submit->flags = flags_orig;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7727954cf726..1f2a266f3cf7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2590,7 +2590,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 		 * we are done.  Otherwise update the mismatch count and repair
 		 * parity if !MD_RECOVERY_CHECK
 		 */
-		if (sh->ops.zero_sum_result == 0)
+		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
 			/* parity is correct (on disc,
 			 * not in buffer any more)
 			 */
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e7baabffee86..75f2c6c4cf90 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -2,6 +2,7 @@
 #define _RAID5_H
 
 #include <linux/raid/xor.h>
+#include <linux/dmaengine.h>
 
 /*
  *
@@ -215,8 +216,8 @@ struct stripe_head {
 	 * @target - STRIPE_OP_COMPUTE_BLK target
 	 */
 	struct stripe_operations {
-		int		   target;
-		u32		   zero_sum_result;
+		int		     target;
+		enum sum_check_flags zero_sum_result;
 	} ops;
 	struct r5dev {
 		struct bio	req;
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 00cfb637ddf2..3d21a2517518 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -148,7 +148,7 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 
 struct dma_async_tx_descriptor *
 async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
-	      int src_cnt, size_t len, u32 *result,
+	      int src_cnt, size_t len, enum sum_check_flags *result,
 	      struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 6768727d00d7..02447afcebad 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -86,6 +86,25 @@ enum dma_ctrl_flags {
 	DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
 };
 
+/**
+ * enum sum_check_bits - bit position of pq_check_flags
+ */
+enum sum_check_bits {
+	SUM_CHECK_P = 0,
+	SUM_CHECK_Q = 1,
+};
+
+/**
+ * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations
+ * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise
+ * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise
+ */
+enum sum_check_flags {
+	SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P),
+	SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q),
+};
+
+
 /**
  * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
  * See linux/cpumask.h
@@ -245,7 +264,7 @@ struct dma_device {
 		unsigned int src_cnt, size_t len, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)(
 		struct dma_chan *chan, dma_addr_t *src,	unsigned int src_cnt,
-		size_t len, u32 *result, unsigned long flags);
+		size_t len, enum sum_check_flags *result, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
 		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
 		unsigned long flags);
-- 
cgit v1.2.3-71-gd317


From 95475e57113c66aac7583925736ed2e2d58c990d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 14 Jul 2009 12:19:02 -0700
Subject: async_tx: remove walk of tx->parent chain in dma_wait_for_async_tx

We currently walk the parent chain when waiting for a given tx to
complete however this walk may race with the driver cleanup routine.
The routines in async_raid6_recov.c may fall back to the synchronous
path at any point so we need to be prepared to call async_tx_quiesce()
(which calls  dma_wait_for_async_tx).  To remove the ->parent walk we
guarantee that every time a dependency is attached ->issue_pending() is
invoked, then we can simply poll the initial descriptor until
completion.

This also allows for a lighter weight 'issue pending' implementation as
there is no longer a requirement to iterate through all the channels'
->issue_pending() routines as long as operations have been submitted in
an ordered chain.  async_tx_issue_pending() is added for this case.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 crypto/async_tx/async_tx.c | 13 +++++++------
 drivers/dma/dmaengine.c    | 45 ++++++++++-----------------------------------
 include/linux/async_tx.h   | 23 +++++++++++++++++++++++
 3 files changed, 40 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
index 6e37ad3f4417..60615fedcf5e 100644
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -77,8 +77,8 @@ static void
 async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 			struct dma_async_tx_descriptor *tx)
 {
-	struct dma_chan *chan;
-	struct dma_device *device;
+	struct dma_chan *chan = depend_tx->chan;
+	struct dma_device *device = chan->device;
 	struct dma_async_tx_descriptor *intr_tx = (void *) ~0;
 
 	/* first check to see if we can still append to depend_tx */
@@ -90,11 +90,11 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 	}
 	spin_unlock_bh(&depend_tx->lock);
 
-	if (!intr_tx)
+	/* attached dependency, flush the parent channel */
+	if (!intr_tx) {
+		device->device_issue_pending(chan);
 		return;
-
-	chan = depend_tx->chan;
-	device = chan->device;
+	}
 
 	/* see if we can schedule an interrupt
 	 * otherwise poll for completion
@@ -128,6 +128,7 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 			intr_tx->tx_submit(intr_tx);
 			async_tx_ack(intr_tx);
 		}
+		device->device_issue_pending(chan);
 	} else {
 		if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
 			panic("%s: DMA_ERROR waiting for depend_tx\n",
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 6781e8f3c064..e002e0e0d055 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -934,49 +934,24 @@ EXPORT_SYMBOL(dma_async_tx_descriptor_init);
 
 /* dma_wait_for_async_tx - spin wait for a transaction to complete
  * @tx: in-flight transaction to wait on
- *
- * This routine assumes that tx was obtained from a call to async_memcpy,
- * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped
- * and submitted).  Walking the parent chain is only meant to cover for DMA
- * drivers that do not implement the DMA_INTERRUPT capability and may race with
- * the driver's descriptor cleanup routine.
  */
 enum dma_status
 dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
 {
-	enum dma_status status;
-	struct dma_async_tx_descriptor *iter;
-	struct dma_async_tx_descriptor *parent;
+	unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000);
 
 	if (!tx)
 		return DMA_SUCCESS;
 
-	WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for"
-		  " %s\n", __func__, dma_chan_name(tx->chan));
-
-	/* poll through the dependency chain, return when tx is complete */
-	do {
-		iter = tx;
-
-		/* find the root of the unsubmitted dependency chain */
-		do {
-			parent = iter->parent;
-			if (!parent)
-				break;
-			else
-				iter = parent;
-		} while (parent);
-
-		/* there is a small window for ->parent == NULL and
-		 * ->cookie == -EBUSY
-		 */
-		while (iter->cookie == -EBUSY)
-			cpu_relax();
-
-		status = dma_sync_wait(iter->chan, iter->cookie);
-	} while (status == DMA_IN_PROGRESS || (iter != tx));
-
-	return status;
+	while (tx->cookie == -EBUSY) {
+		if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
+			pr_err("%s timeout waiting for descriptor submission\n",
+				__func__);
+			return DMA_ERROR;
+		}
+		cpu_relax();
+	}
+	return dma_sync_wait(tx->chan, tx->cookie);
 }
 EXPORT_SYMBOL_GPL(dma_wait_for_async_tx);
 
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 3d21a2517518..12a2efcbd565 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -83,6 +83,24 @@ struct async_submit_ctl {
 
 #ifdef CONFIG_DMA_ENGINE
 #define async_tx_issue_pending_all dma_issue_pending_all
+
+/**
+ * async_tx_issue_pending - send pending descriptor to the hardware channel
+ * @tx: descriptor handle to retrieve hardware context
+ *
+ * Note: any dependent operations will have already been issued by
+ * async_tx_channel_switch, or (in the case of no channel switch) will
+ * be already pending on this channel.
+ */
+static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
+{
+	if (likely(tx)) {
+		struct dma_chan *chan = tx->chan;
+		struct dma_device *dma = chan->device;
+
+		dma->device_issue_pending(chan);
+	}
+}
 #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL
 #include <asm/async_tx.h>
 #else
@@ -98,6 +116,11 @@ static inline void async_tx_issue_pending_all(void)
 	do { } while (0);
 }
 
+static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
+{
+	do { } while (0);
+}
+
 static inline struct dma_chan *
 async_tx_find_channel(struct async_submit_ctl *submit,
 		      enum dma_transaction_type tx_type, struct page **dst,
-- 
cgit v1.2.3-71-gd317


From b2f46fd8ef3dff2ab30f31126833f78b7480283a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 14 Jul 2009 12:20:36 -0700
Subject: async_tx: add support for asynchronous GF multiplication

[ Based on an original patch by Yuri Tikhonov ]

This adds support for doing asynchronous GF multiplication by adding
two additional functions to the async_tx API:

 async_gen_syndrome() does simultaneous XOR and Galois field
    multiplication of sources.

 async_syndrome_val() validates the given source buffers against known P
    and Q values.

When a request is made to run async_pq against more than the hardware
maximum number of supported sources we need to reuse the previous
generated P and Q values as sources into the next operation.  Care must
be taken to remove Q from P' and P from Q'.  For example to perform a 5
source pq op with hardware that only supports 4 sources at a time the
following approach is taken:

p, q = PQ(src0, src1, src2, src3, COEF({01}, {02}, {04}, {08}))
p', q' = PQ(p, q, q, src4, COEF({00}, {01}, {00}, {10}))

p' = p + q + q + src4 = p + src4
q' = {00}*p + {01}*q + {00}*q + {10}*src4 = q + {10}*src4

Note: 4 is the minimum acceptable maxpq otherwise we punt to
synchronous-software path.

The DMA_PREP_CONTINUE flag indicates to the driver to reuse p and q as
sources (in the above manner) and fill the remaining slots up to maxpq
with the new sources/coefficients.

Note1: Some devices have native support for P+Q continuation and can skip
this extra work.  Devices with this capability can advertise it with
dma_set_maxpq.  It is up to each driver how to handle the
DMA_PREP_CONTINUE flag.

Note2: The api supports disabling the generation of P when generating Q,
this is ignored by the synchronous path but is implemented by some dma
devices to save unnecessary writes.  In this case the continuation
algorithm is simplified to only reuse Q as a source.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Yuri Tikhonov <yur@emcraft.com>
Signed-off-by: Ilya Yanok <yanok@emcraft.com>
Reviewed-by: Andre Noll <maan@systemlinux.org>
Acked-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/crypto/async-tx-api.txt |   3 +
 arch/arm/mach-iop13xx/setup.c         |   2 +-
 crypto/async_tx/Kconfig               |   4 +
 crypto/async_tx/Makefile              |   1 +
 crypto/async_tx/async_pq.c            | 388 ++++++++++++++++++++++++++++++++++
 crypto/async_tx/async_xor.c           |   2 +-
 drivers/dma/dmaengine.c               |   4 +
 drivers/dma/iop-adma.c                |   2 +-
 include/linux/async_tx.h              |   9 +
 include/linux/dmaengine.h             |  87 +++++++-
 10 files changed, 493 insertions(+), 9 deletions(-)
 create mode 100644 crypto/async_tx/async_pq.c

(limited to 'include/linux')

diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 6b15e488c0e7..0e48e054d69a 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -64,6 +64,9 @@ xor     - xor a series of source buffers and write the result to a
 xor_val - xor a series of source buffers and set a flag if the
 	  result is zero.  The implementation attempts to prevent
 	  writes to memory
+pq	- generate the p+q (raid6 syndrome) from a series of source buffers
+pq_val  - validate that a p and or q buffer are in sync with a given series of
+	  sources
 
 3.3 Descriptor management:
 The return value is non-NULL and points to a 'descriptor' when the operation
diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c
index 9800228b71d3..2e7ca0d75f8a 100644
--- a/arch/arm/mach-iop13xx/setup.c
+++ b/arch/arm/mach-iop13xx/setup.c
@@ -506,7 +506,7 @@ void __init iop13xx_platform_init(void)
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
-			dma_cap_set(DMA_PQ_XOR, plat_data->cap_mask);
+			dma_cap_set(DMA_PQ, plat_data->cap_mask);
 			dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask);
 			dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask);
 			break;
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
index d8fb39145986..cb6d7314f198 100644
--- a/crypto/async_tx/Kconfig
+++ b/crypto/async_tx/Kconfig
@@ -14,3 +14,7 @@ config ASYNC_MEMSET
 	tristate
 	select ASYNC_CORE
 
+config ASYNC_PQ
+	tristate
+	select ASYNC_CORE
+
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
index 27baa7d52fbc..1b9926588259 100644
--- a/crypto/async_tx/Makefile
+++ b/crypto/async_tx/Makefile
@@ -2,3 +2,4 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o
 obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
 obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
 obj-$(CONFIG_ASYNC_XOR) += async_xor.o
+obj-$(CONFIG_ASYNC_PQ) += async_pq.o
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
new file mode 100644
index 000000000000..108b21efb499
--- /dev/null
+++ b/crypto/async_tx/async_pq.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright(c) 2007 Yuri Tikhonov <yur@emcraft.com>
+ * Copyright(c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/pq.h>
+#include <linux/async_tx.h>
+
+/**
+ * scribble - space to hold throwaway P buffer for synchronous gen_syndrome
+ */
+static struct page *scribble;
+
+static bool is_raid6_zero_block(struct page *p)
+{
+	return p == (void *) raid6_empty_zero_page;
+}
+
+/* the struct page *blocks[] parameter passed to async_gen_syndrome()
+ * and async_syndrome_val() contains the 'P' destination address at
+ * blocks[disks-2] and the 'Q' destination address at blocks[disks-1]
+ *
+ * note: these are macros as they are used as lvalues
+ */
+#define P(b, d) (b[d-2])
+#define Q(b, d) (b[d-1])
+
+/**
+ * do_async_gen_syndrome - asynchronously calculate P and/or Q
+ */
+static __async_inline struct dma_async_tx_descriptor *
+do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
+		      const unsigned char *scfs, unsigned int offset, int disks,
+		      size_t len, dma_addr_t *dma_src,
+		      struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct dma_device *dma = chan->device;
+	enum dma_ctrl_flags dma_flags = 0;
+	enum async_tx_flags flags_orig = submit->flags;
+	dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+	dma_async_tx_callback cb_param_orig = submit->cb_param;
+	int src_cnt = disks - 2;
+	unsigned char coefs[src_cnt];
+	unsigned short pq_src_cnt;
+	dma_addr_t dma_dest[2];
+	int src_off = 0;
+	int idx;
+	int i;
+
+	/* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
+	if (P(blocks, disks))
+		dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset,
+					   len, DMA_BIDIRECTIONAL);
+	else
+		dma_flags |= DMA_PREP_PQ_DISABLE_P;
+	if (Q(blocks, disks))
+		dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset,
+					   len, DMA_BIDIRECTIONAL);
+	else
+		dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+
+	/* convert source addresses being careful to collapse 'empty'
+	 * sources and update the coefficients accordingly
+	 */
+	for (i = 0, idx = 0; i < src_cnt; i++) {
+		if (is_raid6_zero_block(blocks[i]))
+			continue;
+		dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len,
+					    DMA_TO_DEVICE);
+		coefs[idx] = scfs[i];
+		idx++;
+	}
+	src_cnt = idx;
+
+	while (src_cnt > 0) {
+		submit->flags = flags_orig;
+		pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags));
+		/* if we are submitting additional pqs, leave the chain open,
+		 * clear the callback parameters, and leave the destination
+		 * buffers mapped
+		 */
+		if (src_cnt > pq_src_cnt) {
+			submit->flags &= ~ASYNC_TX_ACK;
+			dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
+			submit->cb_fn = NULL;
+			submit->cb_param = NULL;
+		} else {
+			dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP;
+			submit->cb_fn = cb_fn_orig;
+			submit->cb_param = cb_param_orig;
+			if (cb_fn_orig)
+				dma_flags |= DMA_PREP_INTERRUPT;
+		}
+
+		/* Since we have clobbered the src_list we are committed
+		 * to doing this asynchronously.  Drivers force forward
+		 * progress in case they can not provide a descriptor
+		 */
+		for (;;) {
+			tx = dma->device_prep_dma_pq(chan, dma_dest,
+						     &dma_src[src_off],
+						     pq_src_cnt,
+						     &coefs[src_off], len,
+						     dma_flags);
+			if (likely(tx))
+				break;
+			async_tx_quiesce(&submit->depend_tx);
+			dma_async_issue_pending(chan);
+		}
+
+		async_tx_submit(chan, tx, submit);
+		submit->depend_tx = tx;
+
+		/* drop completed sources */
+		src_cnt -= pq_src_cnt;
+		src_off += pq_src_cnt;
+
+		dma_flags |= DMA_PREP_CONTINUE;
+	}
+
+	return tx;
+}
+
+/**
+ * do_sync_gen_syndrome - synchronously calculate a raid6 syndrome
+ */
+static void
+do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+		     size_t len, struct async_submit_ctl *submit)
+{
+	void **srcs;
+	int i;
+
+	if (submit->scribble)
+		srcs = submit->scribble;
+	else
+		srcs = (void **) blocks;
+
+	for (i = 0; i < disks; i++) {
+		if (is_raid6_zero_block(blocks[i])) {
+			BUG_ON(i > disks - 3); /* P or Q can't be zero */
+			srcs[i] = blocks[i];
+		} else
+			srcs[i] = page_address(blocks[i]) + offset;
+	}
+	raid6_call.gen_syndrome(disks, len, srcs);
+	async_tx_sync_epilog(submit);
+}
+
+/**
+ * async_gen_syndrome - asynchronously calculate a raid6 syndrome
+ * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
+ * @offset: common offset into each block (src and dest) to start transaction
+ * @disks: number of blocks (including missing P or Q, see below)
+ * @len: length of operation in bytes
+ * @submit: submission/completion modifiers
+ *
+ * General note: This routine assumes a field of GF(2^8) with a
+ * primitive polynomial of 0x11d and a generator of {02}.
+ *
+ * 'disks' note: callers can optionally omit either P or Q (but not
+ * both) from the calculation by setting blocks[disks-2] or
+ * blocks[disks-1] to NULL.  When P or Q is omitted 'len' must be <=
+ * PAGE_SIZE as a temporary buffer of this size is used in the
+ * synchronous path.  'disks' always accounts for both destination
+ * buffers.
+ *
+ * 'blocks' note: if submit->scribble is NULL then the contents of
+ * 'blocks' may be overridden
+ */
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+		   size_t len, struct async_submit_ctl *submit)
+{
+	int src_cnt = disks - 2;
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+						      &P(blocks, disks), 2,
+						      blocks, src_cnt, len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	dma_addr_t *dma_src = NULL;
+
+	BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks)));
+
+	if (submit->scribble)
+		dma_src = submit->scribble;
+	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+		dma_src = (dma_addr_t *) blocks;
+
+	if (dma_src && device &&
+	    (src_cnt <= dma_maxpq(device, 0) ||
+	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0)) {
+		/* run the p+q asynchronously */
+		pr_debug("%s: (async) disks: %d len: %zu\n",
+			 __func__, disks, len);
+		return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset,
+					     disks, len, dma_src, submit);
+	}
+
+	/* run the pq synchronously */
+	pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len);
+
+	/* wait for any prerequisite operations */
+	async_tx_quiesce(&submit->depend_tx);
+
+	if (!P(blocks, disks)) {
+		P(blocks, disks) = scribble;
+		BUG_ON(len + offset > PAGE_SIZE);
+	}
+	if (!Q(blocks, disks)) {
+		Q(blocks, disks) = scribble;
+		BUG_ON(len + offset > PAGE_SIZE);
+	}
+	do_sync_gen_syndrome(blocks, offset, disks, len, submit);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(async_gen_syndrome);
+
+/**
+ * async_syndrome_val - asynchronously validate a raid6 syndrome
+ * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
+ * @offset: common offset into each block (src and dest) to start transaction
+ * @disks: number of blocks (including missing P or Q, see below)
+ * @len: length of operation in bytes
+ * @pqres: on val failure SUM_CHECK_P_RESULT and/or SUM_CHECK_Q_RESULT are set
+ * @spare: temporary result buffer for the synchronous case
+ * @submit: submission / completion modifiers
+ *
+ * The same notes from async_gen_syndrome apply to the 'blocks',
+ * and 'disks' parameters of this routine.  The synchronous path
+ * requires a temporary result buffer and submit->scribble to be
+ * specified.
+ */
+struct dma_async_tx_descriptor *
+async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
+		   size_t len, enum sum_check_flags *pqres, struct page *spare,
+		   struct async_submit_ctl *submit)
+{
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ_VAL,
+						      NULL, 0,  blocks, disks,
+						      len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx;
+	enum dma_ctrl_flags dma_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
+	dma_addr_t *dma_src = NULL;
+
+	BUG_ON(disks < 4);
+
+	if (submit->scribble)
+		dma_src = submit->scribble;
+	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+		dma_src = (dma_addr_t *) blocks;
+
+	if (dma_src && device && disks <= dma_maxpq(device, 0)) {
+		struct device *dev = device->dev;
+		dma_addr_t *pq = &dma_src[disks-2];
+		int i;
+
+		pr_debug("%s: (async) disks: %d len: %zu\n",
+			 __func__, disks, len);
+		if (!P(blocks, disks))
+			dma_flags |= DMA_PREP_PQ_DISABLE_P;
+		if (!Q(blocks, disks))
+			dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+		for (i = 0; i < disks; i++)
+			if (likely(blocks[i])) {
+				BUG_ON(is_raid6_zero_block(blocks[i]));
+				dma_src[i] = dma_map_page(dev, blocks[i],
+							  offset, len,
+							  DMA_TO_DEVICE);
+			}
+
+		for (;;) {
+			tx = device->device_prep_dma_pq_val(chan, pq, dma_src,
+							    disks - 2,
+							    raid6_gfexp,
+							    len, pqres,
+							    dma_flags);
+			if (likely(tx))
+				break;
+			async_tx_quiesce(&submit->depend_tx);
+			dma_async_issue_pending(chan);
+		}
+		async_tx_submit(chan, tx, submit);
+
+		return tx;
+	} else {
+		struct page *p_src = P(blocks, disks);
+		struct page *q_src = Q(blocks, disks);
+		enum async_tx_flags flags_orig = submit->flags;
+		dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+		void *scribble = submit->scribble;
+		void *cb_param_orig = submit->cb_param;
+		void *p, *q, *s;
+
+		pr_debug("%s: (sync) disks: %d len: %zu\n",
+			 __func__, disks, len);
+
+		/* caller must provide a temporary result buffer and
+		 * allow the input parameters to be preserved
+		 */
+		BUG_ON(!spare || !scribble);
+
+		/* wait for any prerequisite operations */
+		async_tx_quiesce(&submit->depend_tx);
+
+		/* recompute p and/or q into the temporary buffer and then
+		 * check to see the result matches the current value
+		 */
+		tx = NULL;
+		*pqres = 0;
+		if (p_src) {
+			init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+					  NULL, NULL, scribble);
+			tx = async_xor(spare, blocks, offset, disks-2, len, submit);
+			async_tx_quiesce(&tx);
+			p = page_address(p_src) + offset;
+			s = page_address(spare) + offset;
+			*pqres |= !!memcmp(p, s, len) << SUM_CHECK_P;
+		}
+
+		if (q_src) {
+			P(blocks, disks) = NULL;
+			Q(blocks, disks) = spare;
+			init_async_submit(submit, 0, NULL, NULL, NULL, scribble);
+			tx = async_gen_syndrome(blocks, offset, disks, len, submit);
+			async_tx_quiesce(&tx);
+			q = page_address(q_src) + offset;
+			s = page_address(spare) + offset;
+			*pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q;
+		}
+
+		/* restore P, Q and submit */
+		P(blocks, disks) = p_src;
+		Q(blocks, disks) = q_src;
+
+		submit->cb_fn = cb_fn_orig;
+		submit->cb_param = cb_param_orig;
+		submit->flags = flags_orig;
+		async_tx_sync_epilog(submit);
+
+		return NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(async_syndrome_val);
+
+static int __init async_pq_init(void)
+{
+	scribble = alloc_page(GFP_KERNEL);
+
+	if (scribble)
+		return 0;
+
+	pr_err("%s: failed to allocate required spare page\n", __func__);
+
+	return -ENOMEM;
+}
+
+static void __exit async_pq_exit(void)
+{
+	put_page(scribble);
+}
+
+module_init(async_pq_init);
+module_exit(async_pq_exit);
+
+MODULE_DESCRIPTION("asynchronous raid6 syndrome generation/validation");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 78fb7780272a..56b5f98da463 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -62,7 +62,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 	while (src_cnt) {
 		submit->flags = flags_orig;
 		dma_flags = 0;
-		xor_src_cnt = min(src_cnt, dma->max_xor);
+		xor_src_cnt = min(src_cnt, (int)dma->max_xor);
 		/* if we are submitting additional xors, leave the chain open,
 		 * clear the callback parameters, and leave the destination
 		 * buffer mapped
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index e002e0e0d055..cd5673d3043b 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -646,6 +646,10 @@ int dma_async_device_register(struct dma_device *device)
 		!device->device_prep_dma_xor);
 	BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) &&
 		!device->device_prep_dma_xor_val);
+	BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) &&
+		!device->device_prep_dma_pq);
+	BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) &&
+		!device->device_prep_dma_pq_val);
 	BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
 		!device->device_prep_dma_memset);
 	BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) &&
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index 6ff79a672699..4496bc606662 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1257,7 +1257,7 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
 
 	dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: "
 	  "( %s%s%s%s%s%s%s%s%s%s)\n",
-	  dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "",
+	  dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "",
 	  dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "",
 	  dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "",
 	  dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 12a2efcbd565..e6ce5f004f98 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -185,5 +185,14 @@ async_memset(struct page *dest, int val, unsigned int offset,
 
 struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit);
 
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
+		   size_t len, struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt,
+		   size_t len, enum sum_check_flags *pqres, struct page *spare,
+		   struct async_submit_ctl *submit);
+
 void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
 #endif /* _ASYNC_TX_H_ */
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 02447afcebad..ce010cd991d2 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -52,7 +52,7 @@ enum dma_status {
 enum dma_transaction_type {
 	DMA_MEMCPY,
 	DMA_XOR,
-	DMA_PQ_XOR,
+	DMA_PQ,
 	DMA_DUAL_XOR,
 	DMA_PQ_UPDATE,
 	DMA_XOR_VAL,
@@ -70,20 +70,28 @@ enum dma_transaction_type {
 
 /**
  * enum dma_ctrl_flags - DMA flags to augment operation preparation,
- * 	control completion, and communicate status.
+ *  control completion, and communicate status.
  * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of
- * 	this transaction
+ *  this transaction
  * @DMA_CTRL_ACK - the descriptor cannot be reused until the client
- * 	acknowledges receipt, i.e. has has a chance to establish any
- * 	dependency chains
+ *  acknowledges receipt, i.e. has has a chance to establish any dependency
+ *  chains
  * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s)
  * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s)
+ * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q
+ * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P
+ * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as
+ *  sources that were the result of a previous operation, in the case of a PQ
+ *  operation it continues the calculation with new sources
  */
 enum dma_ctrl_flags {
 	DMA_PREP_INTERRUPT = (1 << 0),
 	DMA_CTRL_ACK = (1 << 1),
 	DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2),
 	DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
+	DMA_PREP_PQ_DISABLE_P = (1 << 4),
+	DMA_PREP_PQ_DISABLE_Q = (1 << 5),
+	DMA_PREP_CONTINUE = (1 << 6),
 };
 
 /**
@@ -226,6 +234,7 @@ struct dma_async_tx_descriptor {
  * @global_node: list_head for global dma_device_list
  * @cap_mask: one or more dma_capability flags
  * @max_xor: maximum number of xor sources, 0 if no capability
+ * @max_pq: maximum number of PQ sources and PQ-continue capability
  * @dev_id: unique device ID
  * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
@@ -234,6 +243,8 @@ struct dma_async_tx_descriptor {
  * @device_prep_dma_memcpy: prepares a memcpy operation
  * @device_prep_dma_xor: prepares a xor operation
  * @device_prep_dma_xor_val: prepares a xor validation operation
+ * @device_prep_dma_pq: prepares a pq operation
+ * @device_prep_dma_pq_val: prepares a pqzero_sum operation
  * @device_prep_dma_memset: prepares a memset operation
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
@@ -248,7 +259,9 @@ struct dma_device {
 	struct list_head channels;
 	struct list_head global_node;
 	dma_cap_mask_t  cap_mask;
-	int max_xor;
+	unsigned short max_xor;
+	unsigned short max_pq;
+	#define DMA_HAS_PQ_CONTINUE (1 << 15)
 
 	int dev_id;
 	struct device *dev;
@@ -265,6 +278,14 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)(
 		struct dma_chan *chan, dma_addr_t *src,	unsigned int src_cnt,
 		size_t len, enum sum_check_flags *result, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
+		struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf,
+		size_t len, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_pq_val)(
+		struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf, size_t len,
+		enum sum_check_flags *pqres, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
 		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
 		unsigned long flags);
@@ -283,6 +304,60 @@ struct dma_device {
 	void (*device_issue_pending)(struct dma_chan *chan);
 };
 
+static inline void
+dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue)
+{
+	dma->max_pq = maxpq;
+	if (has_pq_continue)
+		dma->max_pq |= DMA_HAS_PQ_CONTINUE;
+}
+
+static inline bool dmaf_continue(enum dma_ctrl_flags flags)
+{
+	return (flags & DMA_PREP_CONTINUE) == DMA_PREP_CONTINUE;
+}
+
+static inline bool dmaf_p_disabled_continue(enum dma_ctrl_flags flags)
+{
+	enum dma_ctrl_flags mask = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P;
+
+	return (flags & mask) == mask;
+}
+
+static inline bool dma_dev_has_pq_continue(struct dma_device *dma)
+{
+	return (dma->max_pq & DMA_HAS_PQ_CONTINUE) == DMA_HAS_PQ_CONTINUE;
+}
+
+static unsigned short dma_dev_to_maxpq(struct dma_device *dma)
+{
+	return dma->max_pq & ~DMA_HAS_PQ_CONTINUE;
+}
+
+/* dma_maxpq - reduce maxpq in the face of continued operations
+ * @dma - dma device with PQ capability
+ * @flags - to check if DMA_PREP_CONTINUE and DMA_PREP_PQ_DISABLE_P are set
+ *
+ * When an engine does not support native continuation we need 3 extra
+ * source slots to reuse P and Q with the following coefficients:
+ * 1/ {00} * P : remove P from Q', but use it as a source for P'
+ * 2/ {01} * Q : use Q to continue Q' calculation
+ * 3/ {00} * Q : subtract Q from P' to cancel (2)
+ *
+ * In the case where P is disabled we only need 1 extra source:
+ * 1/ {01} * Q : use Q to continue Q' calculation
+ */
+static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags)
+{
+	if (dma_dev_has_pq_continue(dma) || !dmaf_continue(flags))
+		return dma_dev_to_maxpq(dma);
+	else if (dmaf_p_disabled_continue(flags))
+		return dma_dev_to_maxpq(dma) - 1;
+	else if (dmaf_continue(flags))
+		return dma_dev_to_maxpq(dma) - 3;
+	BUG();
+}
+
 /* --- public DMA engine API --- */
 
 #ifdef CONFIG_DMA_ENGINE
-- 
cgit v1.2.3-71-gd317


From 0a82a6239beecc95db6e05fe43ee62d16b381d38 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 14 Jul 2009 12:20:37 -0700
Subject: async_tx: add support for asynchronous RAID6 recovery operations

 async_raid6_2data_recov() recovers two data disk failures

 async_raid6_datap_recov() recovers a data disk and the P disk

These routines are a port of the synchronous versions found in
drivers/md/raid6recov.c.  The primary difference is breaking out the xor
operations into separate calls to async_xor.  Two helper routines are
introduced to perform scalar multiplication where needed.
async_sum_product() multiplies two sources by scalar coefficients and
then sums (xor) the result.  async_mult() simply multiplies a single
source by a scalar.

This implemention also includes, in contrast to the original
synchronous-only code, special case handling for the 4-disk and 5-disk
array cases.  In these situations the default N-disk algorithm will
present 0-source or 1-source operations to dma devices.  To cover for
dma devices where the minimum source count is 2 we implement 4-disk and
5-disk handling in the recovery code.

[ Impact: asynchronous raid6 recovery routines for 2data and datap cases ]

Cc: Yuri Tikhonov <yur@emcraft.com>
Cc: Ilya Yanok <yanok@emcraft.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Reviewed-by: Andre Noll <maan@systemlinux.org>
Acked-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/crypto/async-tx-api.txt |   4 +
 crypto/async_tx/Kconfig               |   5 +
 crypto/async_tx/Makefile              |   1 +
 crypto/async_tx/async_raid6_recov.c   | 448 ++++++++++++++++++++++++++++++++++
 include/linux/async_tx.h              |   8 +
 5 files changed, 466 insertions(+)
 create mode 100644 crypto/async_tx/async_raid6_recov.c

(limited to 'include/linux')

diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 0e48e054d69a..ba046b8fa92f 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -67,6 +67,10 @@ xor_val - xor a series of source buffers and set a flag if the
 pq	- generate the p+q (raid6 syndrome) from a series of source buffers
 pq_val  - validate that a p and or q buffer are in sync with a given series of
 	  sources
+datap	- (raid6_datap_recov) recover a raid6 data block and the p block
+	  from the given sources
+2data	- (raid6_2data_recov) recover 2 raid6 data blocks from the given
+	  sources
 
 3.3 Descriptor management:
 The return value is non-NULL and points to a 'descriptor' when the operation
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
index cb6d7314f198..e5aeb2b79e6f 100644
--- a/crypto/async_tx/Kconfig
+++ b/crypto/async_tx/Kconfig
@@ -18,3 +18,8 @@ config ASYNC_PQ
 	tristate
 	select ASYNC_CORE
 
+config ASYNC_RAID6_RECOV
+	tristate
+	select ASYNC_CORE
+	select ASYNC_PQ
+
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
index 1b9926588259..9a1a76811b80 100644
--- a/crypto/async_tx/Makefile
+++ b/crypto/async_tx/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
 obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
 obj-$(CONFIG_ASYNC_XOR) += async_xor.o
 obj-$(CONFIG_ASYNC_PQ) += async_pq.o
+obj-$(CONFIG_ASYNC_RAID6_RECOV) += async_raid6_recov.o
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
new file mode 100644
index 000000000000..0c14d48c9896
--- /dev/null
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -0,0 +1,448 @@
+/*
+ * Asynchronous RAID-6 recovery calculations ASYNC_TX API.
+ * Copyright(c) 2009 Intel Corporation
+ *
+ * based on raid6recov.c:
+ *   Copyright 2002 H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/pq.h>
+#include <linux/async_tx.h>
+
+static struct dma_async_tx_descriptor *
+async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
+		  size_t len, struct async_submit_ctl *submit)
+{
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+						      &dest, 1, srcs, 2, len);
+	struct dma_device *dma = chan ? chan->device : NULL;
+	const u8 *amul, *bmul;
+	u8 ax, bx;
+	u8 *a, *b, *c;
+
+	if (dma) {
+		dma_addr_t dma_dest[2];
+		dma_addr_t dma_src[2];
+		struct device *dev = dma->dev;
+		struct dma_async_tx_descriptor *tx;
+		enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
+
+		dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+		dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
+		dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
+		tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef,
+					     len, dma_flags);
+		if (tx) {
+			async_tx_submit(chan, tx, submit);
+			return tx;
+		}
+	}
+
+	/* run the operation synchronously */
+	async_tx_quiesce(&submit->depend_tx);
+	amul = raid6_gfmul[coef[0]];
+	bmul = raid6_gfmul[coef[1]];
+	a = page_address(srcs[0]);
+	b = page_address(srcs[1]);
+	c = page_address(dest);
+
+	while (len--) {
+		ax    = amul[*a++];
+		bx    = bmul[*b++];
+		*c++ = ax ^ bx;
+	}
+
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *
+async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
+	   struct async_submit_ctl *submit)
+{
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+						      &dest, 1, &src, 1, len);
+	struct dma_device *dma = chan ? chan->device : NULL;
+	const u8 *qmul; /* Q multiplier table */
+	u8 *d, *s;
+
+	if (dma) {
+		dma_addr_t dma_dest[2];
+		dma_addr_t dma_src[1];
+		struct device *dev = dma->dev;
+		struct dma_async_tx_descriptor *tx;
+		enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
+
+		dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+		dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
+		tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef,
+					     len, dma_flags);
+		if (tx) {
+			async_tx_submit(chan, tx, submit);
+			return tx;
+		}
+	}
+
+	/* no channel available, or failed to allocate a descriptor, so
+	 * perform the operation synchronously
+	 */
+	async_tx_quiesce(&submit->depend_tx);
+	qmul  = raid6_gfmul[coef];
+	d = page_address(dest);
+	s = page_address(src);
+
+	while (len--)
+		*d++ = qmul[*s++];
+
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_4(size_t bytes, int faila, int failb, struct page **blocks,
+	      struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *a, *b;
+	struct page *srcs[2];
+	unsigned char coef[2];
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+
+	p = blocks[4-2];
+	q = blocks[4-1];
+
+	a = blocks[faila];
+	b = blocks[failb];
+
+	/* in the 4 disk case P + Pxy == P and Q + Qxy == Q */
+	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+	srcs[0] = p;
+	srcs[1] = q;
+	coef[0] = raid6_gfexi[failb-faila];
+	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	tx = async_sum_product(b, srcs, coef, bytes, submit);
+
+	/* Dy = P+Pxy+Dx */
+	srcs[0] = p;
+	srcs[1] = b;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_ZERO_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(a, srcs, 0, 2, bytes, submit);
+
+	return tx;
+
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks,
+	      struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *g, *dp, *dq;
+	struct page *srcs[2];
+	unsigned char coef[2];
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+	int uninitialized_var(good);
+	int i;
+
+	for (i = 0; i < 3; i++) {
+		if (i == faila || i == failb)
+			continue;
+		else {
+			good = i;
+			break;
+		}
+	}
+	BUG_ON(i >= 3);
+
+	p = blocks[5-2];
+	q = blocks[5-1];
+	g = blocks[good];
+
+	/* Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for delta p and
+	 * delta q
+	 */
+	dp = blocks[faila];
+	dq = blocks[failb];
+
+	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	tx = async_memcpy(dp, g, 0, 0, bytes, submit);
+	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+
+	/* compute P + Pxy */
+	srcs[0] = dp;
+	srcs[1] = p;
+	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
+			  scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	/* compute Q + Qxy */
+	srcs[0] = dq;
+	srcs[1] = q;
+	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
+			  scribble);
+	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	coef[0] = raid6_gfexi[failb-faila];
+	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	tx = async_sum_product(dq, srcs, coef, bytes, submit);
+
+	/* Dy = P+Pxy+Dx */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	return tx;
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_n(int disks, size_t bytes, int faila, int failb,
+	      struct page **blocks, struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *dp, *dq;
+	struct page *srcs[2];
+	unsigned char coef[2];
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+
+	p = blocks[disks-2];
+	q = blocks[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = blocks[faila];
+	blocks[faila] = (void *)raid6_empty_zero_page;
+	blocks[disks-2] = dp;
+	dq = blocks[failb];
+	blocks[failb] = (void *)raid6_empty_zero_page;
+	blocks[disks-1] = dq;
+
+	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+
+	/* Restore pointer table */
+	blocks[faila]   = dp;
+	blocks[failb]   = dq;
+	blocks[disks-2] = p;
+	blocks[disks-1] = q;
+
+	/* compute P + Pxy */
+	srcs[0] = dp;
+	srcs[1] = p;
+	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
+			  scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	/* compute Q + Qxy */
+	srcs[0] = dq;
+	srcs[1] = q;
+	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
+			  scribble);
+	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	coef[0] = raid6_gfexi[failb-faila];
+	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	tx = async_sum_product(dq, srcs, coef, bytes, submit);
+
+	/* Dy = P+Pxy+Dx */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	return tx;
+}
+
+/**
+ * async_raid6_2data_recov - asynchronously calculate two missing data blocks
+ * @disks: number of disks in the RAID-6 array
+ * @bytes: block size
+ * @faila: first failed drive index
+ * @failb: second failed drive index
+ * @blocks: array of source pointers where the last two entries are p and q
+ * @submit: submission/completion modifiers
+ */
+struct dma_async_tx_descriptor *
+async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+			struct page **blocks, struct async_submit_ctl *submit)
+{
+	BUG_ON(faila == failb);
+	if (failb < faila)
+		swap(faila, failb);
+
+	pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
+
+	/* we need to preserve the contents of 'blocks' for the async
+	 * case, so punt to synchronous if a scribble buffer is not available
+	 */
+	if (!submit->scribble) {
+		void **ptrs = (void **) blocks;
+		int i;
+
+		async_tx_quiesce(&submit->depend_tx);
+		for (i = 0; i < disks; i++)
+			ptrs[i] = page_address(blocks[i]);
+
+		raid6_2data_recov(disks, bytes, faila, failb, ptrs);
+
+		async_tx_sync_epilog(submit);
+
+		return NULL;
+	}
+
+	switch (disks) {
+	case 4:
+		/* dma devices do not uniformly understand a zero source pq
+		 * operation (in contrast to the synchronous case), so
+		 * explicitly handle the 4 disk special case
+		 */
+		return __2data_recov_4(bytes, faila, failb, blocks, submit);
+	case 5:
+		/* dma devices do not uniformly understand a single
+		 * source pq operation (in contrast to the synchronous
+		 * case), so explicitly handle the 5 disk special case
+		 */
+		return __2data_recov_5(bytes, faila, failb, blocks, submit);
+	default:
+		return __2data_recov_n(disks, bytes, faila, failb, blocks, submit);
+	}
+}
+EXPORT_SYMBOL_GPL(async_raid6_2data_recov);
+
+/**
+ * async_raid6_datap_recov - asynchronously calculate a data and the 'p' block
+ * @disks: number of disks in the RAID-6 array
+ * @bytes: block size
+ * @faila: failed drive index
+ * @blocks: array of source pointers where the last two entries are p and q
+ * @submit: submission/completion modifiers
+ */
+struct dma_async_tx_descriptor *
+async_raid6_datap_recov(int disks, size_t bytes, int faila,
+			struct page **blocks, struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *dq;
+	u8 coef;
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+	struct page *srcs[2];
+
+	pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
+
+	/* we need to preserve the contents of 'blocks' for the async
+	 * case, so punt to synchronous if a scribble buffer is not available
+	 */
+	if (!scribble) {
+		void **ptrs = (void **) blocks;
+		int i;
+
+		async_tx_quiesce(&submit->depend_tx);
+		for (i = 0; i < disks; i++)
+			ptrs[i] = page_address(blocks[i]);
+
+		raid6_datap_recov(disks, bytes, faila, ptrs);
+
+		async_tx_sync_epilog(submit);
+
+		return NULL;
+	}
+
+	p = blocks[disks-2];
+	q = blocks[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = blocks[faila];
+	blocks[faila] = (void *)raid6_empty_zero_page;
+	blocks[disks-1] = dq;
+
+	/* in the 4 disk case we only need to perform a single source
+	 * multiplication
+	 */
+	if (disks == 4) {
+		int good = faila == 0 ? 1 : 0;
+		struct page *g = blocks[good];
+
+		init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+		tx = async_memcpy(p, g, 0, 0, bytes, submit);
+
+		init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+		tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+	} else {
+		init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+		tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+	}
+
+	/* Restore pointer table */
+	blocks[faila]   = dq;
+	blocks[disks-1] = q;
+
+	/* calculate g^{-faila} */
+	coef = raid6_gfinv[raid6_gfexp[faila]];
+
+	srcs[0] = dq;
+	srcs[1] = q;
+	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
+			  scribble);
+	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	tx = async_mult(dq, dq, coef, bytes, submit);
+
+	srcs[0] = p;
+	srcs[1] = dq;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(p, srcs, 0, 2, bytes, submit);
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_raid6_datap_recov);
+
+MODULE_AUTHOR("Dan Williams <dan.j.williams@intel.com>");
+MODULE_DESCRIPTION("asynchronous RAID-6 recovery api");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index e6ce5f004f98..866e61c4e2e0 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -194,5 +194,13 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt,
 		   size_t len, enum sum_check_flags *pqres, struct page *spare,
 		   struct async_submit_ctl *submit);
 
+struct dma_async_tx_descriptor *
+async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb,
+			struct page **ptrs, struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_raid6_datap_recov(int src_num, size_t bytes, int faila,
+			struct page **ptrs, struct async_submit_ctl *submit);
+
 void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
 #endif /* _ASYNC_TX_H_ */
-- 
cgit v1.2.3-71-gd317


From acdfcd04d9df7d084ff752f82afad6ed4ad5f363 Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Fri, 28 Aug 2009 14:28:54 +0300
Subject: SLUB: fix ARCH_KMALLOC_MINALIGN cases 64 and 256

If the minalign is 64 bytes, then the 96 byte cache should not be created
because it would conflict with the 128 byte cache.

If the minalign is 256 bytes, patching the size_index table should not
result in a buffer overrun.

The calculation "(i - 1) / 8" used to access size_index[] is moved to
a separate function as suggested by Christoph Lameter.

Acked-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/slub_def.h |  6 ++----
 mm/slub.c                | 30 ++++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 4dcbc2c71491..aa5d4a69d461 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -152,12 +152,10 @@ static __always_inline int kmalloc_index(size_t size)
 	if (size <= KMALLOC_MIN_SIZE)
 		return KMALLOC_SHIFT_LOW;
 
-#if KMALLOC_MIN_SIZE <= 64
-	if (size > 64 && size <= 96)
+	if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
 		return 1;
-	if (size > 128 && size <= 192)
+	if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
 		return 2;
-#endif
 	if (size <=          8) return 3;
 	if (size <=         16) return 4;
 	if (size <=         32) return 5;
diff --git a/mm/slub.c b/mm/slub.c
index e16c9fb1f48b..be493bd63c31 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2825,6 +2825,11 @@ static s8 size_index[24] = {
 	2	/* 192 */
 };
 
+static inline int size_index_elem(size_t bytes)
+{
+	return (bytes - 1) / 8;
+}
+
 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
 {
 	int index;
@@ -2833,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
 		if (!size)
 			return ZERO_SIZE_PTR;
 
-		index = size_index[(size - 1) / 8];
+		index = size_index[size_index_elem(size)];
 	} else
 		index = fls(size - 1);
 
@@ -3188,10 +3193,12 @@ void __init kmem_cache_init(void)
 	slab_state = PARTIAL;
 
 	/* Caches that are not of the two-to-the-power-of size */
-	if (KMALLOC_MIN_SIZE <= 64) {
+	if (KMALLOC_MIN_SIZE <= 32) {
 		create_kmalloc_cache(&kmalloc_caches[1],
 				"kmalloc-96", 96, GFP_NOWAIT);
 		caches++;
+	}
+	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[2],
 				"kmalloc-192", 192, GFP_NOWAIT);
 		caches++;
@@ -3218,17 +3225,28 @@ void __init kmem_cache_init(void)
 	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
 		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
 
-	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
-		size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
+	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
+		int elem = size_index_elem(i);
+		if (elem >= ARRAY_SIZE(size_index))
+			break;
+		size_index[elem] = KMALLOC_SHIFT_LOW;
+	}
 
-	if (KMALLOC_MIN_SIZE == 128) {
+	if (KMALLOC_MIN_SIZE == 64) {
+		/*
+		 * The 96 byte size cache is not used if the alignment
+		 * is 64 byte.
+		 */
+		for (i = 64 + 8; i <= 96; i += 8)
+			size_index[size_index_elem(i)] = 7;
+	} else if (KMALLOC_MIN_SIZE == 128) {
 		/*
 		 * The 192 byte sized cache is not used if the alignment
 		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
 		 * instead.
 		 */
 		for (i = 128 + 8; i <= 192; i += 8)
-			size_index[(i - 1) / 8] = 8;
+			size_index[size_index_elem(i)] = 8;
 	}
 
 	slab_state = UP;
-- 
cgit v1.2.3-71-gd317


From e500011ffa191d662ac64d4ada6a5187b3180e16 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 30 Aug 2009 13:19:12 -0700
Subject: timers: Drop a function prototype

Drop prototype for non-existent next_timer_interrupt() function.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm <akpm@linux-foundation.org>
LKML-Reference: <4A9ADEC0.70306@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index be62ec2ebea5..a2d1eb6cb3f0 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -173,11 +173,6 @@ extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires);
  */
 #define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)
 
-/*
- * Return when the next timer-wheel timeout occurs (in absolute jiffies),
- * locks the timer base:
- */
-extern unsigned long next_timer_interrupt(void);
 /*
  * Return when the next timer-wheel timeout occurs (in absolute jiffies),
  * locks the timer base and does the comparison against the given
-- 
cgit v1.2.3-71-gd317


From 8aa84ad8d6c740a04386f599694609ee4998e82e Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Fri, 24 Jul 2009 15:25:05 +0200
Subject: [CPUFREQ] Introduce global, not per core:
 /sys/devices/system/cpu/cpufreq

Currently everything in the cpufreq layer is per core based.
This does not reflect reality, for example ondemand on conservative
governors have global sysfs variables.

Introduce a global cpufreq directory and add the kobject to the governor
struct, so that governors can easily access it.
The directory is initialized in the cpufreq_core_init initcall and thus will
always be created if cpufreq is compiled in, even if no cpufreq driver is
active later.

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq.c |  9 ++++++++-
 include/linux/cpufreq.h   | 10 ++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index bbd5c2164ab6..4da28444b235 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -686,6 +686,9 @@ static struct attribute *default_attrs[] = {
 	NULL
 };
 
+struct kobject *cpufreq_global_kobject;
+EXPORT_SYMBOL(cpufreq_global_kobject);
+
 #define to_policy(k) container_of(k, struct cpufreq_policy, kobj)
 #define to_attr(a) container_of(a, struct freq_attr, attr)
 
@@ -1935,7 +1938,11 @@ static int __init cpufreq_core_init(void)
 		per_cpu(policy_cpu, cpu) = -1;
 		init_rwsem(&per_cpu(cpu_policy_rwsem, cpu));
 	}
+
+	cpufreq_global_kobject = kobject_create_and_add("cpufreq",
+						&cpu_sysdev_class.kset.kobj);
+	BUG_ON(!cpufreq_global_kobject);
+
 	return 0;
 }
-
 core_initcall(cpufreq_core_init);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 161042746afc..44717eb47639 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -65,6 +65,9 @@ static inline int cpufreq_unregister_notifier(struct notifier_block *nb,
 
 struct cpufreq_governor;
 
+/* /sys/devices/system/cpu/cpufreq: entry point for global variables */
+extern struct kobject *cpufreq_global_kobject;
+
 #define CPUFREQ_ETERNAL			(-1)
 struct cpufreq_cpuinfo {
 	unsigned int		max_freq;
@@ -274,6 +277,13 @@ struct freq_attr {
 	ssize_t (*store)(struct cpufreq_policy *, const char *, size_t count);
 };
 
+struct global_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct kobject *kobj,
+			struct attribute *attr, char *buf);
+	ssize_t (*store)(struct kobject *a, struct attribute *b,
+			 const char *c, size_t count);
+};
 
 /*********************************************************************
  *                        CPUFREQ 2.6. INTERFACE                     *
-- 
cgit v1.2.3-71-gd317


From 6d703a81ad5fdd102334751ddacb053ecc6ff046 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 1 Sep 2009 17:52:57 -0700
Subject: ide: convert to ->proc_fops

->read_proc, ->write_proc are going away, ->proc_fops should be used instead.

The only tricky place is IDENTIFY handling: if for some reason
taskfile_lib_get_identify() fails, buffer _is_ changed and at least
first byte is overwritten. Emulate old behaviour with returning
that first byte to userspace and reporting length=1 despite overall -E.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ide/ide-cd.c          |  28 +++-
 drivers/ide/ide-disk_proc.c   | 129 +++++++++++------
 drivers/ide/ide-floppy_proc.c |  30 ++--
 drivers/ide/ide-proc.c        | 330 +++++++++++++++++++++++++++---------------
 drivers/ide/ide-tape.c        |  31 ++--
 include/linux/ide.h           |  24 +--
 6 files changed, 365 insertions(+), 207 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index ad0ab0c0a493..b79ca419d8d9 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -30,6 +30,7 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/timer.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/errno.h>
@@ -1389,19 +1390,30 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive)
 	return capacity * sectors_per_frame;
 }
 
-static int proc_idecd_read_capacity(char *page, char **start, off_t off,
-					int count, int *eof, void *data)
+static int idecd_capacity_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t *drive = data;
-	int len;
+	ide_drive_t *drive = m->private;
 
-	len = sprintf(page, "%llu\n", (long long)ide_cdrom_capacity(drive));
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_printf(m, "%llu\n", (long long)ide_cdrom_capacity(drive));
+	return 0;
+}
+
+static int idecd_capacity_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, idecd_capacity_proc_show, PDE(inode)->data);
 }
 
+static const struct file_operations idecd_capacity_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= idecd_capacity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static ide_proc_entry_t idecd_proc[] = {
-	{ "capacity", S_IFREG|S_IRUGO, proc_idecd_read_capacity, NULL },
-	{ NULL, 0, NULL, NULL }
+	{ "capacity", S_IFREG|S_IRUGO, &idecd_capacity_proc_fops },
+	{}
 };
 
 static ide_proc_entry_t *ide_cd_proc_entries(ide_drive_t *drive)
diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c
index 19f263bf0a9e..60b0590ccc9c 100644
--- a/drivers/ide/ide-disk_proc.c
+++ b/drivers/ide/ide-disk_proc.c
@@ -1,5 +1,6 @@
 #include <linux/kernel.h>
 #include <linux/ide.h>
+#include <linux/seq_file.h>
 
 #include "ide-disk.h"
 
@@ -37,77 +38,117 @@ static int get_smart_data(ide_drive_t *drive, u8 *buf, u8 sub_cmd)
 	return ide_raw_taskfile(drive, &cmd, buf, 1);
 }
 
-static int proc_idedisk_read_cache
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int idedisk_cache_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t	*drive = (ide_drive_t *) data;
-	char		*out = page;
-	int		len;
+	ide_drive_t	*drive = (ide_drive_t *) m->private;
 
 	if (drive->dev_flags & IDE_DFLAG_ID_READ)
-		len = sprintf(out, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2);
+		seq_printf(m, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2);
 	else
-		len = sprintf(out, "(none)\n");
+		seq_printf(m, "(none)\n");
+	return 0;
+}
 
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+static int idedisk_cache_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, idedisk_cache_proc_show, PDE(inode)->data);
 }
 
-static int proc_idedisk_read_capacity
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static const struct file_operations idedisk_cache_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= idedisk_cache_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int idedisk_capacity_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t*drive = (ide_drive_t *)data;
-	int len;
+	ide_drive_t*drive = (ide_drive_t *)m->private;
 
-	len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive));
+	seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive));
+	return 0;
+}
 
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+static int idedisk_capacity_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, idedisk_capacity_proc_show, PDE(inode)->data);
 }
 
-static int proc_idedisk_read_smart(char *page, char **start, off_t off,
-				   int count, int *eof, void *data, u8 sub_cmd)
+static const struct file_operations idedisk_capacity_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= idedisk_capacity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __idedisk_proc_show(struct seq_file *m, ide_drive_t *drive, u8 sub_cmd)
 {
-	ide_drive_t	*drive = (ide_drive_t *)data;
-	int		len = 0, i = 0;
+	u8 *buf;
+
+	buf = kmalloc(SECTOR_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
 
 	(void)smart_enable(drive);
 
-	if (get_smart_data(drive, page, sub_cmd) == 0) {
-		unsigned short *val = (unsigned short *) page;
-		char *out = (char *)val + SECTOR_SIZE;
-
-		page = out;
-		do {
-			out += sprintf(out, "%04x%c", le16_to_cpu(*val),
-				       (++i & 7) ? ' ' : '\n');
-			val += 1;
-		} while (i < SECTOR_SIZE / 2);
-		len = out - page;
+	if (get_smart_data(drive, buf, sub_cmd) == 0) {
+		__le16 *val = (__le16 *)buf;
+		int i;
+
+		for (i = 0; i < SECTOR_SIZE / 2; i++) {
+			seq_printf(m, "%04x%c", le16_to_cpu(val[i]),
+					(i % 8) == 7 ? '\n' : ' ');
+		}
 	}
+	kfree(buf);
+	return 0;
+}
 
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+static int idedisk_sv_proc_show(struct seq_file *m, void *v)
+{
+	return __idedisk_proc_show(m, m->private, ATA_SMART_READ_VALUES);
 }
 
-static int proc_idedisk_read_sv
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int idedisk_sv_proc_open(struct inode *inode, struct file *file)
 {
-	return proc_idedisk_read_smart(page, start, off, count, eof, data,
-				       ATA_SMART_READ_VALUES);
+	return single_open(file, idedisk_sv_proc_show, PDE(inode)->data);
 }
 
-static int proc_idedisk_read_st
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static const struct file_operations idedisk_sv_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= idedisk_sv_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int idedisk_st_proc_show(struct seq_file *m, void *v)
 {
-	return proc_idedisk_read_smart(page, start, off, count, eof, data,
-				       ATA_SMART_READ_THRESHOLDS);
+	return __idedisk_proc_show(m, m->private, ATA_SMART_READ_THRESHOLDS);
 }
 
+static int idedisk_st_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, idedisk_st_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations idedisk_st_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= idedisk_st_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 ide_proc_entry_t ide_disk_proc[] = {
-	{ "cache",	  S_IFREG|S_IRUGO, proc_idedisk_read_cache,    NULL },
-	{ "capacity",	  S_IFREG|S_IRUGO, proc_idedisk_read_capacity, NULL },
-	{ "geometry",	  S_IFREG|S_IRUGO, proc_ide_read_geometry,     NULL },
-	{ "smart_values", S_IFREG|S_IRUSR, proc_idedisk_read_sv,       NULL },
-	{ "smart_thresholds", S_IFREG|S_IRUSR, proc_idedisk_read_st,   NULL },
-	{ NULL, 0, NULL, NULL }
+	{ "cache",	  S_IFREG|S_IRUGO, &idedisk_cache_proc_fops	},
+	{ "capacity",	  S_IFREG|S_IRUGO, &idedisk_capacity_proc_fops	},
+	{ "geometry",	  S_IFREG|S_IRUGO, &ide_geometry_proc_fops	},
+	{ "smart_values", S_IFREG|S_IRUSR, &idedisk_sv_proc_fops	},
+	{ "smart_thresholds", S_IFREG|S_IRUSR, &idedisk_st_proc_fops	},
+	{}
 };
 
 ide_devset_rw_field(bios_cyl, bios_cyl);
diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c
index fcd4d8153df5..d711d9b883de 100644
--- a/drivers/ide/ide-floppy_proc.c
+++ b/drivers/ide/ide-floppy_proc.c
@@ -1,22 +1,34 @@
 #include <linux/kernel.h>
 #include <linux/ide.h>
+#include <linux/seq_file.h>
 
 #include "ide-floppy.h"
 
-static int proc_idefloppy_read_capacity(char *page, char **start, off_t off,
-		int count, int *eof, void *data)
+static int idefloppy_capacity_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t*drive = (ide_drive_t *)data;
-	int len;
+	ide_drive_t*drive = (ide_drive_t *)m->private;
 
-	len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive));
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive));
+	return 0;
 }
 
+static int idefloppy_capacity_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, idefloppy_capacity_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations idefloppy_capacity_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= idefloppy_capacity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 ide_proc_entry_t ide_floppy_proc[] = {
-	{ "capacity",	S_IFREG|S_IRUGO, proc_idefloppy_read_capacity,	NULL },
-	{ "geometry",	S_IFREG|S_IRUGO, proc_ide_read_geometry,	NULL },
-	{ NULL, 0, NULL, NULL }
+	{ "capacity",	S_IFREG|S_IRUGO, &idefloppy_capacity_proc_fops	},
+	{ "geometry",	S_IFREG|S_IRUGO, &ide_geometry_proc_fops	},
+	{}
 };
 
 ide_devset_rw_field(bios_cyl, bios_cyl);
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index 021de41655e6..28d09a5d8450 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -30,11 +30,9 @@
 
 static struct proc_dir_entry *proc_ide_root;
 
-static int proc_ide_read_imodel
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int ide_imodel_proc_show(struct seq_file *m, void *v)
 {
-	ide_hwif_t	*hwif = (ide_hwif_t *) data;
-	int		len;
+	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
 	const char	*name;
 
 	switch (hwif->chipset) {
@@ -53,63 +51,108 @@ static int proc_ide_read_imodel
 	case ide_acorn:		name = "acorn";		break;
 	default:		name = "(unknown)";	break;
 	}
-	len = sprintf(page, "%s\n", name);
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_printf(m, "%s\n", name);
+	return 0;
 }
 
-static int proc_ide_read_mate
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int ide_imodel_proc_open(struct inode *inode, struct file *file)
 {
-	ide_hwif_t	*hwif = (ide_hwif_t *) data;
-	int		len;
+	return single_open(file, ide_imodel_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations ide_imodel_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_imodel_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int ide_mate_proc_show(struct seq_file *m, void *v)
+{
+	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
 
 	if (hwif && hwif->mate)
-		len = sprintf(page, "%s\n", hwif->mate->name);
+		seq_printf(m, "%s\n", hwif->mate->name);
 	else
-		len = sprintf(page, "(none)\n");
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+		seq_printf(m, "(none)\n");
+	return 0;
+}
+
+static int ide_mate_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_mate_proc_show, PDE(inode)->data);
 }
 
-static int proc_ide_read_channel
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static const struct file_operations ide_mate_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_mate_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int ide_channel_proc_show(struct seq_file *m, void *v)
 {
-	ide_hwif_t	*hwif = (ide_hwif_t *) data;
-	int		len;
+	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
 
-	page[0] = hwif->channel ? '1' : '0';
-	page[1] = '\n';
-	len = 2;
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_printf(m, "%c\n", hwif->channel ? '1' : '0');
+	return 0;
 }
 
-static int proc_ide_read_identify
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int ide_channel_proc_open(struct inode *inode, struct file *file)
 {
-	ide_drive_t	*drive = (ide_drive_t *)data;
-	int		len = 0, i = 0;
-	int		err = 0;
+	return single_open(file, ide_channel_proc_show, PDE(inode)->data);
+}
 
-	len = sprintf(page, "\n");
+static const struct file_operations ide_channel_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_channel_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
-	if (drive) {
-		__le16 *val = (__le16 *)page;
+static int ide_identify_proc_show(struct seq_file *m, void *v)
+{
+	ide_drive_t *drive = (ide_drive_t *)m->private;
+	u8 *buf;
 
-		err = taskfile_lib_get_identify(drive, page);
-		if (!err) {
-			char *out = (char *)page + SECTOR_SIZE;
+	if (!drive) {
+		seq_putc(m, '\n');
+		return 0;
+	}
 
-			page = out;
-			do {
-				out += sprintf(out, "%04x%c",
-					le16_to_cpup(val), (++i & 7) ? ' ' : '\n');
-				val += 1;
-			} while (i < SECTOR_SIZE / 2);
-			len = out - page;
+	buf = kmalloc(SECTOR_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	if (taskfile_lib_get_identify(drive, buf) == 0) {
+		__le16 *val = (__le16 *)buf;
+		int i;
+
+		for (i = 0; i < SECTOR_SIZE / 2; i++) {
+			seq_printf(m, "%04x%c", le16_to_cpu(val[i]),
+					(i % 8) == 7 ? '\n' : ' ');
 		}
-	}
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	} else
+		seq_putc(m, buf[0]);
+	kfree(buf);
+	return 0;
+}
+
+static int ide_identify_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_identify_proc_show, PDE(inode)->data);
 }
 
+static const struct file_operations ide_identify_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_identify_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /**
  *	ide_find_setting	-	find a specific setting
  *	@st: setting table pointer
@@ -240,22 +283,20 @@ static void proc_ide_settings_warn(void)
 	warned = 1;
 }
 
-static int proc_ide_read_settings
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int ide_settings_proc_show(struct seq_file *m, void *v)
 {
 	const struct ide_proc_devset *setting, *g, *d;
 	const struct ide_devset *ds;
-	ide_drive_t	*drive = (ide_drive_t *) data;
-	char		*out = page;
-	int		len, rc, mul_factor, div_factor;
+	ide_drive_t	*drive = (ide_drive_t *) m->private;
+	int		rc, mul_factor, div_factor;
 
 	proc_ide_settings_warn();
 
 	mutex_lock(&ide_setting_mtx);
 	g = ide_generic_settings;
 	d = drive->settings;
-	out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n");
-	out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n");
+	seq_printf(m, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n");
+	seq_printf(m, "----\t\t\t-----\t\t---\t\t---\t\t----\n");
 	while (g->name || (d && d->name)) {
 		/* read settings in the alphabetical order */
 		if (g->name && d && d->name) {
@@ -269,31 +310,35 @@ static int proc_ide_read_settings
 			setting = g++;
 		mul_factor = setting->mulf ? setting->mulf(drive) : 1;
 		div_factor = setting->divf ? setting->divf(drive) : 1;
-		out += sprintf(out, "%-24s", setting->name);
+		seq_printf(m, "%-24s", setting->name);
 		rc = ide_read_setting(drive, setting);
 		if (rc >= 0)
-			out += sprintf(out, "%-16d", rc * mul_factor / div_factor);
+			seq_printf(m, "%-16d", rc * mul_factor / div_factor);
 		else
-			out += sprintf(out, "%-16s", "write-only");
-		out += sprintf(out, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor);
+			seq_printf(m, "%-16s", "write-only");
+		seq_printf(m, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor);
 		ds = setting->setting;
 		if (ds->get)
-			out += sprintf(out, "r");
+			seq_printf(m, "r");
 		if (ds->set)
-			out += sprintf(out, "w");
-		out += sprintf(out, "\n");
+			seq_printf(m, "w");
+		seq_printf(m, "\n");
 	}
-	len = out - page;
 	mutex_unlock(&ide_setting_mtx);
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	return 0;
+}
+
+static int ide_settings_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_settings_proc_show, PDE(inode)->data);
 }
 
 #define MAX_LEN	30
 
-static int proc_ide_write_settings(struct file *file, const char __user *buffer,
-				   unsigned long count, void *data)
+static ssize_t ide_settings_proc_write(struct file *file, const char __user *buffer,
+				       size_t count, loff_t *pos)
 {
-	ide_drive_t	*drive = (ide_drive_t *) data;
+	ide_drive_t	*drive = (ide_drive_t *) PDE(file->f_path.dentry->d_inode)->data;
 	char		name[MAX_LEN + 1];
 	int		for_real = 0, mul_factor, div_factor;
 	unsigned long	n;
@@ -388,63 +433,104 @@ static int proc_ide_write_settings(struct file *file, const char __user *buffer,
 	return count;
 parse_error:
 	free_page((unsigned long)buf);
-	printk("proc_ide_write_settings(): parse error\n");
+	printk("%s(): parse error\n", __func__);
 	return -EINVAL;
 }
 
-int proc_ide_read_capacity
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static const struct file_operations ide_settings_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_settings_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= ide_settings_proc_write,
+};
+
+static int ide_capacity_proc_show(struct seq_file *m, void *v)
 {
-	int len = sprintf(page, "%llu\n", (long long)0x7fffffff);
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_printf(m, "%llu\n", (long long)0x7fffffff);
+	return 0;
 }
 
-EXPORT_SYMBOL_GPL(proc_ide_read_capacity);
+static int ide_capacity_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_capacity_proc_show, NULL);
+}
 
-int proc_ide_read_geometry
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+const struct file_operations ide_capacity_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_capacity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+EXPORT_SYMBOL_GPL(ide_capacity_proc_fops);
+
+static int ide_geometry_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t	*drive = (ide_drive_t *) data;
-	char		*out = page;
-	int		len;
+	ide_drive_t	*drive = (ide_drive_t *) m->private;
 
-	out += sprintf(out, "physical     %d/%d/%d\n",
+	seq_printf(m, "physical     %d/%d/%d\n",
 			drive->cyl, drive->head, drive->sect);
-	out += sprintf(out, "logical      %d/%d/%d\n",
+	seq_printf(m, "logical      %d/%d/%d\n",
 			drive->bios_cyl, drive->bios_head, drive->bios_sect);
+	return 0;
+}
 
-	len = out - page;
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+static int ide_geometry_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_geometry_proc_show, PDE(inode)->data);
 }
 
-EXPORT_SYMBOL(proc_ide_read_geometry);
+const struct file_operations ide_geometry_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_geometry_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+EXPORT_SYMBOL(ide_geometry_proc_fops);
 
-static int proc_ide_read_dmodel
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int ide_dmodel_proc_show(struct seq_file *seq, void *v)
 {
-	ide_drive_t	*drive = (ide_drive_t *) data;
+	ide_drive_t	*drive = (ide_drive_t *) seq->private;
 	char		*m = (char *)&drive->id[ATA_ID_PROD];
-	int		len;
 
-	len = sprintf(page, "%.40s\n", m[0] ? m : "(none)");
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_printf(seq, "%.40s\n", m[0] ? m : "(none)");
+	return 0;
+}
+
+static int ide_dmodel_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_dmodel_proc_show, PDE(inode)->data);
 }
 
-static int proc_ide_read_driver
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static const struct file_operations ide_dmodel_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_dmodel_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int ide_driver_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t		*drive = (ide_drive_t *)data;
+	ide_drive_t		*drive = (ide_drive_t *)m->private;
 	struct device		*dev = &drive->gendev;
 	struct ide_driver	*ide_drv;
-	int			len;
 
 	if (dev->driver) {
 		ide_drv = to_ide_driver(dev->driver);
-		len = sprintf(page, "%s version %s\n",
+		seq_printf(m, "%s version %s\n",
 				dev->driver->name, ide_drv->version);
 	} else
-		len = sprintf(page, "ide-default version 0.9.newide\n");
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+		seq_printf(m, "ide-default version 0.9.newide\n");
+	return 0;
+}
+
+static int ide_driver_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_driver_proc_show, PDE(inode)->data);
 }
 
 static int ide_replace_subdriver(ide_drive_t *drive, const char *driver)
@@ -474,10 +560,10 @@ static int ide_replace_subdriver(ide_drive_t *drive, const char *driver)
 	return ret;
 }
 
-static int proc_ide_write_driver
-	(struct file *file, const char __user *buffer, unsigned long count, void *data)
+static ssize_t ide_driver_proc_write(struct file *file, const char __user *buffer,
+				     size_t count, loff_t *pos)
 {
-	ide_drive_t	*drive = (ide_drive_t *) data;
+	ide_drive_t	*drive = (ide_drive_t *) PDE(file->f_path.dentry->d_inode)->data;
 	char name[32];
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -492,12 +578,19 @@ static int proc_ide_write_driver
 	return count;
 }
 
-static int proc_ide_read_media
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static const struct file_operations ide_driver_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_driver_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= ide_driver_proc_write,
+};
+
+static int ide_media_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t	*drive = (ide_drive_t *) data;
+	ide_drive_t	*drive = (ide_drive_t *) m->private;
 	const char	*media;
-	int		len;
 
 	switch (drive->media) {
 	case ide_disk:		media = "disk\n";	break;
@@ -507,20 +600,30 @@ static int proc_ide_read_media
 	case ide_optical:	media = "optical\n";	break;
 	default:		media = "UNKNOWN\n";	break;
 	}
-	strcpy(page, media);
-	len = strlen(media);
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_puts(m, media);
+	return 0;
+}
+
+static int ide_media_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ide_media_proc_show, PDE(inode)->data);
 }
 
+static const struct file_operations ide_media_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ide_media_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static ide_proc_entry_t generic_drive_entries[] = {
-	{ "driver",	S_IFREG|S_IRUGO,	 proc_ide_read_driver,
-						 proc_ide_write_driver },
-	{ "identify",	S_IFREG|S_IRUSR,	 proc_ide_read_identify, NULL },
-	{ "media",	S_IFREG|S_IRUGO,	 proc_ide_read_media,    NULL },
-	{ "model",	S_IFREG|S_IRUGO,	 proc_ide_read_dmodel,   NULL },
-	{ "settings",	S_IFREG|S_IRUSR|S_IWUSR, proc_ide_read_settings,
-						 proc_ide_write_settings },
-	{ NULL,	0, NULL, NULL }
+	{ "driver",	S_IFREG|S_IRUGO,	 &ide_driver_proc_fops	},
+	{ "identify",	S_IFREG|S_IRUSR,	 &ide_identify_proc_fops},
+	{ "media",	S_IFREG|S_IRUGO,	 &ide_media_proc_fops	},
+	{ "model",	S_IFREG|S_IRUGO,	 &ide_dmodel_proc_fops	},
+	{ "settings",	S_IFREG|S_IRUSR|S_IWUSR, &ide_settings_proc_fops},
+	{}
 };
 
 static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void *data)
@@ -530,11 +633,8 @@ static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p
 	if (!dir || !p)
 		return;
 	while (p->name != NULL) {
-		ent = create_proc_entry(p->name, p->mode, dir);
+		ent = proc_create_data(p->name, p->mode, dir, p->proc_fops, data);
 		if (!ent) return;
-		ent->data = data;
-		ent->read_proc = p->read_proc;
-		ent->write_proc = p->write_proc;
 		p++;
 	}
 }
@@ -617,10 +717,10 @@ void ide_proc_unregister_device(ide_drive_t *drive)
 }
 
 static ide_proc_entry_t hwif_entries[] = {
-	{ "channel",	S_IFREG|S_IRUGO,	proc_ide_read_channel,	NULL },
-	{ "mate",	S_IFREG|S_IRUGO,	proc_ide_read_mate,	NULL },
-	{ "model",	S_IFREG|S_IRUGO,	proc_ide_read_imodel,	NULL },
-	{ NULL,	0, NULL, NULL }
+	{ "channel",	S_IFREG|S_IRUGO,	&ide_channel_proc_fops	},
+	{ "mate",	S_IFREG|S_IRUGO,	&ide_mate_proc_fops	},
+	{ "model",	S_IFREG|S_IRUGO,	&ide_imodel_proc_fops	},
+	{}
 };
 
 void ide_proc_register_port(ide_hwif_t *hwif)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 7b2032bc357b..9d6f62baac27 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -31,6 +31,7 @@
 #include <linux/major.h>
 #include <linux/errno.h>
 #include <linux/genhd.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
@@ -1816,22 +1817,32 @@ static void ide_tape_release(struct device *dev)
 }
 
 #ifdef CONFIG_IDE_PROC_FS
-static int proc_idetape_read_name
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int idetape_name_proc_show(struct seq_file *m, void *v)
 {
-	ide_drive_t	*drive = (ide_drive_t *) data;
+	ide_drive_t	*drive = (ide_drive_t *) m->private;
 	idetape_tape_t	*tape = drive->driver_data;
-	char		*out = page;
-	int		len;
 
-	len = sprintf(out, "%s\n", tape->name);
-	PROC_IDE_READ_RETURN(page, start, off, count, eof, len);
+	seq_printf(m, "%s\n", tape->name);
+	return 0;
+}
+
+static int idetape_name_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, idetape_name_proc_show, PDE(inode)->data);
 }
 
+static const struct file_operations idetape_name_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= idetape_name_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static ide_proc_entry_t idetape_proc[] = {
-	{ "capacity",	S_IFREG|S_IRUGO,	proc_ide_read_capacity, NULL },
-	{ "name",	S_IFREG|S_IRUGO,	proc_idetape_read_name,	NULL },
-	{ NULL, 0, NULL, NULL }
+	{ "capacity",	S_IFREG|S_IRUGO,	&ide_capacity_proc_fops	},
+	{ "name",	S_IFREG|S_IRUGO,	&idetape_name_proc_fops	},
+	{}
 };
 
 static ide_proc_entry_t *ide_tape_proc_entries(ide_drive_t *drive)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 803c1ae31237..e4135d6e0556 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -919,8 +919,7 @@ __IDE_PROC_DEVSET(_name, _min, _max, NULL, NULL)
 typedef struct {
 	const char	*name;
 	mode_t		mode;
-	read_proc_t	*read_proc;
-	write_proc_t	*write_proc;
+	const struct file_operations *proc_fops;
 } ide_proc_entry_t;
 
 void proc_ide_create(void);
@@ -932,24 +931,8 @@ void ide_proc_unregister_port(ide_hwif_t *);
 void ide_proc_register_driver(ide_drive_t *, struct ide_driver *);
 void ide_proc_unregister_driver(ide_drive_t *, struct ide_driver *);
 
-read_proc_t proc_ide_read_capacity;
-read_proc_t proc_ide_read_geometry;
-
-/*
- * Standard exit stuff:
- */
-#define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) \
-{					\
-	len -= off;			\
-	if (len < count) {		\
-		*eof = 1;		\
-		if (len <= 0)		\
-			return 0;	\
-	} else				\
-		len = count;		\
-	*start = page + off;		\
-	return len;			\
-}
+extern const struct file_operations ide_capacity_proc_fops;
+extern const struct file_operations ide_geometry_proc_fops;
 #else
 static inline void proc_ide_create(void) { ; }
 static inline void proc_ide_destroy(void) { ; }
@@ -961,7 +944,6 @@ static inline void ide_proc_register_driver(ide_drive_t *drive,
 					    struct ide_driver *driver) { ; }
 static inline void ide_proc_unregister_driver(ide_drive_t *drive,
 					      struct ide_driver *driver) { ; }
-#define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0;
 #endif
 
 enum {
-- 
cgit v1.2.3-71-gd317


From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Tue, 1 Sep 2009 18:25:07 -0700
Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86

Move tboot.h from asm to linux to fix the build errors of intel_txt
patch on non-X86 platforms. Remove the tboot code from generic code
init/main.c and kernel/cpu.c.

Signed-off-by: Shane Wang <shane.wang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig              |   4 +
 arch/x86/include/asm/tboot.h  | 197 ------------------------------------------
 arch/x86/kernel/reboot.c      |   3 +-
 arch/x86/kernel/setup.c       |   3 +-
 arch/x86/kernel/smpboot.c     |   2 +-
 arch/x86/kernel/tboot.c       |  58 ++++++++++---
 drivers/acpi/acpica/hwsleep.c |   2 +-
 drivers/pci/dmar.c            |   2 +-
 drivers/pci/intel-iommu.c     |   2 +-
 include/linux/tboot.h         | 162 ++++++++++++++++++++++++++++++++++
 init/main.c                   |   3 -
 kernel/cpu.c                  |   6 +-
 security/Kconfig              |   2 +-
 13 files changed, 221 insertions(+), 225 deletions(-)
 delete mode 100644 arch/x86/include/asm/tboot.h
 create mode 100644 include/linux/tboot.h

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 738bdc6b0f8b..b66f2102c35d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	def_bool y
 
+config HAVE_INTEL_TXT
+	def_bool y
+	depends on EXPERIMENTAL && DMAR && ACPI
+
 # Use the generic interrupt handling code in kernel/irq/:
 config GENERIC_HARDIRQS
 	bool
diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h
deleted file mode 100644
index b13929d4e5f4..000000000000
--- a/arch/x86/include/asm/tboot.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * tboot.h: shared data structure with tboot and kernel and functions
- *          used by kernel for runtime support of Intel(R) Trusted
- *          Execution Technology
- *
- * Copyright (c) 2006-2009, Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-
-#ifndef _ASM_TBOOT_H
-#define _ASM_TBOOT_H
-
-#include <acpi/acpi.h>
-
-/* these must have the values from 0-5 in this order */
-enum {
-	TB_SHUTDOWN_REBOOT = 0,
-	TB_SHUTDOWN_S5,
-	TB_SHUTDOWN_S4,
-	TB_SHUTDOWN_S3,
-	TB_SHUTDOWN_HALT,
-	TB_SHUTDOWN_WFS
-};
-
-#ifdef CONFIG_INTEL_TXT
-
-/* used to communicate between tboot and the launched kernel */
-
-#define TB_KEY_SIZE             64   /* 512 bits */
-
-#define MAX_TB_MAC_REGIONS      32
-
-struct tboot_mac_region {
-	u64  start;         /* must be 64 byte -aligned */
-	u32  size;          /* must be 64 byte -granular */
-} __packed;
-
-/* GAS - Generic Address Structure (ACPI 2.0+) */
-struct tboot_acpi_generic_address {
-	u8  space_id;
-	u8  bit_width;
-	u8  bit_offset;
-	u8  access_width;
-	u64 address;
-} __packed;
-
-/*
- * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec
- * (http://www.acpi.info/)
- */
-struct tboot_acpi_sleep_info {
-	struct tboot_acpi_generic_address pm1a_cnt_blk;
-	struct tboot_acpi_generic_address pm1b_cnt_blk;
-	struct tboot_acpi_generic_address pm1a_evt_blk;
-	struct tboot_acpi_generic_address pm1b_evt_blk;
-	u16 pm1a_cnt_val;
-	u16 pm1b_cnt_val;
-	u64 wakeup_vector;
-	u32 vector_width;
-	u64 kernel_s3_resume_vector;
-} __packed;
-
-/*
- * shared memory page used for communication between tboot and kernel
- */
-struct tboot {
-	/*
-	 * version 3+ fields:
-	 */
-
-	/* TBOOT_UUID */
-	u8 uuid[16];
-
-	/* version number: 5 is current */
-	u32 version;
-
-	/* physical addr of tb_log_t log */
-	u32 log_addr;
-
-	/*
-	 * physical addr of entry point for tboot shutdown and
-	 * type of shutdown (TB_SHUTDOWN_*) being requested
-	 */
-	u32 shutdown_entry;
-	u32 shutdown_type;
-
-	/* kernel-specified ACPI info for Sx shutdown */
-	struct tboot_acpi_sleep_info acpi_sinfo;
-
-	/* tboot location in memory (physical) */
-	u32 tboot_base;
-	u32 tboot_size;
-
-	/* memory regions (phys addrs) for tboot to MAC on S3 */
-	u8 num_mac_regions;
-	struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS];
-
-
-	/*
-	 * version 4+ fields:
-	 */
-
-	/* symmetric key for use by kernel; will be encrypted on S3 */
-	u8 s3_key[TB_KEY_SIZE];
-
-
-	/*
-	 * version 5+ fields:
-	 */
-
-	/* used to 4byte-align num_in_wfs */
-	u8 reserved_align[3];
-
-	/* number of processors in wait-for-SIPI */
-	u32 num_in_wfs;
-} __packed;
-
-/*
- * UUID for tboot data struct to facilitate matching
- * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is
- * represented as {} in the char array used here
- */
-#define TBOOT_UUID	{0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\
-			 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8}
-
-extern struct tboot *tboot;
-
-static inline int tboot_enabled(void)
-{
-	return tboot != NULL;
-}
-
-extern void tboot_probe(void);
-extern void tboot_create_trampoline(void);
-extern void tboot_shutdown(u32 shutdown_type);
-extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control);
-extern int tboot_wait_for_aps(int num_aps);
-extern struct acpi_table_header *tboot_get_dmar_table(
-				      struct acpi_table_header *dmar_tbl);
-extern int tboot_force_iommu(void);
-
-#else     /* CONFIG_INTEL_TXT */
-
-static inline int tboot_enabled(void)
-{
-	return 0;
-}
-
-static inline void tboot_probe(void)
-{
-}
-
-static inline void tboot_create_trampoline(void)
-{
-}
-
-static inline void tboot_shutdown(u32 shutdown_type)
-{
-}
-
-static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control,
-			       u32 pm1b_control)
-{
-}
-
-static inline int tboot_wait_for_aps(int num_aps)
-{
-	return 0;
-}
-
-static inline struct acpi_table_header *tboot_get_dmar_table(
-					struct acpi_table_header *dmar_tbl)
-{
-	return dmar_tbl;
-}
-
-static inline int tboot_force_iommu(void)
-{
-	return 0;
-}
-
-#endif /* !CONFIG_INTEL_TXT */
-
-#endif /* _ASM_TBOOT_H */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9de01c5d9794..18ce5c04242a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/efi.h>
+#include <linux/tboot.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -24,8 +25,6 @@
 # include <asm/iommu.h>
 #endif
 
-#include <asm/tboot.h>
-
 /*
  * Power off function, if any
  */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80d6e9e32483..6ce0d6f38f7f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -66,6 +66,7 @@
 
 #include <linux/percpu.h>
 #include <linux/crash_dump.h>
+#include <linux/tboot.h>
 
 #include <video/edid.h>
 
@@ -145,8 +146,6 @@ struct boot_params __initdata boot_params;
 struct boot_params boot_params;
 #endif
 
-#include <asm/tboot.h>
-
 /*
  * Machine setup..
  */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 61cc40887c48..7d9d8eea20a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
 #include <linux/bootmem.h>
 #include <linux/err.h>
 #include <linux/nmi.h>
+#include <linux/tboot.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -62,7 +63,6 @@
 #include <asm/vmi.h>
 #include <asm/apic.h>
 #include <asm/setup.h>
-#include <asm/tboot.h>
 #include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
 
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2e760ca7b01..86c9f91b48ae 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -22,11 +22,14 @@
 #include <linux/dma_remapping.h>
 #include <linux/init_task.h>
 #include <linux/spinlock.h>
+#include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>
 #include <linux/pfn.h>
 #include <linux/mm.h>
+#include <linux/tboot.h>
 
 #include <asm/trampoline.h>
 #include <asm/processor.h>
@@ -36,7 +39,6 @@
 #include <asm/fixmap.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
-#include <asm/tboot.h>
 #include <asm/e820.h>
 #include <asm/io.h>
 
@@ -154,13 +156,10 @@ static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
 	return 0;
 }
 
-void tboot_create_trampoline(void)
+static void tboot_create_trampoline(void)
 {
 	u32 map_base, map_size;
 
-	if (!tboot_enabled())
-		return;
-
 	/* Create identity map for tboot shutdown code. */
 	map_base = PFN_DOWN(tboot->tboot_base);
 	map_size = PFN_UP(tboot->tboot_size);
@@ -295,21 +294,58 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
 	tboot_shutdown(acpi_shutdown_map[sleep_state]);
 }
 
-int tboot_wait_for_aps(int num_aps)
+static atomic_t ap_wfs_count;
+
+static int tboot_wait_for_aps(int num_aps)
 {
 	unsigned long timeout;
 
+	timeout = AP_WAIT_TIMEOUT*HZ;
+	while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+	       timeout) {
+		mdelay(1);
+		timeout--;
+	}
+
+	if (timeout)
+		pr_warning("tboot wait for APs timeout\n");
+
+	return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
+			unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_DYING:
+		atomic_inc(&ap_wfs_count);
+		if (num_online_cpus() == 1)
+			if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+				return NOTIFY_BAD;
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block tboot_cpu_notifier __cpuinitdata =
+{
+	.notifier_call = tboot_cpu_callback,
+};
+
+static __init int tboot_late_init(void)
+{
 	if (!tboot_enabled())
 		return 0;
 
-	timeout = jiffies + AP_WAIT_TIMEOUT*HZ;
-	while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
-	       time_before(jiffies, timeout))
-		cpu_relax();
+	tboot_create_trampoline();
 
-	return time_before(jiffies, timeout) ? 0 : 1;
+	atomic_set(&ap_wfs_count, 0);
+	register_hotcpu_notifier(&tboot_cpu_notifier);
+	return 0;
 }
 
+late_initcall(tboot_late_init);
+
 /*
  * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
  */
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index 8c01dd3724e0..cc22f9a585b0 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -45,7 +45,7 @@
 #include <acpi/acpi.h>
 #include "accommon.h"
 #include "actables.h"
-#include <asm/tboot.h>
+#include <linux/tboot.h>
 
 #define _COMPONENT          ACPI_HARDWARE
 ACPI_MODULE_NAME("hwsleep")
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 0cbc5fd26c3c..ab99783dccec 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -33,7 +33,7 @@
 #include <linux/timer.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
-#include <asm/tboot.h>
+#include <linux/tboot.h>
 
 #undef PREFIX
 #define PREFIX "DMAR:"
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 2dc72a6d7412..833509b53527 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -37,8 +37,8 @@
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 #include <linux/sysdev.h>
+#include <linux/tboot.h>
 #include <asm/cacheflush.h>
-#include <asm/tboot.h>
 #include <asm/iommu.h>
 #include "pci.h"
 
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
new file mode 100644
index 000000000000..bf2a0c748878
--- /dev/null
+++ b/include/linux/tboot.h
@@ -0,0 +1,162 @@
+/*
+ * tboot.h: shared data structure with tboot and kernel and functions
+ *          used by kernel for runtime support of Intel(R) Trusted
+ *          Execution Technology
+ *
+ * Copyright (c) 2006-2009, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#ifndef _LINUX_TBOOT_H
+#define _LINUX_TBOOT_H
+
+/* these must have the values from 0-5 in this order */
+enum {
+	TB_SHUTDOWN_REBOOT = 0,
+	TB_SHUTDOWN_S5,
+	TB_SHUTDOWN_S4,
+	TB_SHUTDOWN_S3,
+	TB_SHUTDOWN_HALT,
+	TB_SHUTDOWN_WFS
+};
+
+#ifdef CONFIG_INTEL_TXT
+#include <acpi/acpi.h>
+/* used to communicate between tboot and the launched kernel */
+
+#define TB_KEY_SIZE             64   /* 512 bits */
+
+#define MAX_TB_MAC_REGIONS      32
+
+struct tboot_mac_region {
+	u64  start;         /* must be 64 byte -aligned */
+	u32  size;          /* must be 64 byte -granular */
+} __packed;
+
+/* GAS - Generic Address Structure (ACPI 2.0+) */
+struct tboot_acpi_generic_address {
+	u8  space_id;
+	u8  bit_width;
+	u8  bit_offset;
+	u8  access_width;
+	u64 address;
+} __packed;
+
+/*
+ * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec
+ * (http://www.acpi.info/)
+ */
+struct tboot_acpi_sleep_info {
+	struct tboot_acpi_generic_address pm1a_cnt_blk;
+	struct tboot_acpi_generic_address pm1b_cnt_blk;
+	struct tboot_acpi_generic_address pm1a_evt_blk;
+	struct tboot_acpi_generic_address pm1b_evt_blk;
+	u16 pm1a_cnt_val;
+	u16 pm1b_cnt_val;
+	u64 wakeup_vector;
+	u32 vector_width;
+	u64 kernel_s3_resume_vector;
+} __packed;
+
+/*
+ * shared memory page used for communication between tboot and kernel
+ */
+struct tboot {
+	/*
+	 * version 3+ fields:
+	 */
+
+	/* TBOOT_UUID */
+	u8 uuid[16];
+
+	/* version number: 5 is current */
+	u32 version;
+
+	/* physical addr of tb_log_t log */
+	u32 log_addr;
+
+	/*
+	 * physical addr of entry point for tboot shutdown and
+	 * type of shutdown (TB_SHUTDOWN_*) being requested
+	 */
+	u32 shutdown_entry;
+	u32 shutdown_type;
+
+	/* kernel-specified ACPI info for Sx shutdown */
+	struct tboot_acpi_sleep_info acpi_sinfo;
+
+	/* tboot location in memory (physical) */
+	u32 tboot_base;
+	u32 tboot_size;
+
+	/* memory regions (phys addrs) for tboot to MAC on S3 */
+	u8 num_mac_regions;
+	struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS];
+
+
+	/*
+	 * version 4+ fields:
+	 */
+
+	/* symmetric key for use by kernel; will be encrypted on S3 */
+	u8 s3_key[TB_KEY_SIZE];
+
+
+	/*
+	 * version 5+ fields:
+	 */
+
+	/* used to 4byte-align num_in_wfs */
+	u8 reserved_align[3];
+
+	/* number of processors in wait-for-SIPI */
+	u32 num_in_wfs;
+} __packed;
+
+/*
+ * UUID for tboot data struct to facilitate matching
+ * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is
+ * represented as {} in the char array used here
+ */
+#define TBOOT_UUID	{0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\
+			 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8}
+
+extern struct tboot *tboot;
+
+static inline int tboot_enabled(void)
+{
+	return tboot != NULL;
+}
+
+extern void tboot_probe(void);
+extern void tboot_shutdown(u32 shutdown_type);
+extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control);
+extern struct acpi_table_header *tboot_get_dmar_table(
+				      struct acpi_table_header *dmar_tbl);
+extern int tboot_force_iommu(void);
+
+#else
+
+#define tboot_probe()			do { } while (0)
+#define tboot_shutdown(shutdown_type)	do { } while (0)
+#define tboot_sleep(sleep_state, pm1a_control, pm1b_control)	\
+					do { } while (0)
+#define tboot_get_dmar_table(dmar_tbl)	(dmar_tbl)
+#define tboot_force_iommu()		0
+
+#endif /* !CONFIG_INTEL_TXT */
+
+#endif /* _LINUX_TBOOT_H */
diff --git a/init/main.c b/init/main.c
index 56ada27c4f47..2c5ade79eb81 100644
--- a/init/main.c
+++ b/init/main.c
@@ -73,7 +73,6 @@
 #include <asm/io.h>
 #include <asm/bugs.h>
 #include <asm/setup.h>
-#include <asm/tboot.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
 
@@ -716,8 +715,6 @@ asmlinkage void __init start_kernel(void)
 
 	ftrace_init();
 
-	tboot_create_trampoline();
-
 	/* Do the rest non-__init'ed, we're now alive */
 	rest_init();
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index ff071e022a85..67a60076dd7e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,7 +14,6 @@
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
-#include <asm/tboot.h>
 
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -377,7 +376,7 @@ static cpumask_var_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
 {
-	int cpu, first_cpu, error, num_cpus = 0;
+	int cpu, first_cpu, error;
 
 	error = stop_machine_create();
 	if (error)
@@ -392,7 +391,6 @@ int disable_nonboot_cpus(void)
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
 			continue;
-		num_cpus++;
 		error = _cpu_down(cpu, 1);
 		if (!error) {
 			cpumask_set_cpu(cpu, frozen_cpus);
@@ -403,8 +401,6 @@ int disable_nonboot_cpus(void)
 			break;
 		}
 	}
-	/* ensure all CPUs have gone into wait-for-SIPI */
-	error |= tboot_wait_for_aps(num_cpus);
 
 	if (!error) {
 		BUG_ON(num_online_cpus() > 1);
diff --git a/security/Kconfig b/security/Kconfig
index 6631774672c1..5721847a7a62 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -115,7 +115,7 @@ config SECURITY_ROOTPLUG
 
 config INTEL_TXT
 	bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)"
-	depends on EXPERIMENTAL && X86 && DMAR && ACPI
+	depends on HAVE_INTEL_TXT
 	help
 	  This option enables support for booting the kernel with the
 	  Trusted Boot (tboot) module. This will utilize
-- 
cgit v1.2.3-71-gd317


From a649637c73a36174287a403cdda7607177d64523 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 28 Aug 2009 08:45:01 -0400
Subject: nfsd41: bound forechannel drc size by memory usage

By using the requested ca_maxresponsesize_cached * ca_maxresponses to bound
a forechannel drc request size, clients can tailor a session to usage.

For example, an I/O session (READ/WRITE only) can have a much smaller
ca_maxresponsesize_cached (for only WRITE compound responses) and a lot larger
ca_maxresponses to service a large in-flight data window.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c        | 66 +++++++++++++++++++++++++++++++++-------------
 include/linux/nfsd/state.h |  8 ++++--
 2 files changed, 54 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b44a2cfde6f1..02b3ddd0bee3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -414,34 +414,64 @@ gen_sessionid(struct nfsd4_session *ses)
 }
 
 /*
- * Give the client the number of slots it requests bound by
- * NFSD_MAX_SLOTS_PER_SESSION and by nfsd_drc_max_mem.
+ * The protocol defines ca_maxresponssize_cached to include the size of
+ * the rpc header, but all we need to cache is the data starting after
+ * the end of the initial SEQUENCE operation--the rest we regenerate
+ * each time.  Therefore we can advertise a ca_maxresponssize_cached
+ * value that is the number of bytes in our cache plus a few additional
+ * bytes.  In order to stay on the safe side, and not promise more than
+ * we can cache, those additional bytes must be the minimum possible: 24
+ * bytes of rpc header (xid through accept state, with AUTH_NULL
+ * verifier), 12 for the compound header (with zero-length tag), and 44
+ * for the SEQUENCE op response:
+ */
+#define NFSD_MIN_HDR_SEQ_SZ  (24 + 12 + 44)
+
+/*
+ * Give the client the number of ca_maxresponsesize_cached slots it
+ * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
+ * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
+ * than NFSD_MAX_SLOTS_PER_SESSION.
  *
- * If we run out of reserved DRC memory we should (up to a point) re-negotiate
- * active sessions and reduce their slot usage to make rooom for new
- * connections. For now we just fail the create session.
+ * If we run out of reserved DRC memory we should (up to a point)
+ * re-negotiate active sessions and reduce their slot usage to make
+ * rooom for new connections. For now we just fail the create session.
  */
-static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
+static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
 {
-	int mem;
+	int mem, size = fchan->maxresp_cached;
 
 	if (fchan->maxreqs < 1)
 		return nfserr_inval;
-	else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
-		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
 
-	mem = fchan->maxreqs * NFSD_SLOT_CACHE_SIZE;
+	if (size < NFSD_MIN_HDR_SEQ_SZ)
+		size = NFSD_MIN_HDR_SEQ_SZ;
+	size -= NFSD_MIN_HDR_SEQ_SZ;
+	if (size > NFSD_SLOT_CACHE_SIZE)
+		size = NFSD_SLOT_CACHE_SIZE;
+
+	/* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
+	mem = fchan->maxreqs * size;
+	if (mem > NFSD_MAX_MEM_PER_SESSION) {
+		fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
+		if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+			fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+		mem = fchan->maxreqs * size;
+	}
 
 	spin_lock(&nfsd_drc_lock);
-	if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem)
-		mem = ((nfsd_drc_max_mem - nfsd_drc_mem_used) /
-				NFSD_SLOT_CACHE_SIZE) * NFSD_SLOT_CACHE_SIZE;
+	/* bound the total session drc memory ussage */
+	if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
+		fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
+		mem = fchan->maxreqs * size;
+	}
 	nfsd_drc_mem_used += mem;
 	spin_unlock(&nfsd_drc_lock);
 
-	fchan->maxreqs = mem / NFSD_SLOT_CACHE_SIZE;
 	if (fchan->maxreqs == 0)
 		return nfserr_resource;
+
+	fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
 	return 0;
 }
 
@@ -466,9 +496,6 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
 		fchan->maxresp_sz = maxcount;
 	session_fchan->maxresp_sz = fchan->maxresp_sz;
 
-	session_fchan->maxresp_cached = NFSD_SLOT_CACHE_SIZE;
-	fchan->maxresp_cached = session_fchan->maxresp_cached;
-
 	/* Use the client's maxops if possible */
 	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
 		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
@@ -478,9 +505,12 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
 	 * recover pages from existing sessions. For now fail session
 	 * creation.
 	 */
-	status = set_forechannel_maxreqs(fchan);
+	status = set_forechannel_drc_size(fchan);
 
+	session_fchan->maxresp_cached = fchan->maxresp_cached;
 	session_fchan->maxreqs = fchan->maxreqs;
+
+	dprintk("%s status %d\n", __func__, status);
 	return status;
 }
 
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index fb0c404c7c5c..ff0b771efde6 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -92,13 +92,17 @@ struct nfs4_cb_conn {
 	struct rpc_cred	*	cb_cred;
 };
 
-/* Maximum number of slots per session. 128 is useful for long haul TCP */
-#define NFSD_MAX_SLOTS_PER_SESSION	128
+/* Maximum number of slots per session. 160 is useful for long haul TCP */
+#define NFSD_MAX_SLOTS_PER_SESSION     160
 /* Maximum number of pages per slot cache entry */
 #define NFSD_PAGES_PER_SLOT	1
 #define NFSD_SLOT_CACHE_SIZE		PAGE_SIZE
 /* Maximum number of operations per session compound */
 #define NFSD_MAX_OPS_PER_COMPOUND	16
+/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
+#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION	32
+#define NFSD_MAX_MEM_PER_SESSION  \
+		(NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
 
 struct nfsd4_cache_entry {
 	__be32		ce_status;
-- 
cgit v1.2.3-71-gd317


From 557ce2646e775f6bda734dd92b10d4780874b9c7 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 28 Aug 2009 08:45:04 -0400
Subject: nfsd41: replace page based DRC with buffer based DRC

Use NFSD_SLOT_CACHE_SIZE size buffers for sessions DRC instead of holding nfsd
pages in cache.

Connectathon testing has shown that 1024 bytes for encoded compound operation
responses past the sequence operation is sufficient, 512 bytes is a little too
small. Set NFSD_SLOT_CACHE_SIZE to 1024.

Allocate memory for the session DRC in the CREATE_SESSION operation
to guarantee that the memory resource is available for caching responses.
Allocate each slot individually in preparation for slot table size negotiation.

Remove struct nfsd4_cache_entry and helper functions for the old page-based
DRC.

The iov_len calculation in nfs4svc_encode_compoundres is now always
correct.  Replay is now done in nfsd4_sequence under the state lock, so
the session ref count is only bumped on non-replay. Clean up the
nfs4svc_encode_compoundres session logic.

The nfsd4_compound_state statp pointer is also not used.
Remove nfsd4_set_statp().

Move useful nfsd4_cache_entry fields into nfsd4_slot.

Signed-off-by: Andy Adamson <andros@netapp.com
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c        | 211 +++++++++++++--------------------------------
 fs/nfsd/nfs4xdr.c          |  17 ++--
 fs/nfsd/nfssvc.c           |   4 -
 include/linux/nfsd/state.h |  27 +++---
 include/linux/nfsd/xdr4.h  |   5 +-
 5 files changed, 79 insertions(+), 185 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c9a45f49019d..46e9ac526872 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -514,12 +514,23 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
 	return status;
 }
 
+static void
+free_session_slots(struct nfsd4_session *ses)
+{
+	int i;
+
+	for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+		kfree(ses->se_slots[i]);
+}
+
 static int
 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 		   struct nfsd4_create_session *cses)
 {
 	struct nfsd4_session *new, tmp;
-	int idx, status = nfserr_serverfault, slotsize;
+	struct nfsd4_slot *sp;
+	int idx, slotsize, cachesize, i;
+	int status;
 
 	memset(&tmp, 0, sizeof(tmp));
 
@@ -530,14 +541,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 	if (status)
 		goto out;
 
-	/* allocate struct nfsd4_session and slot table in one piece */
-	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot);
+	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
+		     + sizeof(struct nfsd4_session) > PAGE_SIZE);
+
+	status = nfserr_serverfault;
+	/* allocate struct nfsd4_session and slot table pointers in one piece */
+	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
 	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
 	if (!new)
 		goto out;
 
 	memcpy(new, &tmp, sizeof(*new));
 
+	/* allocate each struct nfsd4_slot and data cache in one piece */
+	cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+	for (i = 0; i < new->se_fchannel.maxreqs; i++) {
+		sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
+		if (!sp)
+			goto out_free;
+		new->se_slots[i] = sp;
+	}
+
 	new->se_client = clp;
 	gen_sessionid(new);
 	idx = hash_sessionid(&new->se_sessionid);
@@ -554,6 +578,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 	status = nfs_ok;
 out:
 	return status;
+out_free:
+	free_session_slots(new);
+	kfree(new);
+	goto out;
 }
 
 /* caller must hold sessionid_lock */
@@ -596,22 +624,16 @@ release_session(struct nfsd4_session *ses)
 	nfsd4_put_session(ses);
 }
 
-static void nfsd4_release_respages(struct page **respages, short resused);
-
 void
 free_session(struct kref *kref)
 {
 	struct nfsd4_session *ses;
-	int i;
 
 	ses = container_of(kref, struct nfsd4_session, se_ref);
-	for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
-		struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
-		nfsd4_release_respages(e->ce_respages, e->ce_resused);
-	}
 	spin_lock(&nfsd_drc_lock);
 	nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE;
 	spin_unlock(&nfsd_drc_lock);
+	free_session_slots(ses);
 	kfree(ses);
 }
 
@@ -968,116 +990,31 @@ out_err:
 	return;
 }
 
-void
-nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
-{
-	struct nfsd4_compoundres *resp = rqstp->rq_resp;
-
-	resp->cstate.statp = statp;
-}
-
-/*
- * Dereference the result pages.
- */
-static void
-nfsd4_release_respages(struct page **respages, short resused)
-{
-	int i;
-
-	dprintk("--> %s\n", __func__);
-	for (i = 0; i < resused; i++) {
-		if (!respages[i])
-			continue;
-		put_page(respages[i]);
-		respages[i] = NULL;
-	}
-}
-
-static void
-nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
-{
-	int i;
-
-	for (i = 0; i < count; i++) {
-		topages[i] = frompages[i];
-		if (!topages[i])
-			continue;
-		get_page(topages[i]);
-	}
-}
-
 /*
- * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
- * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
- * length of the XDR response is less than se_fmaxresp_cached
- * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a
- * of the reply (e.g. readdir).
- *
- * Store the base and length of the rq_req.head[0] page
- * of the NFSv4.1 data, just past the rpc header.
+ * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
  */
 void
 nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
 {
-	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
-	struct svc_rqst *rqstp = resp->rqstp;
-	struct kvec *resv = &rqstp->rq_res.head[0];
-
-	dprintk("--> %s entry %p\n", __func__, entry);
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	unsigned int base;
 
-	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
-	entry->ce_opcnt = resp->opcnt;
-	entry->ce_status = resp->cstate.status;
+	dprintk("--> %s slot %p\n", __func__, slot);
 
-	/*
-	 * Don't need a page to cache just the sequence operation - the slot
-	 * does this for us!
-	 */
+	slot->sl_opcnt = resp->opcnt;
+	slot->sl_status = resp->cstate.status;
 
 	if (nfsd4_not_cached(resp)) {
-		entry->ce_resused = 0;
-		entry->ce_rpchdrlen = 0;
-		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
-			resp->cstate.slot->sl_cache_entry.ce_cachethis);
+		slot->sl_datalen = 0;
 		return;
 	}
-	entry->ce_resused = rqstp->rq_resused;
-	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
-		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
-	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
-			 entry->ce_resused);
-	entry->ce_datav.iov_base = resp->cstate.statp;
-	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
-				(char *)page_address(rqstp->rq_respages[0]));
-	/* Current request rpc header length*/
-	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
-				(char *)page_address(rqstp->rq_respages[0]);
-}
-
-/*
- * We keep the rpc header, but take the nfs reply from the replycache.
- */
-static int
-nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
-			struct nfsd4_cache_entry *entry)
-{
-	struct svc_rqst *rqstp = resp->rqstp;
-	struct kvec *resv = &resp->rqstp->rq_res.head[0];
-	int len;
-
-	/* Current request rpc header length*/
-	len = (char *)resp->cstate.statp -
-			(char *)page_address(rqstp->rq_respages[0]);
-	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
-		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
-			entry->ce_datav.iov_len);
-		return 0;
-	}
-	/* copy the cached reply nfsd data past the current rpc header */
-	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
-		entry->ce_datav.iov_len);
-	resv->iov_len = len + entry->ce_datav.iov_len;
-	return 1;
+	slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
+	base = (char *)resp->cstate.datap -
+					(char *)resp->xbuf->head[0].iov_base;
+	if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
+				    slot->sl_datalen))
+		WARN("%s: sessions DRC could not cache compound\n", __func__);
+	return;
 }
 
 /*
@@ -1095,14 +1032,14 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
 	struct nfsd4_slot *slot = resp->cstate.slot;
 
 	dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__,
-		resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
+		resp->opcnt, resp->cstate.slot->sl_cachethis);
 
 	/* Encode the replayed sequence operation */
 	op = &args->ops[resp->opcnt - 1];
 	nfsd4_encode_operation(resp, op);
 
 	/* Return nfserr_retry_uncached_rep in next operation. */
-	if (args->opcnt > 1 && slot->sl_cache_entry.ce_cachethis == 0) {
+	if (args->opcnt > 1 && slot->sl_cachethis == 0) {
 		op = &args->ops[resp->opcnt++];
 		op->status = nfserr_retry_uncached_rep;
 		nfsd4_encode_operation(resp, op);
@@ -1111,57 +1048,29 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
 }
 
 /*
- * Keep the first page of the replay. Copy the NFSv4.1 data from the first
- * cached page.  Replace any futher replay pages from the cache.
+ * The sequence operation is not cached because we can use the slot and
+ * session values.
  */
 __be32
 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 			 struct nfsd4_sequence *seq)
 {
-	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	struct nfsd4_slot *slot = resp->cstate.slot;
 	__be32 status;
 
-	dprintk("--> %s entry %p\n", __func__, entry);
-
-	/*
-	 * If this is just the sequence operation, we did not keep
-	 * a page in the cache entry because we can just use the
-	 * slot info stored in struct nfsd4_sequence that was checked
-	 * against the slot in nfsd4_sequence().
-	 *
-	 * This occurs when seq->cachethis is FALSE, or when the client
-	 * session inactivity timer fires and a solo sequence operation
-	 * is sent (lease renewal).
-	 */
+	dprintk("--> %s slot %p\n", __func__, slot);
 
 	/* Either returns 0 or nfserr_retry_uncached */
 	status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
 	if (status == nfserr_retry_uncached_rep)
 		return status;
 
-	if (!nfsd41_copy_replay_data(resp, entry)) {
-		/*
-		 * Not enough room to use the replay rpc header, send the
-		 * cached header. Release all the allocated result pages.
-		 */
-		svc_free_res_pages(resp->rqstp);
-		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
-			entry->ce_resused);
-	} else {
-		/* Release all but the first allocated result page */
-
-		resp->rqstp->rq_resused--;
-		svc_free_res_pages(resp->rqstp);
-
-		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
-				 &entry->ce_respages[1],
-				 entry->ce_resused - 1);
-	}
+	/* The sequence operation has been encoded, cstate->datap set. */
+	memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
 
-	resp->rqstp->rq_resused = entry->ce_resused;
-	resp->opcnt = entry->ce_opcnt;
-	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
-	status = entry->ce_status;
+	resp->opcnt = slot->sl_opcnt;
+	resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
+	status = slot->sl_status;
 
 	return status;
 }
@@ -1493,7 +1402,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (seq->slotid >= session->se_fchannel.maxreqs)
 		goto out;
 
-	slot = &session->se_slots[seq->slotid];
+	slot = session->se_slots[seq->slotid];
 	dprintk("%s: slotid %d\n", __func__, seq->slotid);
 
 	/* We do not negotiate the number of slots yet, so set the
@@ -1506,7 +1415,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 		cstate->slot = slot;
 		cstate->session = session;
 		/* Return the cached reply status and set cstate->status
-		 * for nfsd4_svc_encode_compoundres processing */
+		 * for nfsd4_proc_compound processing */
 		status = nfsd4_replay_cache_entry(resp, seq);
 		cstate->status = nfserr_replay_cache;
 		goto out;
@@ -1517,7 +1426,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	/* Success! bump slot seqid */
 	slot->sl_inuse = true;
 	slot->sl_seqid = seq->seqid;
-	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
+	slot->sl_cachethis = seq->cachethis;
 
 	cstate->slot = slot;
 	cstate->session = session;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 20c5e3db0660..00ed16a18497 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3057,6 +3057,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
 	WRITE32(0);
 
 	ADJUST_ARGS();
+	resp->cstate.datap = p; /* DRC cache data pointer */
 	return 0;
 }
 
@@ -3159,7 +3160,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
 		return status;
 
 	session = resp->cstate.session;
-	if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
+	if (session == NULL || slot->sl_cachethis == 0)
 		return status;
 
 	if (resp->opcnt >= args->opcnt)
@@ -3284,6 +3285,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	/*
 	 * All that remains is to write the tag and operation count...
 	 */
+	struct nfsd4_compound_state *cs = &resp->cstate;
 	struct kvec *iov;
 	p = resp->tagp;
 	*p++ = htonl(resp->taglen);
@@ -3297,15 +3299,10 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 		iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
-	if (nfsd4_has_session(&resp->cstate)) {
-		if (resp->cstate.status == nfserr_replay_cache &&
-				!nfsd4_not_cached(resp)) {
-			iov->iov_len = resp->cstate.iovlen;
-		} else {
-			nfsd4_store_cache_entry(resp);
-			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
-			resp->cstate.slot->sl_inuse = 0;
-		}
+	if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
+		nfsd4_store_cache_entry(resp);
+		dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+		resp->cstate.slot->sl_inuse = false;
 		nfsd4_put_session(resp->cstate.session);
 	}
 	return 1;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 675d395c4ab6..4472449c0937 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -577,10 +577,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 		+ rqstp->rq_res.head[0].iov_len;
 	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
 
-	/* NFSv4.1 DRC requires statp */
-	if (rqstp->rq_vers == 4)
-		nfsd4_set_statp(rqstp, statp);
-
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 	nfserr = map_new_errors(rqstp->rq_vers, nfserr);
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index ff0b771efde6..70ef5f4abbbc 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -94,30 +94,23 @@ struct nfs4_cb_conn {
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
 #define NFSD_MAX_SLOTS_PER_SESSION     160
-/* Maximum number of pages per slot cache entry */
-#define NFSD_PAGES_PER_SLOT	1
-#define NFSD_SLOT_CACHE_SIZE		PAGE_SIZE
 /* Maximum number of operations per session compound */
 #define NFSD_MAX_OPS_PER_COMPOUND	16
+/* Maximum  session per slot cache size */
+#define NFSD_SLOT_CACHE_SIZE		1024
 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
 #define NFSD_CACHE_SIZE_SLOTS_PER_SESSION	32
 #define NFSD_MAX_MEM_PER_SESSION  \
 		(NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
 
-struct nfsd4_cache_entry {
-	__be32		ce_status;
-	struct kvec	ce_datav; /* encoded NFSv4.1 data in rq_res.head[0] */
-	struct page	*ce_respages[NFSD_PAGES_PER_SLOT + 1];
-	int		ce_cachethis;
-	short		ce_resused;
-	int		ce_opcnt;
-	int		ce_rpchdrlen;
-};
-
 struct nfsd4_slot {
-	bool				sl_inuse;
-	u32				sl_seqid;
-	struct nfsd4_cache_entry	sl_cache_entry;
+	bool	sl_inuse;
+	bool	sl_cachethis;
+	u16	sl_opcnt;
+	u32	sl_seqid;
+	__be32	sl_status;
+	u32	sl_datalen;
+	char	sl_data[];
 };
 
 struct nfsd4_channel_attrs {
@@ -159,7 +152,7 @@ struct nfsd4_session {
 	struct nfs4_sessionid	se_sessionid;
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
-	struct nfsd4_slot	se_slots[];	/* forward channel slots */
+	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
 static inline void
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index 3f716607c86d..73164c2b3d29 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -51,7 +51,7 @@ struct nfsd4_compound_state {
 	/* For sessions DRC */
 	struct nfsd4_session	*session;
 	struct nfsd4_slot	*slot;
-	__be32			*statp;
+	__be32			*datap;
 	size_t			iovlen;
 	u32			minorversion;
 	u32			status;
@@ -472,8 +472,7 @@ static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
 
 static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
 {
-	return !resp->cstate.slot->sl_cache_entry.ce_cachethis ||
-			nfsd4_is_solo_sequence(resp);
+	return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp);
 }
 
 #define NFS4_SVC_XDRSIZE		sizeof(struct nfsd4_compoundargs)
-- 
cgit v1.2.3-71-gd317


From cede3930f0ca6fef353fa01306c72a01420bd45e Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 31 Aug 2009 21:34:36 +0000
Subject: powerpc: Fix some late PowerMac G5 with PCIe ATI graphics

A misconfiguration by the firmware of the U4 PCIe bridge on PowerMac G5
with the U4 bridge (latest generations, may also affect the iMac G5
"iSight") is causing us to re-assign the PCI BARs of the video card,
which can get it out of sync with the firmware, thus breaking offb.

This works around it by fixing up the bridge configuration properly
at boot time. It also fixes a bug where the firmware provides us with
an incorrect set of accessible regions in the device-tree.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/powermac/pci.c | 61 +++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h               |  1 +
 2 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
index 04cdd32624d4..e81403b245b5 100644
--- a/arch/powerpc/platforms/powermac/pci.c
+++ b/arch/powerpc/platforms/powermac/pci.c
@@ -1286,3 +1286,64 @@ static void fixup_k2_sata(struct pci_dev* dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SERVERWORKS, 0x0240, fixup_k2_sata);
 
+/*
+ * On U4 (aka CPC945) the PCIe root complex "P2P" bridge resource ranges aren't
+ * configured by the firmware. The bridge itself seems to ignore them but it
+ * causes problems with Linux which then re-assigns devices below the bridge,
+ * thus changing addresses of those devices from what was in the device-tree,
+ * which sucks when those are video cards using offb
+ *
+ * We could just mark it transparent but I prefer fixing up the resources to
+ * properly show what's going on here, as I have some doubts about having them
+ * badly configured potentially being an issue for DMA.
+ *
+ * We leave PIO alone, it seems to be fine
+ *
+ * Oh and there's another funny bug. The OF properties advertize the region
+ * 0xf1000000..0xf1ffffff as being forwarded as memory space. But that's
+ * actually not true, this region is the memory mapped config space. So we
+ * also need to filter it out or we'll map things in the wrong place.
+ */
+static void fixup_u4_pcie(struct pci_dev* dev)
+{
+	struct pci_controller *host = pci_bus_to_host(dev->bus);
+	struct resource *region = NULL;
+	u32 reg;
+	int i;
+
+	/* Only do that on PowerMac */
+	if (!machine_is(powermac))
+		return;
+
+	/* Find the largest MMIO region */
+	for (i = 0; i < 3; i++) {
+		struct resource *r = &host->mem_resources[i];
+		if (!(r->flags & IORESOURCE_MEM))
+			continue;
+		/* Skip the 0xf0xxxxxx..f2xxxxxx regions, we know they
+		 * are reserved by HW for other things
+		 */
+		if (r->start >= 0xf0000000 && r->start < 0xf3000000)
+			continue;
+		if (!region || (r->end - r->start) >
+		    (region->end - region->start))
+			region = r;
+	}
+	/* Nothing found, bail */
+	if (region == 0)
+		return;
+
+	/* Print things out */
+	printk(KERN_INFO "PCI: Fixup U4 PCIe bridge range: %pR\n", region);
+
+	/* Fixup bridge config space. We know it's a Mac, resource aren't
+	 * offset so let's just blast them as-is. We also know that they
+	 * fit in 32 bits
+	 */
+	reg = ((region->start >> 16) & 0xfff0) | (region->end & 0xfff00000);
+	pci_write_config_dword(dev, PCI_MEMORY_BASE, reg);
+	pci_write_config_dword(dev, PCI_PREF_BASE_UPPER32, 0);
+	pci_write_config_dword(dev, PCI_PREF_LIMIT_UPPER32, 0);
+	pci_write_config_dword(dev, PCI_PREF_MEMORY_BASE, 0);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_APPLE, PCI_DEVICE_ID_APPLE_U4_PCIE, fixup_u4_pcie);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 73b46b6b904f..c86bb6e3dfcc 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -877,6 +877,7 @@
 #define PCI_DEVICE_ID_APPLE_SH_SUNGEM   0x0051
 #define PCI_DEVICE_ID_APPLE_U3L_AGP	0x0058
 #define PCI_DEVICE_ID_APPLE_U3H_AGP	0x0059
+#define PCI_DEVICE_ID_APPLE_U4_PCIE	0x005b
 #define PCI_DEVICE_ID_APPLE_IPID2_AGP	0x0066
 #define PCI_DEVICE_ID_APPLE_IPID2_ATA	0x0069
 #define PCI_DEVICE_ID_APPLE_IPID2_FW	0x006a
-- 
cgit v1.2.3-71-gd317


From 652696efce135559b98ee5a3d7899295e8d553fa Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kmpark@infradead.org>
Date: Wed, 24 Jun 2009 12:03:51 +0900
Subject: mtd: OneNAND: 4-bit ECC status macros

Define ECC status for 4-bit ECC status

Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/onenand_regs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index 86a6bbef6465..acadbf53a69f 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -207,6 +207,9 @@
 #define ONENAND_ECC_2BIT		(1 << 1)
 #define ONENAND_ECC_2BIT_ALL		(0xAAAA)
 #define FLEXONENAND_UNCORRECTABLE_ERROR	(0x1010)
+#define ONENAND_ECC_3BIT		(1 << 2)
+#define ONENAND_ECC_4BIT		(1 << 3)
+#define ONENAND_ECC_4BIT_UNCORRECTABLE	(0x1010)
 
 /*
  * One-Time Programmable (OTP)
-- 
cgit v1.2.3-71-gd317


From b8bdc1d0cfc488ac0d94724639f9a61b0a5a1d40 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut@gmail.com>
Date: Mon, 31 Aug 2009 06:20:12 +0200
Subject: wm97xx_battery: Use platform_data

This patch converts the wm97xx-battery driver to use platform_data
supplied by ac97 bus.

Signed-off-by: Marek Vasut <marek.vasut@gmail.com>
Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
---
 drivers/power/wm97xx_battery.c | 32 +++++++++++++++++++++++++++-----
 include/linux/wm97xx.h         | 18 ++++++++++++++++++
 include/linux/wm97xx_batt.h    | 18 ++++--------------
 3 files changed, 49 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/wm97xx_battery.c b/drivers/power/wm97xx_battery.c
index 8bde92126d34..14ebd960ebe5 100644
--- a/drivers/power/wm97xx_battery.c
+++ b/drivers/power/wm97xx_battery.c
@@ -22,17 +22,19 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/gpio.h>
-#include <linux/wm97xx_batt.h>
 
 static DEFINE_MUTEX(bat_lock);
 static struct work_struct bat_work;
 struct mutex work_lock;
 static int bat_status = POWER_SUPPLY_STATUS_UNKNOWN;
-static struct wm97xx_batt_info *pdata;
+static struct wm97xx_batt_info *gpdata;
 static enum power_supply_property *prop;
 
 static unsigned long wm97xx_read_bat(struct power_supply *bat_ps)
 {
+	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
+
 	return wm97xx_read_aux_adc(bat_ps->dev->parent->driver_data,
 					pdata->batt_aux) * pdata->batt_mult /
 					pdata->batt_div;
@@ -40,6 +42,9 @@ static unsigned long wm97xx_read_bat(struct power_supply *bat_ps)
 
 static unsigned long wm97xx_read_temp(struct power_supply *bat_ps)
 {
+	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
+
 	return wm97xx_read_aux_adc(bat_ps->dev->parent->driver_data,
 					pdata->temp_aux) * pdata->temp_mult /
 					pdata->temp_div;
@@ -49,6 +54,9 @@ static int wm97xx_bat_get_property(struct power_supply *bat_ps,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
 {
+	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
+
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
 		val->intval = bat_status;
@@ -97,6 +105,8 @@ static void wm97xx_bat_external_power_changed(struct power_supply *bat_ps)
 static void wm97xx_bat_update(struct power_supply *bat_ps)
 {
 	int old_status = bat_status;
+	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
 
 	mutex_lock(&work_lock);
 
@@ -149,6 +159,15 @@ static int __devinit wm97xx_bat_probe(struct platform_device *dev)
 	int ret = 0;
 	int props = 1;	/* POWER_SUPPLY_PROP_PRESENT */
 	int i = 0;
+	struct wm97xx_pdata *wmdata = dev->dev.platform_data;
+	struct wm97xx_batt_pdata *pdata;
+
+	if (gpdata) {
+		dev_err(&dev->dev, "Do not pass platform_data through "
+			"wm97xx_bat_set_pdata!\n");
+		return -EINVAL;
+	} else
+		pdata = wmdata->batt_pdata;
 
 	if (dev->id != -1)
 		return -EINVAL;
@@ -156,7 +175,7 @@ static int __devinit wm97xx_bat_probe(struct platform_device *dev)
 	mutex_init(&work_lock);
 
 	if (!pdata) {
-		dev_err(&dev->dev, "Please use wm97xx_bat_set_pdata\n");
+		dev_err(&dev->dev, "No platform_data supplied\n");
 		return -EINVAL;
 	}
 
@@ -229,6 +248,9 @@ err:
 
 static int __devexit wm97xx_bat_remove(struct platform_device *dev)
 {
+	struct wm97xx_pdata *wmdata = dev->dev.platform_data;
+	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
+
 	if (pdata && pdata->charge_gpio && pdata->charge_gpio >= 0)
 		gpio_free(pdata->charge_gpio);
 	flush_scheduled_work();
@@ -258,9 +280,9 @@ static void __exit wm97xx_bat_exit(void)
 	platform_driver_unregister(&wm97xx_bat_driver);
 }
 
-void __init wm97xx_bat_set_pdata(struct wm97xx_batt_info *data)
+void wm97xx_bat_set_pdata(struct wm97xx_batt_info *data)
 {
-	pdata = data;
+	gpdata = data;
 }
 EXPORT_SYMBOL_GPL(wm97xx_bat_set_pdata);
 
diff --git a/include/linux/wm97xx.h b/include/linux/wm97xx.h
index 6f69968eab24..b2c2297844e3 100644
--- a/include/linux/wm97xx.h
+++ b/include/linux/wm97xx.h
@@ -286,6 +286,24 @@ struct wm97xx {
 	u16 suspend_mode;               /* PRP in suspend mode */
 };
 
+struct wm97xx_batt_pdata {
+	int	batt_aux;
+	int	temp_aux;
+	int	charge_gpio;
+	int	min_voltage;
+	int	max_voltage;
+	int	batt_div;
+	int	batt_mult;
+	int	temp_div;
+	int	temp_mult;
+	int	batt_tech;
+	char	*batt_name;
+};
+
+struct wm97xx_pdata {
+	struct wm97xx_batt_pdata	*batt_pdata;	/* battery data */
+};
+
 /*
  * Codec GPIO access (not supported on WM9705)
  * This can be used to set/get codec GPIO and Virtual GPIO status.
diff --git a/include/linux/wm97xx_batt.h b/include/linux/wm97xx_batt.h
index 9681d1ab0e4f..a1d6419c2ff8 100644
--- a/include/linux/wm97xx_batt.h
+++ b/include/linux/wm97xx_batt.h
@@ -3,22 +3,12 @@
 
 #include <linux/wm97xx.h>
 
-struct wm97xx_batt_info {
-	int	batt_aux;
-	int	temp_aux;
-	int	charge_gpio;
-	int	min_voltage;
-	int	max_voltage;
-	int	batt_div;
-	int	batt_mult;
-	int	temp_div;
-	int	temp_mult;
-	int	batt_tech;
-	char	*batt_name;
-};
+#warning This file will be removed soon, use wm97xx.h instead!
+
+#define wm97xx_batt_info wm97xx_batt_pdata
 
 #ifdef CONFIG_BATTERY_WM97XX
-void __init wm97xx_bat_set_pdata(struct wm97xx_batt_info *data);
+void wm97xx_bat_set_pdata(struct wm97xx_batt_info *data);
 #else
 static inline void wm97xx_bat_set_pdata(struct wm97xx_batt_info *data) {}
 #endif
-- 
cgit v1.2.3-71-gd317


From 3961f7c3cf247eee5df7fabadc7a40f2deeb98f3 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 10 Aug 2009 17:43:53 +0100
Subject: power_supply: Add driver for the PMU on WM831x PMICs

The WM831x PMICs provide power path management from three sources:
a wall supply, USB and a battery with integrated charger. They also
provide an additional backup supply with integrated for maintaining
always on functionality such as the RTC and monitoring of power
switches.

After some initial configuration at startup the device operates
autonomously, the driver simply provides reporting of the current
state.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
---
 drivers/power/Kconfig          |   7 +
 drivers/power/Makefile         |   1 +
 drivers/power/wm831x_power.c   | 779 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/pmu.h | 189 ++++++++++
 4 files changed, 976 insertions(+)
 create mode 100644 drivers/power/wm831x_power.c
 create mode 100644 include/linux/mfd/wm831x/pmu.h

(limited to 'include/linux')

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index bdbc4f73fcdc..cea6cef27e89 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -29,6 +29,13 @@ config APM_POWER
 	  Say Y here to enable support APM status emulation using
 	  battery class devices.
 
+config WM831X_POWER
+	tristate "WM831X PMU support"
+	depends on MFD_WM831X
+	help
+	  Say Y here to enable support for the power management unit
+	  provided by Wolfson Microelectronics WM831x PMICs.
+
 config WM8350_POWER
         tristate "WM8350 PMU support"
         depends on MFD_WM8350
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index 380d17c9ae29..b96f29d91c28 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_POWER_SUPPLY)	+= power_supply.o
 
 obj-$(CONFIG_PDA_POWER)		+= pda_power.o
 obj-$(CONFIG_APM_POWER)		+= apm_power.o
+obj-$(CONFIG_WM831X_POWER)	+= wm831x_power.o
 obj-$(CONFIG_WM8350_POWER)	+= wm8350_power.o
 
 obj-$(CONFIG_BATTERY_DS2760)	+= ds2760_battery.o
diff --git a/drivers/power/wm831x_power.c b/drivers/power/wm831x_power.c
new file mode 100644
index 000000000000..2a4c8b0b829c
--- /dev/null
+++ b/drivers/power/wm831x_power.c
@@ -0,0 +1,779 @@
+/*
+ * PMU driver for Wolfson Microelectronics wm831x PMICs
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/auxadc.h>
+#include <linux/mfd/wm831x/pmu.h>
+#include <linux/mfd/wm831x/pdata.h>
+
+struct wm831x_power {
+	struct wm831x *wm831x;
+	struct power_supply wall;
+	struct power_supply backup;
+	struct power_supply usb;
+	struct power_supply battery;
+};
+
+static int wm831x_power_check_online(struct wm831x *wm831x, int supply,
+				     union power_supply_propval *val)
+{
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, WM831X_SYSTEM_STATUS);
+	if (ret < 0)
+		return ret;
+
+	if (ret & supply)
+		val->intval = 1;
+	else
+		val->intval = 0;
+
+	return 0;
+}
+
+static int wm831x_power_read_voltage(struct wm831x *wm831x,
+				     enum wm831x_auxadc src,
+				     union power_supply_propval *val)
+{
+	int ret;
+
+	ret = wm831x_auxadc_read_uv(wm831x, src);
+	if (ret >= 0)
+		val->intval = ret;
+
+	return ret;
+}
+
+/*********************************************************************
+ *		WALL Power
+ *********************************************************************/
+static int wm831x_wall_get_prop(struct power_supply *psy,
+				enum power_supply_property psp,
+				union power_supply_propval *val)
+{
+	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev->parent);
+	struct wm831x *wm831x = wm831x_power->wm831x;
+	int ret = 0;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_ONLINE:
+		ret = wm831x_power_check_online(wm831x, WM831X_PWR_WALL, val);
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		ret = wm831x_power_read_voltage(wm831x, WM831X_AUX_WALL, val);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static enum power_supply_property wm831x_wall_props[] = {
+	POWER_SUPPLY_PROP_ONLINE,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+};
+
+/*********************************************************************
+ *		USB Power
+ *********************************************************************/
+static int wm831x_usb_get_prop(struct power_supply *psy,
+			       enum power_supply_property psp,
+			       union power_supply_propval *val)
+{
+	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev->parent);
+	struct wm831x *wm831x = wm831x_power->wm831x;
+	int ret = 0;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_ONLINE:
+		ret = wm831x_power_check_online(wm831x, WM831X_PWR_USB, val);
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		ret = wm831x_power_read_voltage(wm831x, WM831X_AUX_USB, val);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static enum power_supply_property wm831x_usb_props[] = {
+	POWER_SUPPLY_PROP_ONLINE,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+};
+
+/*********************************************************************
+ *		Battery properties
+ *********************************************************************/
+
+struct chg_map {
+	int val;
+	int reg_val;
+};
+
+static struct chg_map trickle_ilims[] = {
+	{  50, 0 << WM831X_CHG_TRKL_ILIM_SHIFT },
+	{ 100, 1 << WM831X_CHG_TRKL_ILIM_SHIFT },
+	{ 150, 2 << WM831X_CHG_TRKL_ILIM_SHIFT },
+	{ 200, 3 << WM831X_CHG_TRKL_ILIM_SHIFT },
+};
+
+static struct chg_map vsels[] = {
+	{ 4050, 0 << WM831X_CHG_VSEL_SHIFT },
+	{ 4100, 1 << WM831X_CHG_VSEL_SHIFT },
+	{ 4150, 2 << WM831X_CHG_VSEL_SHIFT },
+	{ 4200, 3 << WM831X_CHG_VSEL_SHIFT },
+};
+
+static struct chg_map fast_ilims[] = {
+	{    0,  0 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{   50,  1 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  100,  2 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  150,  3 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  200,  4 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  250,  5 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  300,  6 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  350,  7 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  400,  8 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  450,  9 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  500, 10 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  600, 11 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  700, 12 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  800, 13 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{  900, 14 << WM831X_CHG_FAST_ILIM_SHIFT },
+	{ 1000, 15 << WM831X_CHG_FAST_ILIM_SHIFT },
+};
+
+static struct chg_map eoc_iterms[] = {
+	{ 20, 0 << WM831X_CHG_ITERM_SHIFT },
+	{ 30, 1 << WM831X_CHG_ITERM_SHIFT },
+	{ 40, 2 << WM831X_CHG_ITERM_SHIFT },
+	{ 50, 3 << WM831X_CHG_ITERM_SHIFT },
+	{ 60, 4 << WM831X_CHG_ITERM_SHIFT },
+	{ 70, 5 << WM831X_CHG_ITERM_SHIFT },
+	{ 80, 6 << WM831X_CHG_ITERM_SHIFT },
+	{ 90, 7 << WM831X_CHG_ITERM_SHIFT },
+};
+
+static struct chg_map chg_times[] = {
+	{  60,  0 << WM831X_CHG_TIME_SHIFT },
+	{  90,  1 << WM831X_CHG_TIME_SHIFT },
+	{ 120,  2 << WM831X_CHG_TIME_SHIFT },
+	{ 150,  3 << WM831X_CHG_TIME_SHIFT },
+	{ 180,  4 << WM831X_CHG_TIME_SHIFT },
+	{ 210,  5 << WM831X_CHG_TIME_SHIFT },
+	{ 240,  6 << WM831X_CHG_TIME_SHIFT },
+	{ 270,  7 << WM831X_CHG_TIME_SHIFT },
+	{ 300,  8 << WM831X_CHG_TIME_SHIFT },
+	{ 330,  9 << WM831X_CHG_TIME_SHIFT },
+	{ 360, 10 << WM831X_CHG_TIME_SHIFT },
+	{ 390, 11 << WM831X_CHG_TIME_SHIFT },
+	{ 420, 12 << WM831X_CHG_TIME_SHIFT },
+	{ 450, 13 << WM831X_CHG_TIME_SHIFT },
+	{ 480, 14 << WM831X_CHG_TIME_SHIFT },
+	{ 510, 15 << WM831X_CHG_TIME_SHIFT },
+};
+
+static void wm831x_battey_apply_config(struct wm831x *wm831x,
+				       struct chg_map *map, int count, int val,
+				       int *reg, const char *name,
+				       const char *units)
+{
+	int i;
+
+	for (i = 0; i < count; i++)
+		if (val == map[i].val)
+			break;
+	if (i == count) {
+		dev_err(wm831x->dev, "Invalid %s %d%s\n",
+			name, val, units);
+	} else {
+		*reg |= map[i].reg_val;
+		dev_dbg(wm831x->dev, "Set %s of %d%s\n", name, val, units);
+	}
+}
+
+static void wm831x_config_battery(struct wm831x *wm831x)
+{
+	struct wm831x_pdata *wm831x_pdata = wm831x->dev->platform_data;
+	struct wm831x_battery_pdata *pdata;
+	int ret, reg1, reg2;
+
+	if (!wm831x_pdata || !wm831x_pdata->battery) {
+		dev_warn(wm831x->dev,
+			 "No battery charger configuration\n");
+		return;
+	}
+
+	pdata = wm831x_pdata->battery;
+
+	reg1 = 0;
+	reg2 = 0;
+
+	if (!pdata->enable) {
+		dev_info(wm831x->dev, "Battery charger disabled\n");
+		return;
+	}
+
+	reg1 |= WM831X_CHG_ENA;
+	if (pdata->off_mask)
+		reg2 |= WM831X_CHG_OFF_MSK;
+	if (pdata->fast_enable)
+		reg1 |= WM831X_CHG_FAST;
+
+	wm831x_battey_apply_config(wm831x, trickle_ilims,
+				   ARRAY_SIZE(trickle_ilims),
+				   pdata->trickle_ilim, &reg2,
+				   "trickle charge current limit", "mA");
+
+	wm831x_battey_apply_config(wm831x, vsels, ARRAY_SIZE(vsels),
+				   pdata->vsel, &reg2,
+				   "target voltage", "mV");
+
+	wm831x_battey_apply_config(wm831x, fast_ilims, ARRAY_SIZE(fast_ilims),
+				   pdata->fast_ilim, &reg2,
+				   "fast charge current limit", "mA");
+
+	wm831x_battey_apply_config(wm831x, eoc_iterms, ARRAY_SIZE(eoc_iterms),
+				   pdata->eoc_iterm, &reg1,
+				   "end of charge current threshold", "mA");
+
+	wm831x_battey_apply_config(wm831x, chg_times, ARRAY_SIZE(chg_times),
+				   pdata->timeout, &reg2,
+				   "charger timeout", "min");
+
+	ret = wm831x_reg_unlock(wm831x);
+	if (ret != 0) {
+		dev_err(wm831x->dev, "Failed to unlock registers: %d\n", ret);
+		return;
+	}
+
+	ret = wm831x_set_bits(wm831x, WM831X_CHARGER_CONTROL_1,
+			      WM831X_CHG_ENA_MASK |
+			      WM831X_CHG_FAST_MASK |
+			      WM831X_CHG_ITERM_MASK |
+			      WM831X_CHG_ITERM_MASK,
+			      reg1);
+	if (ret != 0)
+		dev_err(wm831x->dev, "Failed to set charger control 1: %d\n",
+			ret);
+
+	ret = wm831x_set_bits(wm831x, WM831X_CHARGER_CONTROL_2,
+			      WM831X_CHG_OFF_MSK |
+			      WM831X_CHG_TIME_MASK |
+			      WM831X_CHG_FAST_ILIM_MASK |
+			      WM831X_CHG_TRKL_ILIM_MASK |
+			      WM831X_CHG_VSEL_MASK,
+			      reg2);
+	if (ret != 0)
+		dev_err(wm831x->dev, "Failed to set charger control 2: %d\n",
+			ret);
+
+	wm831x_reg_lock(wm831x);
+}
+
+static int wm831x_bat_check_status(struct wm831x *wm831x, int *status)
+{
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, WM831X_SYSTEM_STATUS);
+	if (ret < 0)
+		return ret;
+
+	if (ret & WM831X_PWR_SRC_BATT) {
+		*status = POWER_SUPPLY_STATUS_DISCHARGING;
+		return 0;
+	}
+
+	ret = wm831x_reg_read(wm831x, WM831X_CHARGER_STATUS);
+	if (ret < 0)
+		return ret;
+
+	switch (ret & WM831X_CHG_STATE_MASK) {
+	case WM831X_CHG_STATE_OFF:
+		*status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+		break;
+	case WM831X_CHG_STATE_TRICKLE:
+	case WM831X_CHG_STATE_FAST:
+		*status = POWER_SUPPLY_STATUS_CHARGING;
+		break;
+
+	default:
+		*status = POWER_SUPPLY_STATUS_UNKNOWN;
+		break;
+	}
+
+	return 0;
+}
+
+static int wm831x_bat_check_type(struct wm831x *wm831x, int *type)
+{
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, WM831X_CHARGER_STATUS);
+	if (ret < 0)
+		return ret;
+
+	switch (ret & WM831X_CHG_STATE_MASK) {
+	case WM831X_CHG_STATE_TRICKLE:
+	case WM831X_CHG_STATE_TRICKLE_OT:
+		*type = POWER_SUPPLY_CHARGE_TYPE_TRICKLE;
+		break;
+	case WM831X_CHG_STATE_FAST:
+	case WM831X_CHG_STATE_FAST_OT:
+		*type = POWER_SUPPLY_CHARGE_TYPE_FAST;
+		break;
+	default:
+		*type = POWER_SUPPLY_CHARGE_TYPE_NONE;
+		break;
+	}
+
+	return 0;
+}
+
+static int wm831x_bat_check_health(struct wm831x *wm831x, int *health)
+{
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, WM831X_CHARGER_STATUS);
+	if (ret < 0)
+		return ret;
+
+	if (ret & WM831X_BATT_HOT_STS) {
+		*health = POWER_SUPPLY_HEALTH_OVERHEAT;
+		return 0;
+	}
+
+	if (ret & WM831X_BATT_COLD_STS) {
+		*health = POWER_SUPPLY_HEALTH_COLD;
+		return 0;
+	}
+
+	if (ret & WM831X_BATT_OV_STS) {
+		*health = POWER_SUPPLY_HEALTH_OVERVOLTAGE;
+		return 0;
+	}
+
+	switch (ret & WM831X_CHG_STATE_MASK) {
+	case WM831X_CHG_STATE_TRICKLE_OT:
+	case WM831X_CHG_STATE_FAST_OT:
+		*health = POWER_SUPPLY_HEALTH_OVERHEAT;
+		break;
+	case WM831X_CHG_STATE_DEFECTIVE:
+		*health = POWER_SUPPLY_HEALTH_UNSPEC_FAILURE;
+		break;
+	default:
+		*health = POWER_SUPPLY_HEALTH_GOOD;
+		break;
+	}
+
+	return 0;
+}
+
+static int wm831x_bat_get_prop(struct power_supply *psy,
+			       enum power_supply_property psp,
+			       union power_supply_propval *val)
+{
+	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev->parent);
+	struct wm831x *wm831x = wm831x_power->wm831x;
+	int ret = 0;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_STATUS:
+		ret = wm831x_bat_check_status(wm831x, &val->intval);
+		break;
+	case POWER_SUPPLY_PROP_ONLINE:
+		ret = wm831x_power_check_online(wm831x, WM831X_PWR_SRC_BATT,
+						val);
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		ret = wm831x_power_read_voltage(wm831x, WM831X_AUX_BATT, val);
+		break;
+	case POWER_SUPPLY_PROP_HEALTH:
+		ret = wm831x_bat_check_health(wm831x, &val->intval);
+		break;
+	case POWER_SUPPLY_PROP_CHARGE_TYPE:
+		ret = wm831x_bat_check_type(wm831x, &val->intval);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static enum power_supply_property wm831x_bat_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_ONLINE,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_CHARGE_TYPE,
+};
+
+static const char *wm831x_bat_irqs[] = {
+	"BATT HOT",
+	"BATT COLD",
+	"BATT FAIL",
+	"OV",
+	"END",
+	"TO",
+	"MODE",
+	"START",
+};
+
+static irqreturn_t wm831x_bat_irq(int irq, void *data)
+{
+	struct wm831x_power *wm831x_power = data;
+	struct wm831x *wm831x = wm831x_power->wm831x;
+
+	dev_dbg(wm831x->dev, "Battery status changed: %d\n", irq);
+
+	/* The battery charger is autonomous so we don't need to do
+	 * anything except kick user space */
+	power_supply_changed(&wm831x_power->battery);
+
+	return IRQ_HANDLED;
+}
+
+
+/*********************************************************************
+ *		Backup supply properties
+ *********************************************************************/
+
+static void wm831x_config_backup(struct wm831x *wm831x)
+{
+	struct wm831x_pdata *wm831x_pdata = wm831x->dev->platform_data;
+	struct wm831x_backup_pdata *pdata;
+	int ret, reg;
+
+	if (!wm831x_pdata || !wm831x_pdata->backup) {
+		dev_warn(wm831x->dev,
+			 "No backup battery charger configuration\n");
+		return;
+	}
+
+	pdata = wm831x_pdata->backup;
+
+	reg = 0;
+
+	if (pdata->charger_enable)
+		reg |= WM831X_BKUP_CHG_ENA | WM831X_BKUP_BATT_DET_ENA;
+	if (pdata->no_constant_voltage)
+		reg |= WM831X_BKUP_CHG_MODE;
+
+	switch (pdata->vlim) {
+	case 2500:
+		break;
+	case 3100:
+		reg |= WM831X_BKUP_CHG_VLIM;
+		break;
+	default:
+		dev_err(wm831x->dev, "Invalid backup voltage limit %dmV\n",
+			pdata->vlim);
+	}
+
+	switch (pdata->ilim) {
+	case 100:
+		break;
+	case 200:
+		reg |= 1;
+		break;
+	case 300:
+		reg |= 2;
+		break;
+	case 400:
+		reg |= 3;
+		break;
+	default:
+		dev_err(wm831x->dev, "Invalid backup current limit %duA\n",
+			pdata->ilim);
+	}
+
+	ret = wm831x_reg_unlock(wm831x);
+	if (ret != 0) {
+		dev_err(wm831x->dev, "Failed to unlock registers: %d\n", ret);
+		return;
+	}
+
+	ret = wm831x_set_bits(wm831x, WM831X_BACKUP_CHARGER_CONTROL,
+			      WM831X_BKUP_CHG_ENA_MASK |
+			      WM831X_BKUP_CHG_MODE_MASK |
+			      WM831X_BKUP_BATT_DET_ENA_MASK |
+			      WM831X_BKUP_CHG_VLIM_MASK |
+			      WM831X_BKUP_CHG_ILIM_MASK,
+			      reg);
+	if (ret != 0)
+		dev_err(wm831x->dev,
+			"Failed to set backup charger config: %d\n", ret);
+
+	wm831x_reg_lock(wm831x);
+}
+
+static int wm831x_backup_get_prop(struct power_supply *psy,
+				  enum power_supply_property psp,
+				  union power_supply_propval *val)
+{
+	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev->parent);
+	struct wm831x *wm831x = wm831x_power->wm831x;
+	int ret = 0;
+
+	ret = wm831x_reg_read(wm831x, WM831X_BACKUP_CHARGER_CONTROL);
+	if (ret < 0)
+		return ret;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_STATUS:
+		if (ret & WM831X_BKUP_CHG_STS)
+			val->intval = POWER_SUPPLY_STATUS_CHARGING;
+		else
+			val->intval = POWER_SUPPLY_STATUS_NOT_CHARGING;
+		break;
+
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		ret = wm831x_power_read_voltage(wm831x, WM831X_AUX_BKUP_BATT,
+						val);
+		break;
+
+	case POWER_SUPPLY_PROP_PRESENT:
+		if (ret & WM831X_BKUP_CHG_STS)
+			val->intval = 1;
+		else
+			val->intval = 0;
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static enum power_supply_property wm831x_backup_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_PRESENT,
+};
+
+/*********************************************************************
+ *		Initialisation
+ *********************************************************************/
+
+static irqreturn_t wm831x_syslo_irq(int irq, void *data)
+{
+	struct wm831x_power *wm831x_power = data;
+	struct wm831x *wm831x = wm831x_power->wm831x;
+
+	/* Not much we can actually *do* but tell people for
+	 * posterity, we're probably about to run out of power. */
+	dev_crit(wm831x->dev, "SYSVDD under voltage\n");
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t wm831x_pwr_src_irq(int irq, void *data)
+{
+	struct wm831x_power *wm831x_power = data;
+	struct wm831x *wm831x = wm831x_power->wm831x;
+
+	dev_dbg(wm831x->dev, "Power source changed\n");
+
+	/* Just notify for everything - little harm in overnotifying.
+	 * The backup battery is not a power source while the system
+	 * is running so skip that.
+	 */
+	power_supply_changed(&wm831x_power->battery);
+	power_supply_changed(&wm831x_power->usb);
+	power_supply_changed(&wm831x_power->wall);
+
+	return IRQ_HANDLED;
+}
+
+static __devinit int wm831x_power_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_power *power;
+	struct power_supply *usb;
+	struct power_supply *battery;
+	struct power_supply *wall;
+	struct power_supply *backup;
+	int ret, irq, i;
+
+	power = kzalloc(sizeof(struct wm831x_power), GFP_KERNEL);
+	if (power == NULL)
+		return -ENOMEM;
+
+	power->wm831x = wm831x;
+	platform_set_drvdata(pdev, power);
+
+	usb = &power->usb;
+	battery = &power->battery;
+	wall = &power->wall;
+	backup = &power->backup;
+
+	/* We ignore configuration failures since we can still read back
+	 * the status without enabling either of the chargers.
+	 */
+	wm831x_config_battery(wm831x);
+	wm831x_config_backup(wm831x);
+
+	wall->name = "wm831x-wall";
+	wall->type = POWER_SUPPLY_TYPE_MAINS;
+	wall->properties = wm831x_wall_props;
+	wall->num_properties = ARRAY_SIZE(wm831x_wall_props);
+	wall->get_property = wm831x_wall_get_prop;
+	ret = power_supply_register(&pdev->dev, wall);
+	if (ret)
+		goto err_kmalloc;
+
+	battery->name = "wm831x-battery";
+	battery->properties = wm831x_bat_props;
+	battery->num_properties = ARRAY_SIZE(wm831x_bat_props);
+	battery->get_property = wm831x_bat_get_prop;
+	battery->use_for_apm = 1;
+	ret = power_supply_register(&pdev->dev, battery);
+	if (ret)
+		goto err_wall;
+
+	usb->name = "wm831x-usb",
+	usb->type = POWER_SUPPLY_TYPE_USB;
+	usb->properties = wm831x_usb_props;
+	usb->num_properties = ARRAY_SIZE(wm831x_usb_props);
+	usb->get_property = wm831x_usb_get_prop;
+	ret = power_supply_register(&pdev->dev, usb);
+	if (ret)
+		goto err_battery;
+
+	backup->name = "wm831x-backup";
+	backup->type = POWER_SUPPLY_TYPE_BATTERY;
+	backup->properties = wm831x_backup_props;
+	backup->num_properties = ARRAY_SIZE(wm831x_backup_props);
+	backup->get_property = wm831x_backup_get_prop;
+	ret = power_supply_register(&pdev->dev, backup);
+	if (ret)
+		goto err_usb;
+
+	irq = platform_get_irq_byname(pdev, "SYSLO");
+	ret = wm831x_request_irq(wm831x, irq, wm831x_syslo_irq,
+				 IRQF_TRIGGER_RISING, "SYSLO",
+				 power);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "Failed to request SYSLO IRQ %d: %d\n",
+			irq, ret);
+		goto err_backup;
+	}
+
+	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	ret = wm831x_request_irq(wm831x, irq, wm831x_pwr_src_irq,
+				 IRQF_TRIGGER_RISING, "Power source",
+				 power);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "Failed to request PWR SRC IRQ %d: %d\n",
+			irq, ret);
+		goto err_syslo;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(wm831x_bat_irqs); i++) {
+		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
+		ret = wm831x_request_irq(wm831x, irq, wm831x_bat_irq,
+					 IRQF_TRIGGER_RISING,
+					 wm831x_bat_irqs[i],
+					 power);
+		if (ret != 0) {
+			dev_err(&pdev->dev,
+				"Failed to request %s IRQ %d: %d\n",
+				wm831x_bat_irqs[i], irq, ret);
+			goto err_bat_irq;
+		}
+	}
+
+	return ret;
+
+err_bat_irq:
+	for (; i >= 0; i--) {
+		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
+		wm831x_free_irq(wm831x, irq, power);
+	}
+	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	wm831x_free_irq(wm831x, irq, power);
+err_syslo:
+	irq = platform_get_irq_byname(pdev, "SYSLO");
+	wm831x_free_irq(wm831x, irq, power);
+err_backup:
+	power_supply_unregister(backup);
+err_usb:
+	power_supply_unregister(usb);
+err_battery:
+	power_supply_unregister(battery);
+err_wall:
+	power_supply_unregister(wall);
+err_kmalloc:
+	kfree(power);
+	return ret;
+}
+
+static __devexit int wm831x_power_remove(struct platform_device *pdev)
+{
+	struct wm831x_power *wm831x_power = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = wm831x_power->wm831x;
+	int irq, i;
+
+	for (i = 0; i < ARRAY_SIZE(wm831x_bat_irqs); i++) {
+		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
+		wm831x_free_irq(wm831x, irq, wm831x_power);
+	}
+
+	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	wm831x_free_irq(wm831x, irq, wm831x_power);
+
+	irq = platform_get_irq_byname(pdev, "SYSLO");
+	wm831x_free_irq(wm831x, irq, wm831x_power);
+
+	power_supply_unregister(&wm831x_power->backup);
+	power_supply_unregister(&wm831x_power->battery);
+	power_supply_unregister(&wm831x_power->wall);
+	power_supply_unregister(&wm831x_power->usb);
+	return 0;
+}
+
+static struct platform_driver wm831x_power_driver = {
+	.probe = wm831x_power_probe,
+	.remove = __devexit_p(wm831x_power_remove),
+	.driver = {
+		.name = "wm831x-power",
+	},
+};
+
+static int __init wm831x_power_init(void)
+{
+	return platform_driver_register(&wm831x_power_driver);
+}
+module_init(wm831x_power_init);
+
+static void __exit wm831x_power_exit(void)
+{
+	platform_driver_unregister(&wm831x_power_driver);
+}
+module_exit(wm831x_power_exit);
+
+MODULE_DESCRIPTION("Power supply driver for WM831x PMICs");
+MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:wm831x-power");
diff --git a/include/linux/mfd/wm831x/pmu.h b/include/linux/mfd/wm831x/pmu.h
new file mode 100644
index 000000000000..b18cbb027bc3
--- /dev/null
+++ b/include/linux/mfd/wm831x/pmu.h
@@ -0,0 +1,189 @@
+/*
+ * include/linux/mfd/wm831x/pmu.h -- PMU for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_PMU_H__
+#define __MFD_WM831X_PMU_H__
+
+/*
+ * R16387 (0x4003) - Power State
+ */
+#define WM831X_CHIP_ON                          0x8000  /* CHIP_ON */
+#define WM831X_CHIP_ON_MASK                     0x8000  /* CHIP_ON */
+#define WM831X_CHIP_ON_SHIFT                        15  /* CHIP_ON */
+#define WM831X_CHIP_ON_WIDTH                         1  /* CHIP_ON */
+#define WM831X_CHIP_SLP                         0x4000  /* CHIP_SLP */
+#define WM831X_CHIP_SLP_MASK                    0x4000  /* CHIP_SLP */
+#define WM831X_CHIP_SLP_SHIFT                       14  /* CHIP_SLP */
+#define WM831X_CHIP_SLP_WIDTH                        1  /* CHIP_SLP */
+#define WM831X_REF_LP                           0x1000  /* REF_LP */
+#define WM831X_REF_LP_MASK                      0x1000  /* REF_LP */
+#define WM831X_REF_LP_SHIFT                         12  /* REF_LP */
+#define WM831X_REF_LP_WIDTH                          1  /* REF_LP */
+#define WM831X_PWRSTATE_DLY_MASK                0x0C00  /* PWRSTATE_DLY - [11:10] */
+#define WM831X_PWRSTATE_DLY_SHIFT                   10  /* PWRSTATE_DLY - [11:10] */
+#define WM831X_PWRSTATE_DLY_WIDTH                    2  /* PWRSTATE_DLY - [11:10] */
+#define WM831X_SWRST_DLY                        0x0200  /* SWRST_DLY */
+#define WM831X_SWRST_DLY_MASK                   0x0200  /* SWRST_DLY */
+#define WM831X_SWRST_DLY_SHIFT                       9  /* SWRST_DLY */
+#define WM831X_SWRST_DLY_WIDTH                       1  /* SWRST_DLY */
+#define WM831X_USB100MA_STARTUP_MASK            0x0030  /* USB100MA_STARTUP - [5:4] */
+#define WM831X_USB100MA_STARTUP_SHIFT                4  /* USB100MA_STARTUP - [5:4] */
+#define WM831X_USB100MA_STARTUP_WIDTH                2  /* USB100MA_STARTUP - [5:4] */
+#define WM831X_USB_CURR_STS                     0x0008  /* USB_CURR_STS */
+#define WM831X_USB_CURR_STS_MASK                0x0008  /* USB_CURR_STS */
+#define WM831X_USB_CURR_STS_SHIFT                    3  /* USB_CURR_STS */
+#define WM831X_USB_CURR_STS_WIDTH                    1  /* USB_CURR_STS */
+#define WM831X_USB_ILIM_MASK                    0x0007  /* USB_ILIM - [2:0] */
+#define WM831X_USB_ILIM_SHIFT                        0  /* USB_ILIM - [2:0] */
+#define WM831X_USB_ILIM_WIDTH                        3  /* USB_ILIM - [2:0] */
+
+/*
+ * R16397 (0x400D) - System Status
+ */
+#define WM831X_THW_STS                          0x8000  /* THW_STS */
+#define WM831X_THW_STS_MASK                     0x8000  /* THW_STS */
+#define WM831X_THW_STS_SHIFT                        15  /* THW_STS */
+#define WM831X_THW_STS_WIDTH                         1  /* THW_STS */
+#define WM831X_PWR_SRC_BATT                     0x0400  /* PWR_SRC_BATT */
+#define WM831X_PWR_SRC_BATT_MASK                0x0400  /* PWR_SRC_BATT */
+#define WM831X_PWR_SRC_BATT_SHIFT                   10  /* PWR_SRC_BATT */
+#define WM831X_PWR_SRC_BATT_WIDTH                    1  /* PWR_SRC_BATT */
+#define WM831X_PWR_WALL                         0x0200  /* PWR_WALL */
+#define WM831X_PWR_WALL_MASK                    0x0200  /* PWR_WALL */
+#define WM831X_PWR_WALL_SHIFT                        9  /* PWR_WALL */
+#define WM831X_PWR_WALL_WIDTH                        1  /* PWR_WALL */
+#define WM831X_PWR_USB                          0x0100  /* PWR_USB */
+#define WM831X_PWR_USB_MASK                     0x0100  /* PWR_USB */
+#define WM831X_PWR_USB_SHIFT                         8  /* PWR_USB */
+#define WM831X_PWR_USB_WIDTH                         1  /* PWR_USB */
+#define WM831X_MAIN_STATE_MASK                  0x001F  /* MAIN_STATE - [4:0] */
+#define WM831X_MAIN_STATE_SHIFT                      0  /* MAIN_STATE - [4:0] */
+#define WM831X_MAIN_STATE_WIDTH                      5  /* MAIN_STATE - [4:0] */
+
+/*
+ * R16456 (0x4048) - Charger Control 1
+ */
+#define WM831X_CHG_ENA                          0x8000  /* CHG_ENA */
+#define WM831X_CHG_ENA_MASK                     0x8000  /* CHG_ENA */
+#define WM831X_CHG_ENA_SHIFT                        15  /* CHG_ENA */
+#define WM831X_CHG_ENA_WIDTH                         1  /* CHG_ENA */
+#define WM831X_CHG_FRC                          0x4000  /* CHG_FRC */
+#define WM831X_CHG_FRC_MASK                     0x4000  /* CHG_FRC */
+#define WM831X_CHG_FRC_SHIFT                        14  /* CHG_FRC */
+#define WM831X_CHG_FRC_WIDTH                         1  /* CHG_FRC */
+#define WM831X_CHG_ITERM_MASK                   0x1C00  /* CHG_ITERM - [12:10] */
+#define WM831X_CHG_ITERM_SHIFT                      10  /* CHG_ITERM - [12:10] */
+#define WM831X_CHG_ITERM_WIDTH                       3  /* CHG_ITERM - [12:10] */
+#define WM831X_CHG_FAST                         0x0020  /* CHG_FAST */
+#define WM831X_CHG_FAST_MASK                    0x0020  /* CHG_FAST */
+#define WM831X_CHG_FAST_SHIFT                        5  /* CHG_FAST */
+#define WM831X_CHG_FAST_WIDTH                        1  /* CHG_FAST */
+#define WM831X_CHG_IMON_ENA                     0x0002  /* CHG_IMON_ENA */
+#define WM831X_CHG_IMON_ENA_MASK                0x0002  /* CHG_IMON_ENA */
+#define WM831X_CHG_IMON_ENA_SHIFT                    1  /* CHG_IMON_ENA */
+#define WM831X_CHG_IMON_ENA_WIDTH                    1  /* CHG_IMON_ENA */
+#define WM831X_CHG_CHIP_TEMP_MON                0x0001  /* CHG_CHIP_TEMP_MON */
+#define WM831X_CHG_CHIP_TEMP_MON_MASK           0x0001  /* CHG_CHIP_TEMP_MON */
+#define WM831X_CHG_CHIP_TEMP_MON_SHIFT               0  /* CHG_CHIP_TEMP_MON */
+#define WM831X_CHG_CHIP_TEMP_MON_WIDTH               1  /* CHG_CHIP_TEMP_MON */
+
+/*
+ * R16457 (0x4049) - Charger Control 2
+ */
+#define WM831X_CHG_OFF_MSK                      0x4000  /* CHG_OFF_MSK */
+#define WM831X_CHG_OFF_MSK_MASK                 0x4000  /* CHG_OFF_MSK */
+#define WM831X_CHG_OFF_MSK_SHIFT                    14  /* CHG_OFF_MSK */
+#define WM831X_CHG_OFF_MSK_WIDTH                     1  /* CHG_OFF_MSK */
+#define WM831X_CHG_TIME_MASK                    0x0F00  /* CHG_TIME - [11:8] */
+#define WM831X_CHG_TIME_SHIFT                        8  /* CHG_TIME - [11:8] */
+#define WM831X_CHG_TIME_WIDTH                        4  /* CHG_TIME - [11:8] */
+#define WM831X_CHG_TRKL_ILIM_MASK               0x00C0  /* CHG_TRKL_ILIM - [7:6] */
+#define WM831X_CHG_TRKL_ILIM_SHIFT                   6  /* CHG_TRKL_ILIM - [7:6] */
+#define WM831X_CHG_TRKL_ILIM_WIDTH                   2  /* CHG_TRKL_ILIM - [7:6] */
+#define WM831X_CHG_VSEL_MASK                    0x0030  /* CHG_VSEL - [5:4] */
+#define WM831X_CHG_VSEL_SHIFT                        4  /* CHG_VSEL - [5:4] */
+#define WM831X_CHG_VSEL_WIDTH                        2  /* CHG_VSEL - [5:4] */
+#define WM831X_CHG_FAST_ILIM_MASK               0x000F  /* CHG_FAST_ILIM - [3:0] */
+#define WM831X_CHG_FAST_ILIM_SHIFT                   0  /* CHG_FAST_ILIM - [3:0] */
+#define WM831X_CHG_FAST_ILIM_WIDTH                   4  /* CHG_FAST_ILIM - [3:0] */
+
+/*
+ * R16458 (0x404A) - Charger Status
+ */
+#define WM831X_BATT_OV_STS                      0x8000  /* BATT_OV_STS */
+#define WM831X_BATT_OV_STS_MASK                 0x8000  /* BATT_OV_STS */
+#define WM831X_BATT_OV_STS_SHIFT                    15  /* BATT_OV_STS */
+#define WM831X_BATT_OV_STS_WIDTH                     1  /* BATT_OV_STS */
+#define WM831X_CHG_STATE_MASK                   0x7000  /* CHG_STATE - [14:12] */
+#define WM831X_CHG_STATE_SHIFT                      12  /* CHG_STATE - [14:12] */
+#define WM831X_CHG_STATE_WIDTH                       3  /* CHG_STATE - [14:12] */
+#define WM831X_BATT_HOT_STS                     0x0800  /* BATT_HOT_STS */
+#define WM831X_BATT_HOT_STS_MASK                0x0800  /* BATT_HOT_STS */
+#define WM831X_BATT_HOT_STS_SHIFT                   11  /* BATT_HOT_STS */
+#define WM831X_BATT_HOT_STS_WIDTH                    1  /* BATT_HOT_STS */
+#define WM831X_BATT_COLD_STS                    0x0400  /* BATT_COLD_STS */
+#define WM831X_BATT_COLD_STS_MASK               0x0400  /* BATT_COLD_STS */
+#define WM831X_BATT_COLD_STS_SHIFT                  10  /* BATT_COLD_STS */
+#define WM831X_BATT_COLD_STS_WIDTH                   1  /* BATT_COLD_STS */
+#define WM831X_CHG_TOPOFF                       0x0200  /* CHG_TOPOFF */
+#define WM831X_CHG_TOPOFF_MASK                  0x0200  /* CHG_TOPOFF */
+#define WM831X_CHG_TOPOFF_SHIFT                      9  /* CHG_TOPOFF */
+#define WM831X_CHG_TOPOFF_WIDTH                      1  /* CHG_TOPOFF */
+#define WM831X_CHG_ACTIVE                       0x0100  /* CHG_ACTIVE */
+#define WM831X_CHG_ACTIVE_MASK                  0x0100  /* CHG_ACTIVE */
+#define WM831X_CHG_ACTIVE_SHIFT                      8  /* CHG_ACTIVE */
+#define WM831X_CHG_ACTIVE_WIDTH                      1  /* CHG_ACTIVE */
+#define WM831X_CHG_TIME_ELAPSED_MASK            0x00FF  /* CHG_TIME_ELAPSED - [7:0] */
+#define WM831X_CHG_TIME_ELAPSED_SHIFT                0  /* CHG_TIME_ELAPSED - [7:0] */
+#define WM831X_CHG_TIME_ELAPSED_WIDTH                8  /* CHG_TIME_ELAPSED - [7:0] */
+
+#define WM831X_CHG_STATE_OFF         (0 << WM831X_CHG_STATE_SHIFT)
+#define WM831X_CHG_STATE_TRICKLE     (1 << WM831X_CHG_STATE_SHIFT)
+#define WM831X_CHG_STATE_FAST        (2 << WM831X_CHG_STATE_SHIFT)
+#define WM831X_CHG_STATE_TRICKLE_OT  (3 << WM831X_CHG_STATE_SHIFT)
+#define WM831X_CHG_STATE_FAST_OT     (4 << WM831X_CHG_STATE_SHIFT)
+#define WM831X_CHG_STATE_DEFECTIVE   (5 << WM831X_CHG_STATE_SHIFT)
+
+/*
+ * R16459 (0x404B) - Backup Charger Control
+ */
+#define WM831X_BKUP_CHG_ENA                     0x8000  /* BKUP_CHG_ENA */
+#define WM831X_BKUP_CHG_ENA_MASK                0x8000  /* BKUP_CHG_ENA */
+#define WM831X_BKUP_CHG_ENA_SHIFT                   15  /* BKUP_CHG_ENA */
+#define WM831X_BKUP_CHG_ENA_WIDTH                    1  /* BKUP_CHG_ENA */
+#define WM831X_BKUP_CHG_STS                     0x4000  /* BKUP_CHG_STS */
+#define WM831X_BKUP_CHG_STS_MASK                0x4000  /* BKUP_CHG_STS */
+#define WM831X_BKUP_CHG_STS_SHIFT                   14  /* BKUP_CHG_STS */
+#define WM831X_BKUP_CHG_STS_WIDTH                    1  /* BKUP_CHG_STS */
+#define WM831X_BKUP_CHG_MODE                    0x1000  /* BKUP_CHG_MODE */
+#define WM831X_BKUP_CHG_MODE_MASK               0x1000  /* BKUP_CHG_MODE */
+#define WM831X_BKUP_CHG_MODE_SHIFT                  12  /* BKUP_CHG_MODE */
+#define WM831X_BKUP_CHG_MODE_WIDTH                   1  /* BKUP_CHG_MODE */
+#define WM831X_BKUP_BATT_DET_ENA                0x0800  /* BKUP_BATT_DET_ENA */
+#define WM831X_BKUP_BATT_DET_ENA_MASK           0x0800  /* BKUP_BATT_DET_ENA */
+#define WM831X_BKUP_BATT_DET_ENA_SHIFT              11  /* BKUP_BATT_DET_ENA */
+#define WM831X_BKUP_BATT_DET_ENA_WIDTH               1  /* BKUP_BATT_DET_ENA */
+#define WM831X_BKUP_BATT_STS                    0x0400  /* BKUP_BATT_STS */
+#define WM831X_BKUP_BATT_STS_MASK               0x0400  /* BKUP_BATT_STS */
+#define WM831X_BKUP_BATT_STS_SHIFT                  10  /* BKUP_BATT_STS */
+#define WM831X_BKUP_BATT_STS_WIDTH                   1  /* BKUP_BATT_STS */
+#define WM831X_BKUP_CHG_VLIM                    0x0010  /* BKUP_CHG_VLIM */
+#define WM831X_BKUP_CHG_VLIM_MASK               0x0010  /* BKUP_CHG_VLIM */
+#define WM831X_BKUP_CHG_VLIM_SHIFT                   4  /* BKUP_CHG_VLIM */
+#define WM831X_BKUP_CHG_VLIM_WIDTH                   1  /* BKUP_CHG_VLIM */
+#define WM831X_BKUP_CHG_ILIM_MASK               0x0003  /* BKUP_CHG_ILIM - [1:0] */
+#define WM831X_BKUP_CHG_ILIM_SHIFT                   0  /* BKUP_CHG_ILIM - [1:0] */
+#define WM831X_BKUP_CHG_ILIM_WIDTH                   2  /* BKUP_CHG_ILIM - [1:0] */
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 0403e3827788d878163f9ef0541b748b0f88ca5d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 8 Sep 2009 17:42:50 -0700
Subject: dmaengine: add fence support

Some engines optimize operation by reading ahead in the descriptor chain
such that descriptor2 may start execution before descriptor1 completes.
If descriptor2 depends on the result from descriptor1 then a fence is
required (on descriptor2) to disable this optimization.  The async_tx
api could implicitly identify dependencies via the 'depend_tx'
parameter, but that would constrain cases where the dependency chain
only specifies a completion order rather than a data dependency.  So,
provide an ASYNC_TX_FENCE to explicitly identify data dependencies.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 crypto/async_tx/async_memcpy.c      |  7 ++++--
 crypto/async_tx/async_memset.c      |  7 ++++--
 crypto/async_tx/async_pq.c          |  5 ++++
 crypto/async_tx/async_raid6_recov.c | 47 +++++++++++++++++++++----------------
 crypto/async_tx/async_xor.c         | 11 ++++++---
 drivers/md/raid5.c                  | 37 ++++++++++++++++++-----------
 include/linux/async_tx.h            |  3 +++
 include/linux/dmaengine.h           |  3 +++
 8 files changed, 79 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
index 98e15bd0dcb5..b38cbb3fd527 100644
--- a/crypto/async_tx/async_memcpy.c
+++ b/crypto/async_tx/async_memcpy.c
@@ -52,9 +52,12 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 
 	if (device) {
 		dma_addr_t dma_dest, dma_src;
-		unsigned long dma_prep_flags;
+		unsigned long dma_prep_flags = 0;
 
-		dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
+		if (submit->cb_fn)
+			dma_prep_flags |= DMA_PREP_INTERRUPT;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_prep_flags |= DMA_PREP_FENCE;
 		dma_dest = dma_map_page(device->dev, dest, dest_offset, len,
 					DMA_FROM_DEVICE);
 
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
index b896a6e5f673..a374784e3329 100644
--- a/crypto/async_tx/async_memset.c
+++ b/crypto/async_tx/async_memset.c
@@ -49,9 +49,12 @@ async_memset(struct page *dest, int val, unsigned int offset, size_t len,
 
 	if (device) {
 		dma_addr_t dma_dest;
-		unsigned long dma_prep_flags;
+		unsigned long dma_prep_flags = 0;
 
-		dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
+		if (submit->cb_fn)
+			dma_prep_flags |= DMA_PREP_INTERRUPT;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_prep_flags |= DMA_PREP_FENCE;
 		dma_dest = dma_map_page(device->dev, dest, offset, len,
 					DMA_FROM_DEVICE);
 
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 108b21efb499..a25e290c39fb 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -101,6 +101,7 @@ do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
 		 */
 		if (src_cnt > pq_src_cnt) {
 			submit->flags &= ~ASYNC_TX_ACK;
+			submit->flags |= ASYNC_TX_FENCE;
 			dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
 			submit->cb_fn = NULL;
 			submit->cb_param = NULL;
@@ -111,6 +112,8 @@ do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
 			if (cb_fn_orig)
 				dma_flags |= DMA_PREP_INTERRUPT;
 		}
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 
 		/* Since we have clobbered the src_list we are committed
 		 * to doing this asynchronously.  Drivers force forward
@@ -282,6 +285,8 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 			dma_flags |= DMA_PREP_PQ_DISABLE_P;
 		if (!Q(blocks, disks))
 			dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 		for (i = 0; i < disks; i++)
 			if (likely(blocks[i])) {
 				BUG_ON(is_raid6_zero_block(blocks[i]));
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
index 0c14d48c9896..822a42d10061 100644
--- a/crypto/async_tx/async_raid6_recov.c
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -44,6 +44,8 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
 		struct dma_async_tx_descriptor *tx;
 		enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
 
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 		dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
 		dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
 		dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
@@ -89,6 +91,8 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
 		struct dma_async_tx_descriptor *tx;
 		enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
 
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 		dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
 		dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
 		tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef,
@@ -138,7 +142,7 @@ __2data_recov_4(size_t bytes, int faila, int failb, struct page **blocks,
 	srcs[1] = q;
 	coef[0] = raid6_gfexi[failb-faila];
 	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
-	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
 	tx = async_sum_product(b, srcs, coef, bytes, submit);
 
 	/* Dy = P+Pxy+Dx */
@@ -188,23 +192,23 @@ __2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks,
 	dp = blocks[faila];
 	dq = blocks[failb];
 
-	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
 	tx = async_memcpy(dp, g, 0, 0, bytes, submit);
-	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
 	tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
 
 	/* compute P + Pxy */
 	srcs[0] = dp;
 	srcs[1] = p;
-	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
-			  scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
 	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
 
 	/* compute Q + Qxy */
 	srcs[0] = dq;
 	srcs[1] = q;
-	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
-			  scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
 	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
 
 	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
@@ -212,7 +216,7 @@ __2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks,
 	srcs[1] = dq;
 	coef[0] = raid6_gfexi[failb-faila];
 	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
-	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
 	tx = async_sum_product(dq, srcs, coef, bytes, submit);
 
 	/* Dy = P+Pxy+Dx */
@@ -252,7 +256,7 @@ __2data_recov_n(int disks, size_t bytes, int faila, int failb,
 	blocks[failb] = (void *)raid6_empty_zero_page;
 	blocks[disks-1] = dq;
 
-	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
 	tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
 
 	/* Restore pointer table */
@@ -264,15 +268,15 @@ __2data_recov_n(int disks, size_t bytes, int faila, int failb,
 	/* compute P + Pxy */
 	srcs[0] = dp;
 	srcs[1] = p;
-	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
-			  scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
 	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
 
 	/* compute Q + Qxy */
 	srcs[0] = dq;
 	srcs[1] = q;
-	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
-			  scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
 	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
 
 	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
@@ -280,7 +284,7 @@ __2data_recov_n(int disks, size_t bytes, int faila, int failb,
 	srcs[1] = dq;
 	coef[0] = raid6_gfexi[failb-faila];
 	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
-	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
 	tx = async_sum_product(dq, srcs, coef, bytes, submit);
 
 	/* Dy = P+Pxy+Dx */
@@ -407,13 +411,16 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 		int good = faila == 0 ? 1 : 0;
 		struct page *g = blocks[good];
 
-		init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+				  scribble);
 		tx = async_memcpy(p, g, 0, 0, bytes, submit);
 
-		init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+				  scribble);
 		tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
 	} else {
-		init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+				  scribble);
 		tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
 	}
 
@@ -426,11 +433,11 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 
 	srcs[0] = dq;
 	srcs[1] = q;
-	init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
-			  scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
 	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
 
-	init_async_submit(submit, 0, tx, NULL, NULL, scribble);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
 	tx = async_mult(dq, dq, coef, bytes, submit);
 
 	srcs[0] = p;
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 56b5f98da463..db279872ef3d 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -69,6 +69,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 		 */
 		if (src_cnt > xor_src_cnt) {
 			submit->flags &= ~ASYNC_TX_ACK;
+			submit->flags |= ASYNC_TX_FENCE;
 			dma_flags = DMA_COMPL_SKIP_DEST_UNMAP;
 			submit->cb_fn = NULL;
 			submit->cb_param = NULL;
@@ -78,7 +79,8 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 		}
 		if (submit->cb_fn)
 			dma_flags |= DMA_PREP_INTERRUPT;
-
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 		/* Since we have clobbered the src_list we are committed
 		 * to doing this asynchronously.  Drivers force forward progress
 		 * in case they can not provide a descriptor
@@ -264,12 +266,15 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 		dma_src = (dma_addr_t *) src_list;
 
 	if (dma_src && device && src_cnt <= device->max_xor) {
-		unsigned long dma_prep_flags;
+		unsigned long dma_prep_flags = 0;
 		int i;
 
 		pr_debug("%s: (async) len: %zu\n", __func__, len);
 
-		dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
+		if (submit->cb_fn)
+			dma_prep_flags |= DMA_PREP_INTERRUPT;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_prep_flags |= DMA_PREP_FENCE;
 		for (i = 0; i < src_cnt; i++)
 			dma_src[i] = dma_map_page(device->dev, src_list[i],
 						  offset, len, DMA_TO_DEVICE);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0a5cf2171214..54ef8d75541d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -502,13 +502,17 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 	int i;
 	int page_offset;
 	struct async_submit_ctl submit;
+	enum async_tx_flags flags = 0;
 
 	if (bio->bi_sector >= sector)
 		page_offset = (signed)(bio->bi_sector - sector) * 512;
 	else
 		page_offset = (signed)(sector - bio->bi_sector) * -512;
 
-	init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
+	if (frombio)
+		flags |= ASYNC_TX_FENCE;
+	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
+
 	bio_for_each_segment(bvl, bio, i) {
 		int len = bio_iovec_idx(bio, i)->bv_len;
 		int clen;
@@ -685,7 +689,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 
 	atomic_inc(&sh->count);
 
-	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
 			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
@@ -763,7 +767,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 		count = set_syndrome_sources(blocks, sh);
 		blocks[count] = NULL; /* regenerating p is not necessary */
 		BUG_ON(blocks[count+1] != dest); /* q should already be set */
-		init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
+		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+				  ops_complete_compute, sh,
 				  to_addr_conv(sh, percpu));
 		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
 	} else {
@@ -775,8 +780,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 			blocks[count++] = sh->dev[i].page;
 		}
 
-		init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
-				  ops_complete_compute, sh,
+		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+				  NULL, ops_complete_compute, sh,
 				  to_addr_conv(sh, percpu));
 		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
 	}
@@ -837,8 +842,9 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 		/* Q disk is one of the missing disks */
 		if (faila == syndrome_disks) {
 			/* Missing P+Q, just recompute */
-			init_async_submit(&submit, 0, NULL, ops_complete_compute,
-					  sh, to_addr_conv(sh, percpu));
+			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+					  ops_complete_compute, sh,
+					  to_addr_conv(sh, percpu));
 			return async_gen_syndrome(blocks, 0, count+2,
 						  STRIPE_SIZE, &submit);
 		} else {
@@ -859,21 +865,24 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 				blocks[count++] = sh->dev[i].page;
 			}
 			dest = sh->dev[data_target].page;
-			init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
-					  NULL, NULL, to_addr_conv(sh, percpu));
+			init_async_submit(&submit,
+					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+					  NULL, NULL, NULL,
+					  to_addr_conv(sh, percpu));
 			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
 				       &submit);
 
 			count = set_syndrome_sources(blocks, sh);
-			init_async_submit(&submit, 0, tx, ops_complete_compute,
-					  sh, to_addr_conv(sh, percpu));
+			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
+					  ops_complete_compute, sh,
+					  to_addr_conv(sh, percpu));
 			return async_gen_syndrome(blocks, 0, count+2,
 						  STRIPE_SIZE, &submit);
 		}
 	}
 
-	init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
-			  to_addr_conv(sh, percpu));
+	init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute,
+			  sh, to_addr_conv(sh, percpu));
 	if (failb == syndrome_disks) {
 		/* We're missing D+P. */
 		return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
@@ -916,7 +925,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 			xor_srcs[count++] = dev->page;
 	}
 
-	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
 			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
 	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 866e61c4e2e0..a1c486a88e88 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -58,11 +58,14 @@ struct dma_chan_ref {
  * array.
  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
  * dependency chain
+ * @ASYNC_TX_FENCE: specify that the next operation in the dependency
+ * chain uses this operation's result as an input
  */
 enum async_tx_flags {
 	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
 	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
 	ASYNC_TX_ACK		 = (1 << 2),
+	ASYNC_TX_FENCE		 = (1 << 3),
 };
 
 /**
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 1012f1abcb54..4d6c1c925fd4 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -87,6 +87,8 @@ enum dma_transaction_type {
  * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as
  *  sources that were the result of a previous operation, in the case of a PQ
  *  operation it continues the calculation with new sources
+ * @DMA_PREP_FENCE - tell the driver that subsequent operations depend
+ *  on the result of this operation
  */
 enum dma_ctrl_flags {
 	DMA_PREP_INTERRUPT = (1 << 0),
@@ -98,6 +100,7 @@ enum dma_ctrl_flags {
 	DMA_PREP_PQ_DISABLE_P = (1 << 6),
 	DMA_PREP_PQ_DISABLE_Q = (1 << 7),
 	DMA_PREP_CONTINUE = (1 << 8),
+	DMA_PREP_FENCE = (1 << 9),
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From 138f4c359d23d2ec38d18bd70dd9613ae515fe93 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 8 Sep 2009 17:42:51 -0700
Subject: dmaengine, async_tx: add a "no channel switch" allocator

Channel switching is problematic for some dmaengine drivers as the
architecture precludes separating the ->prep from ->submit.  In these
cases the driver can select ASYNC_TX_DISABLE_CHANNEL_SWITCH to modify
the async_tx allocator to only return channels that support all of the
required asynchronous operations.

For example MD_RAID456=y selects support for asynchronous xor, xor
validate, pq, pq validate, and memcpy.  When
ASYNC_TX_DISABLE_CHANNEL_SWITCH=y any channel with all these
capabilities is marked DMA_ASYNC_TX allowing async_tx_find_channel() to
quickly locate compatible channels with the guarantee that dependency
chains will remain on one channel.  When
ASYNC_TX_DISABLE_CHANNEL_SWITCH=n async_tx_find_channel() may select
channels that lead to operation chains that need to cross channel
boundaries using the async_tx channel switch capability.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 crypto/async_tx/async_tx.c |  4 ++++
 drivers/dma/Kconfig        |  4 ++++
 drivers/dma/dmaengine.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h  | 10 +++++++++-
 4 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
index 60615fedcf5e..f9cdf04fe7c0 100644
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -81,6 +81,10 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 	struct dma_device *device = chan->device;
 	struct dma_async_tx_descriptor *intr_tx = (void *) ~0;
 
+	#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
+	BUG();
+	#endif
+
 	/* first check to see if we can still append to depend_tx */
 	spin_lock_bh(&depend_tx->lock);
 	if (depend_tx->parent && depend_tx->chan == tx->chan) {
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 912a51b5cbd3..ddcd9793b25c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -17,11 +17,15 @@ if DMADEVICES
 
 comment "DMA Devices"
 
+config ASYNC_TX_DISABLE_CHANNEL_SWITCH
+	bool
+
 config INTEL_IOATDMA
 	tristate "Intel I/OAT DMA support"
 	depends on PCI && X86
 	select DMA_ENGINE
 	select DCA
+	select ASYNC_TX_DISABLE_CHANNEL_SWITCH
 	help
 	  Enable support for the Intel(R) I/OAT DMA engine present
 	  in recent Intel Xeon chipsets.
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 96598479eece..d5bc628d207c 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -608,6 +608,40 @@ void dmaengine_put(void)
 }
 EXPORT_SYMBOL(dmaengine_put);
 
+static bool device_has_all_tx_types(struct dma_device *device)
+{
+	/* A device that satisfies this test has channels that will never cause
+	 * an async_tx channel switch event as all possible operation types can
+	 * be handled.
+	 */
+	#ifdef CONFIG_ASYNC_TX_DMA
+	if (!dma_has_cap(DMA_INTERRUPT, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_MEMCPY) || defined(CONFIG_ASYNC_MEMCPY_MODULE)
+	if (!dma_has_cap(DMA_MEMCPY, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_MEMSET) || defined(CONFIG_ASYNC_MEMSET_MODULE)
+	if (!dma_has_cap(DMA_MEMSET, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_XOR) || defined(CONFIG_ASYNC_XOR_MODULE)
+	if (!dma_has_cap(DMA_XOR, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_PQ) || defined(CONFIG_ASYNC_PQ_MODULE)
+	if (!dma_has_cap(DMA_PQ, device->cap_mask))
+		return false;
+	#endif
+
+	return true;
+}
+
 static int get_dma_id(struct dma_device *device)
 {
 	int rc;
@@ -665,6 +699,12 @@ int dma_async_device_register(struct dma_device *device)
 	BUG_ON(!device->device_issue_pending);
 	BUG_ON(!device->dev);
 
+	/* note: this only matters in the
+	 * CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y case
+	 */
+	if (device_has_all_tx_types(device))
+		dma_cap_set(DMA_ASYNC_TX, device->cap_mask);
+
 	idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL);
 	if (!idr_ref)
 		return -ENOMEM;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 4d6c1c925fd4..86853ed7970b 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -48,6 +48,9 @@ enum dma_status {
 
 /**
  * enum dma_transaction_type - DMA transaction types/indexes
+ *
+ * Note: The DMA_ASYNC_TX capability is not to be set by drivers.  It is
+ * automatically set as dma devices are registered.
  */
 enum dma_transaction_type {
 	DMA_MEMCPY,
@@ -61,6 +64,7 @@ enum dma_transaction_type {
 	DMA_MEMCPY_CRC32C,
 	DMA_INTERRUPT,
 	DMA_PRIVATE,
+	DMA_ASYNC_TX,
 	DMA_SLAVE,
 };
 
@@ -396,7 +400,11 @@ static inline void net_dmaengine_put(void)
 #ifdef CONFIG_ASYNC_TX_DMA
 #define async_dmaengine_get()	dmaengine_get()
 #define async_dmaengine_put()	dmaengine_put()
+#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
+#define async_dma_find_channel(type) dma_find_channel(DMA_ASYNC_TX)
+#else
 #define async_dma_find_channel(type) dma_find_channel(type)
+#endif /* CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH */
 #else
 static inline void async_dmaengine_get(void)
 {
@@ -409,7 +417,7 @@ async_dma_find_channel(enum dma_transaction_type type)
 {
 	return NULL;
 }
-#endif
+#endif /* CONFIG_ASYNC_TX_DMA */
 
 dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
 	void *dest, void *src, size_t len);
-- 
cgit v1.2.3-71-gd317


From 9308add6ea4fedeba37b0d7c4630a542bd34f214 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 8 Sep 2009 17:42:52 -0700
Subject: dmaengine: cleanup unused transaction types

No drivers currently implement these operation types, so they can be
deleted.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/arm/mach-iop13xx/setup.c | 7 -------
 arch/arm/plat-iop/adma.c      | 2 --
 drivers/dma/iop-adma.c        | 5 +----
 include/linux/dmaengine.h     | 3 ---
 4 files changed, 1 insertion(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c
index faaef95342b6..5c147fb66a01 100644
--- a/arch/arm/mach-iop13xx/setup.c
+++ b/arch/arm/mach-iop13xx/setup.c
@@ -477,10 +477,8 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_0_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_1:
@@ -489,10 +487,8 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_1_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_2:
@@ -501,13 +497,10 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_2_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			dma_cap_set(DMA_PQ, plat_data->cap_mask);
-			dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask);
 			dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask);
 			break;
 		}
diff --git a/arch/arm/plat-iop/adma.c b/arch/arm/plat-iop/adma.c
index da1dd0dab07c..1ff6a37e893c 100644
--- a/arch/arm/plat-iop/adma.c
+++ b/arch/arm/plat-iop/adma.c
@@ -179,7 +179,6 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
 	#else
 	dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask);
-	dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_0_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
 	#endif
 
@@ -188,7 +187,6 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
 	#else
 	dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask);
-	dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_1_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
 	#endif
 
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index 4496bc606662..cecb6d657d55 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1256,15 +1256,12 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
 	}
 
 	dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: "
-	  "( %s%s%s%s%s%s%s%s%s%s)\n",
+	  "( %s%s%s%s%s%s%s)\n",
 	  dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "",
-	  dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "",
 	  dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "",
 	  dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
-	  dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "",
 	  dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "",
 	  dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)  ? "fill " : "",
-	  dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "",
 	  dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
 	  dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
 
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 86853ed7970b..db23fd583f98 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -56,12 +56,9 @@ enum dma_transaction_type {
 	DMA_MEMCPY,
 	DMA_XOR,
 	DMA_PQ,
-	DMA_DUAL_XOR,
-	DMA_PQ_UPDATE,
 	DMA_XOR_VAL,
 	DMA_PQ_VAL,
 	DMA_MEMSET,
-	DMA_MEMCPY_CRC32C,
 	DMA_INTERRUPT,
 	DMA_PRIVATE,
 	DMA_ASYNC_TX,
-- 
cgit v1.2.3-71-gd317


From 83544ae9f3991bfc7d5e0fe9a3008cd05a8d57b7 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 8 Sep 2009 17:42:53 -0700
Subject: dmaengine, async_tx: support alignment checks

Some engines have transfer size and address alignment restrictions.  Add
a per-operation alignment property to struct dma_device that the async
routines and dmatest can use to check alignment capabilities.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 crypto/async_tx/async_memcpy.c |  2 +-
 crypto/async_tx/async_memset.c |  2 +-
 crypto/async_tx/async_pq.c     |  6 ++++--
 crypto/async_tx/async_xor.c    |  5 +++--
 drivers/dma/dmatest.c          | 14 ++++++++++++++
 include/linux/dmaengine.h      | 44 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 67 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
index b38cbb3fd527..0ec1fb69d4ea 100644
--- a/crypto/async_tx/async_memcpy.c
+++ b/crypto/async_tx/async_memcpy.c
@@ -50,7 +50,7 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 	struct dma_device *device = chan ? chan->device : NULL;
 	struct dma_async_tx_descriptor *tx = NULL;
 
-	if (device) {
+	if (device && is_dma_copy_aligned(device, src_offset, dest_offset, len)) {
 		dma_addr_t dma_dest, dma_src;
 		unsigned long dma_prep_flags = 0;
 
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
index a374784e3329..58e4a8752aee 100644
--- a/crypto/async_tx/async_memset.c
+++ b/crypto/async_tx/async_memset.c
@@ -47,7 +47,7 @@ async_memset(struct page *dest, int val, unsigned int offset, size_t len,
 	struct dma_device *device = chan ? chan->device : NULL;
 	struct dma_async_tx_descriptor *tx = NULL;
 
-	if (device) {
+	if (device && is_dma_fill_aligned(device, offset, 0, len)) {
 		dma_addr_t dma_dest;
 		unsigned long dma_prep_flags = 0;
 
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index a25e290c39fb..b88db6d1dc65 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -211,7 +211,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 
 	if (dma_src && device &&
 	    (src_cnt <= dma_maxpq(device, 0) ||
-	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0)) {
+	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
+	    is_dma_pq_aligned(device, offset, 0, len)) {
 		/* run the p+q asynchronously */
 		pr_debug("%s: (async) disks: %d len: %zu\n",
 			 __func__, disks, len);
@@ -274,7 +275,8 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
 		dma_src = (dma_addr_t *) blocks;
 
-	if (dma_src && device && disks <= dma_maxpq(device, 0)) {
+	if (dma_src && device && disks <= dma_maxpq(device, 0) &&
+	    is_dma_pq_aligned(device, offset, 0, len)) {
 		struct device *dev = device->dev;
 		dma_addr_t *pq = &dma_src[disks-2];
 		int i;
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index db279872ef3d..b459a9034aac 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -193,7 +193,7 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
 		dma_src = (dma_addr_t *) src_list;
 
-	if (dma_src && chan) {
+	if (dma_src && chan && is_dma_xor_aligned(chan->device, offset, 0, len)) {
 		/* run the xor asynchronously */
 		pr_debug("%s (async): len: %zu\n", __func__, len);
 
@@ -265,7 +265,8 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
 	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
 		dma_src = (dma_addr_t *) src_list;
 
-	if (dma_src && device && src_cnt <= device->max_xor) {
+	if (dma_src && device && src_cnt <= device->max_xor &&
+	    is_dma_xor_aligned(device, offset, 0, len)) {
 		unsigned long dma_prep_flags = 0;
 		int i;
 
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index 58e49e41c7a3..a3722a7384b5 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -288,6 +288,7 @@ static int dmatest_func(void *data)
 		dma_addr_t dma_dsts[dst_cnt];
 		struct completion cmp;
 		unsigned long tmo = msecs_to_jiffies(3000);
+		u8 align = 0;
 
 		total_tests++;
 
@@ -295,6 +296,18 @@ static int dmatest_func(void *data)
 		src_off = dmatest_random() % (test_buf_size - len + 1);
 		dst_off = dmatest_random() % (test_buf_size - len + 1);
 
+		/* honor alignment restrictions */
+		if (thread->type == DMA_MEMCPY)
+			align = dev->copy_align;
+		else if (thread->type == DMA_XOR)
+			align = dev->xor_align;
+		else if (thread->type == DMA_PQ)
+			align = dev->pq_align;
+
+		len = (len >> align) << align;
+		src_off = (src_off >> align) << align;
+		dst_off = (dst_off >> align) << align;
+
 		dmatest_init_srcs(thread->srcs, src_off, len);
 		dmatest_init_dsts(thread->dsts, dst_off, len);
 
@@ -311,6 +324,7 @@ static int dmatest_func(void *data)
 						     DMA_BIDIRECTIONAL);
 		}
 
+
 		if (thread->type == DMA_MEMCPY)
 			tx = dev->device_prep_dma_memcpy(chan,
 							 dma_dsts[0] + dst_off,
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index db23fd583f98..835b9c7bf1c2 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -245,6 +245,10 @@ struct dma_async_tx_descriptor {
  * @cap_mask: one or more dma_capability flags
  * @max_xor: maximum number of xor sources, 0 if no capability
  * @max_pq: maximum number of PQ sources and PQ-continue capability
+ * @copy_align: alignment shift for memcpy operations
+ * @xor_align: alignment shift for xor operations
+ * @pq_align: alignment shift for pq operations
+ * @fill_align: alignment shift for memset operations
  * @dev_id: unique device ID
  * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
@@ -271,6 +275,10 @@ struct dma_device {
 	dma_cap_mask_t  cap_mask;
 	unsigned short max_xor;
 	unsigned short max_pq;
+	u8 copy_align;
+	u8 xor_align;
+	u8 pq_align;
+	u8 fill_align;
 	#define DMA_HAS_PQ_CONTINUE (1 << 15)
 
 	int dev_id;
@@ -314,6 +322,42 @@ struct dma_device {
 	void (*device_issue_pending)(struct dma_chan *chan);
 };
 
+static inline bool dmaengine_check_align(u8 align, size_t off1, size_t off2, size_t len)
+{
+	size_t mask;
+
+	if (!align)
+		return true;
+	mask = (1 << align) - 1;
+	if (mask & (off1 | off2 | len))
+		return false;
+	return true;
+}
+
+static inline bool is_dma_copy_aligned(struct dma_device *dev, size_t off1,
+				       size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->copy_align, off1, off2, len);
+}
+
+static inline bool is_dma_xor_aligned(struct dma_device *dev, size_t off1,
+				      size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->xor_align, off1, off2, len);
+}
+
+static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1,
+				     size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->pq_align, off1, off2, len);
+}
+
+static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1,
+				       size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->fill_align, off1, off2, len);
+}
+
 static inline void
 dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue)
 {
-- 
cgit v1.2.3-71-gd317


From b265b11fc1a0bd6ae5a7fde12e374583a52ab326 Mon Sep 17 00:00:00 2001
From: Tom Picard <tom.s.picard@intel.com>
Date: Tue, 8 Sep 2009 17:43:01 -0700
Subject: ioat3: ioat3.2 pci ids for Jasper Forest

Jasper Forest introduces raid offload support via ioat3.2 support.  When
raid offload is enabled two (out of 8 channels) will report raid5/raid6
offload capabilities.  The remaining channels will only report ioat3.0
capabilities (memcpy).

Signed-off-by: Tom Picard <tom.s.picard@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/ioat/pci.c  | 13 +++++++++++++
 include/linux/pci_ids.h | 10 ++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c
index 626a508a4f8b..6c1aac5b3598 100644
--- a/drivers/dma/ioat/pci.c
+++ b/drivers/dma/ioat/pci.c
@@ -58,6 +58,19 @@ static struct pci_device_id ioat_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) },
+
+	/* I/OAT v3.2 platforms */
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF0) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF5) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF6) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF7) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF8) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF9) },
+
 	{ 0, }
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0f71812d67d3..2b4b8ce53256 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2529,6 +2529,16 @@
 #define PCI_DEVICE_ID_INTEL_E7525_MCH	0x359e
 #define PCI_DEVICE_ID_INTEL_IOAT_CNB	0x360b
 #define PCI_DEVICE_ID_INTEL_FBD_CNB	0x360c
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF0	0x3710
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF1	0x3711
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF2	0x3712
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF3	0x3713
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF4	0x3714
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF5	0x3715
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF6	0x3716
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF7	0x3717
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF8	0x3718
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF9	0x3719
 #define PCI_DEVICE_ID_INTEL_ICH10_0	0x3a14
 #define PCI_DEVICE_ID_INTEL_ICH10_1	0x3a16
 #define PCI_DEVICE_ID_INTEL_ICH10_2	0x3a18
-- 
cgit v1.2.3-71-gd317


From 0803172778901e24a75ab074798d98c2b7411559 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 8 Sep 2009 17:53:04 -0700
Subject: dmaengine: kill tx_list

The tx_list attribute of struct dma_async_tx_descriptor is common to
most, but not all dma driver implementations.  None of the upper level
code (dmaengine/async_tx) uses it, so allow drivers to implement it
locally if they need it.  This saves sizeof(struct list_head) bytes for
drivers that do not manage descriptors with a linked list (e.g.: ioatdma
v2,3).

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/dmaengine.c   | 1 -
 include/linux/dmaengine.h | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 5a87384ea4ff..562d182eae66 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -933,7 +933,6 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 {
 	tx->chan = chan;
 	spin_lock_init(&tx->lock);
-	INIT_LIST_HEAD(&tx->tx_list);
 }
 EXPORT_SYMBOL(dma_async_tx_descriptor_init);
 
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ffefba81c818..f114bc7790bc 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -180,8 +180,6 @@ typedef void (*dma_async_tx_callback)(void *dma_async_param);
  * @flags: flags to augment operation preparation, control completion, and
  * 	communicate status
  * @phys: physical address of the descriptor
- * @tx_list: driver common field for operations that require multiple
- *	descriptors
  * @chan: target channel for this operation
  * @tx_submit: set the prepared descriptor(s) to be executed by the engine
  * @callback: routine to call after this operation is complete
@@ -195,7 +193,6 @@ struct dma_async_tx_descriptor {
 	dma_cookie_t cookie;
 	enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */
 	dma_addr_t phys;
-	struct list_head tx_list;
 	struct dma_chan *chan;
 	dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
 	dma_async_tx_callback callback;
-- 
cgit v1.2.3-71-gd317


From 711d57796f5ce2d02d6e62c9034afbb16aedda31 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 27 Jul 2009 23:37:48 +0300
Subject: PCI: expose function reset capability in sysfs

Some devices allow an individual function to be reset without affecting
other functions in the same device: that's what pci_reset_function does.
For devices that have this support, expose reset attribite in sysfs.

This is useful e.g. for virtualization, where a qemu userspace
process wants to reset the device when the guest is reset,
to emulate machine reboot as closely as possible.

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci | 10 +++++++++
 drivers/pci/pci-sysfs.c                 | 37 +++++++++++++++++++++++++++++++++
 drivers/pci/pci.c                       | 16 ++++++++++++++
 drivers/pci/pci.h                       |  1 +
 include/linux/pci.h                     |  1 +
 5 files changed, 65 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 6bf68053e4b8..25be3250f7d6 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -84,6 +84,16 @@ Description:
 		from this part of the device tree.
 		Depends on CONFIG_HOTPLUG.
 
+What:		/sys/bus/pci/devices/.../reset
+Date:		July 2009
+Contact:	Michael S. Tsirkin <mst@redhat.com>
+Description:
+		Some devices allow an individual function to be reset
+		without affecting other functions in the same device.
+		For devices that have this support, a file named reset
+		will be present in sysfs.  Writing 1 to this file
+		will perform reset.
+
 What:		/sys/bus/pci/devices/.../vpd
 Date:		February 2008
 Contact:	Ben Hutchings <bhutchings@solarflare.com>
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 85ebd02a64a7..0f6382f090ee 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -916,6 +916,24 @@ int __attribute__ ((weak)) pcibios_add_platform_entries(struct pci_dev *dev)
 	return 0;
 }
 
+static ssize_t reset_store(struct device *dev,
+			   struct device_attribute *attr, const char *buf,
+			   size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	unsigned long val;
+	ssize_t result = strict_strtoul(buf, 0, &val);
+
+	if (result < 0)
+		return result;
+
+	if (val != 1)
+		return -EINVAL;
+	return pci_reset_function(pdev);
+}
+
+static struct device_attribute reset_attr = __ATTR(reset, 0200, NULL, reset_store);
+
 static int pci_create_capabilities_sysfs(struct pci_dev *dev)
 {
 	int retval;
@@ -943,7 +961,22 @@ static int pci_create_capabilities_sysfs(struct pci_dev *dev)
 	/* Active State Power Management */
 	pcie_aspm_create_sysfs_dev_files(dev);
 
+	if (!pci_probe_reset_function(dev)) {
+		retval = device_create_file(&dev->dev, &reset_attr);
+		if (retval)
+			goto error;
+		dev->reset_fn = 1;
+	}
 	return 0;
+
+error:
+	pcie_aspm_remove_sysfs_dev_files(dev);
+	if (dev->vpd && dev->vpd->attr) {
+		sysfs_remove_bin_file(&dev->dev.kobj, dev->vpd->attr);
+		kfree(dev->vpd->attr);
+	}
+
+	return retval;
 }
 
 int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
@@ -1037,6 +1070,10 @@ static void pci_remove_capabilities_sysfs(struct pci_dev *dev)
 	}
 
 	pcie_aspm_remove_sysfs_dev_files(dev);
+	if (dev->reset_fn) {
+		device_remove_file(&dev->dev, &reset_attr);
+		dev->reset_fn = 0;
+	}
 }
 
 /**
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7b70312181d7..7d55039ffa05 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2261,6 +2261,22 @@ int __pci_reset_function(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(__pci_reset_function);
 
+/**
+ * pci_probe_reset_function - check whether the device can be safely reset
+ * @dev: PCI device to reset
+ *
+ * Some devices allow an individual function to be reset without affecting
+ * other functions in the same device.  The PCI device must be responsive
+ * to PCI config space in order to use this function.
+ *
+ * Returns 0 if the device function can be reset or negative if the
+ * device doesn't support resetting a single function.
+ */
+int pci_probe_reset_function(struct pci_dev *dev)
+{
+	return pci_dev_reset(dev, 1);
+}
+
 /**
  * pci_reset_function - quiesce and reset a PCI device function
  * @dev: PCI device to reset
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 5ff4d25bf0e9..73d9d92715a0 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -16,6 +16,7 @@ extern void pci_cleanup_rom(struct pci_dev *dev);
 extern int pci_mmap_fits(struct pci_dev *pdev, int resno,
 			 struct vm_area_struct *vma);
 #endif
+int pci_probe_reset_function(struct pci_dev *dev);
 
 /**
  * struct pci_platform_pm_ops - Firmware PM callbacks
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 115fb7ba5089..a90f94020798 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -276,6 +276,7 @@ struct pci_dev {
 	unsigned int	state_saved:1;
 	unsigned int	is_physfn:1;
 	unsigned int	is_virtfn:1;
+	unsigned int	reset_fn:1;
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
-- 
cgit v1.2.3-71-gd317


From 1d4a433fc4e9c7dbfc2069a8f1a1f4338b982427 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 6 Aug 2009 15:13:59 -0400
Subject: PCI: Document pci_ids.h addition policy.

IDs should generally only be added to pci_ids.h when they're shared
across several files in the tree.  IDs that are just used by a single
driver should be defined in the driver instead.

Perhaps documenting this is a good idea to prevent things being moved there,
as it still seems to be happening judging from the git log.

(based on discussion w/gregkh and others).

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci_ids.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 73b46b6b904f..6235ce2bea04 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2,6 +2,9 @@
  *	PCI Class, Vendor and Device IDs
  *
  *	Please keep sorted.
+ *
+ *	Do not add new entries to this file unless the definitions
+ *	are shared between multiple drivers.
  */
 
 /* Device classes and subclasses */
-- 
cgit v1.2.3-71-gd317


From deb2d2ecd43dfc51efe71eed7128fda514da96c6 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 11 Aug 2009 15:52:06 +1000
Subject: PCI/GPU: implement VGA arbitration on Linux

Background:
Graphic devices are accessed through ranges in I/O or memory space. While most
modern devices allow relocation of such ranges, some "Legacy" VGA devices
implemented on PCI will typically have the same "hard-decoded" addresses as
they did on ISA. For more details see "PCI Bus Binding to IEEE Std 1275-1994
Standard for Boot (Initialization Configuration) Firmware Revision 2.1"
Section 7, Legacy Devices.

The Resource Access Control (RAC) module inside the X server currently does
the task of arbitration when more than one legacy device co-exists on the same
machine. But the problem happens when these devices are trying to be accessed
by different userspace clients (e.g. two server in parallel). Their address
assignments conflict. Therefore an arbitration scheme _outside_ of the X
server is needed to control the sharing of these resources. This document
introduces the operation of the VGA arbiter implemented for Linux kernel.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Tiago Vignatti <tiago.vignatti@nokia.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/gpu/Makefile     |    2 +-
 drivers/gpu/vga/Kconfig  |   10 +
 drivers/gpu/vga/Makefile |    1 +
 drivers/gpu/vga/vgaarb.c | 1206 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.c        |   44 ++
 drivers/video/Kconfig    |    2 +
 include/linux/pci.h      |    2 +
 include/linux/vgaarb.h   |  195 ++++++++
 8 files changed, 1461 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/vga/Kconfig
 create mode 100644 drivers/gpu/vga/Makefile
 create mode 100644 drivers/gpu/vga/vgaarb.c
 create mode 100644 include/linux/vgaarb.h

(limited to 'include/linux')

diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile
index de566cf0414c..30879df3daea 100644
--- a/drivers/gpu/Makefile
+++ b/drivers/gpu/Makefile
@@ -1 +1 @@
-obj-y			+= drm/
+obj-y			+= drm/ vga/
diff --git a/drivers/gpu/vga/Kconfig b/drivers/gpu/vga/Kconfig
new file mode 100644
index 000000000000..790e675b13eb
--- /dev/null
+++ b/drivers/gpu/vga/Kconfig
@@ -0,0 +1,10 @@
+config VGA_ARB
+	bool "VGA Arbitration" if EMBEDDED
+	default y
+	depends on PCI
+	help
+	  Some "legacy" VGA devices implemented on PCI typically have the same
+	  hard-decoded addresses as they did on ISA. When multiple PCI devices
+	  are accessed at same time they need some kind of coordination. Please
+	  see Documentation/vgaarbiter.txt for more details. Select this to
+	  enable VGA arbiter.
diff --git a/drivers/gpu/vga/Makefile b/drivers/gpu/vga/Makefile
new file mode 100644
index 000000000000..7cc8c1ed645b
--- /dev/null
+++ b/drivers/gpu/vga/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_VGA_ARB)  += vgaarb.o
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
new file mode 100644
index 000000000000..199138f241e0
--- /dev/null
+++ b/drivers/gpu/vga/vgaarb.c
@@ -0,0 +1,1206 @@
+/*
+ * vgaarb.c
+ *
+ * (C) Copyright 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ * (C) Copyright 2007 Paulo R. Zanoni <przanoni@gmail.com>
+ * (C) Copyright 2007, 2009 Tiago Vignatti <vignatti@freedesktop.org>
+ *
+ * Implements the VGA arbitration. For details refer to
+ * Documentation/vgaarbiter.txt
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/poll.h>
+#include <linux/miscdevice.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/vgaarb.h>
+
+static void vga_arbiter_notify_clients(void);
+/*
+ * We keep a list of all vga devices in the system to speed
+ * up the various operations of the arbiter
+ */
+struct vga_device {
+	struct list_head list;
+	struct pci_dev *pdev;
+	unsigned int decodes;	/* what does it decodes */
+	unsigned int owns;	/* what does it owns */
+	unsigned int locks;	/* what does it locks */
+	unsigned int io_lock_cnt;	/* legacy IO lock count */
+	unsigned int mem_lock_cnt;	/* legacy MEM lock count */
+	unsigned int io_norm_cnt;	/* normal IO count */
+	unsigned int mem_norm_cnt;	/* normal MEM count */
+
+	/* allow IRQ enable/disable hook */
+	void *cookie;
+	void (*irq_set_state)(void *cookie, bool enable);
+	unsigned int (*set_vga_decode)(void *cookie, bool decode);
+};
+
+static LIST_HEAD(vga_list);
+static int vga_count, vga_decode_count;
+static bool vga_arbiter_used;
+static DEFINE_SPINLOCK(vga_lock);
+static DECLARE_WAIT_QUEUE_HEAD(vga_wait_queue);
+
+
+static const char *vga_iostate_to_str(unsigned int iostate)
+{
+	/* Ignore VGA_RSRC_IO and VGA_RSRC_MEM */
+	iostate &= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
+	switch (iostate) {
+	case VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM:
+		return "io+mem";
+	case VGA_RSRC_LEGACY_IO:
+		return "io";
+	case VGA_RSRC_LEGACY_MEM:
+		return "mem";
+	}
+	return "none";
+}
+
+static int vga_str_to_iostate(char *buf, int str_size, int *io_state)
+{
+	/* we could in theory hand out locks on IO and mem
+	 * separately to userspace but it can cause deadlocks */
+	if (strncmp(buf, "none", 4) == 0) {
+		*io_state = VGA_RSRC_NONE;
+		return 1;
+	}
+
+	/* XXX We're not chekcing the str_size! */
+	if (strncmp(buf, "io+mem", 6) == 0)
+		goto both;
+	else if (strncmp(buf, "io", 2) == 0)
+		goto both;
+	else if (strncmp(buf, "mem", 3) == 0)
+		goto both;
+	return 0;
+both:
+	*io_state = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
+	return 1;
+}
+
+#ifndef __ARCH_HAS_VGA_DEFAULT_DEVICE
+/* this is only used a cookie - it should not be dereferenced */
+static struct pci_dev *vga_default;
+#endif
+
+static void vga_arb_device_card_gone(struct pci_dev *pdev);
+
+/* Find somebody in our list */
+static struct vga_device *vgadev_find(struct pci_dev *pdev)
+{
+	struct vga_device *vgadev;
+
+	list_for_each_entry(vgadev, &vga_list, list)
+		if (pdev == vgadev->pdev)
+			return vgadev;
+	return NULL;
+}
+
+/* Returns the default VGA device (vgacon's babe) */
+#ifndef __ARCH_HAS_VGA_DEFAULT_DEVICE
+struct pci_dev *vga_default_device(void)
+{
+	return vga_default;
+}
+#endif
+
+static inline void vga_irq_set_state(struct vga_device *vgadev, bool state)
+{
+	if (vgadev->irq_set_state)
+		vgadev->irq_set_state(vgadev->cookie, state);
+}
+
+
+/* If we don't ever use VGA arb we should avoid
+   turning off anything anywhere due to old X servers getting
+   confused about the boot device not being VGA */
+static void vga_check_first_use(void)
+{
+	/* we should inform all GPUs in the system that
+	 * VGA arb has occured and to try and disable resources
+	 * if they can */
+	if (!vga_arbiter_used) {
+		vga_arbiter_used = true;
+		vga_arbiter_notify_clients();
+	}
+}
+
+static struct vga_device *__vga_tryget(struct vga_device *vgadev,
+				       unsigned int rsrc)
+{
+	unsigned int wants, legacy_wants, match;
+	struct vga_device *conflict;
+	unsigned int pci_bits;
+	/* Account for "normal" resources to lock. If we decode the legacy,
+	 * counterpart, we need to request it as well
+	 */
+	if ((rsrc & VGA_RSRC_NORMAL_IO) &&
+	    (vgadev->decodes & VGA_RSRC_LEGACY_IO))
+		rsrc |= VGA_RSRC_LEGACY_IO;
+	if ((rsrc & VGA_RSRC_NORMAL_MEM) &&
+	    (vgadev->decodes & VGA_RSRC_LEGACY_MEM))
+		rsrc |= VGA_RSRC_LEGACY_MEM;
+
+	pr_devel("%s: %d\n", __func__, rsrc);
+	pr_devel("%s: owns: %d\n", __func__, vgadev->owns);
+
+	/* Check what resources we need to acquire */
+	wants = rsrc & ~vgadev->owns;
+
+	/* We already own everything, just mark locked & bye bye */
+	if (wants == 0)
+		goto lock_them;
+
+	/* We don't need to request a legacy resource, we just enable
+	 * appropriate decoding and go
+	 */
+	legacy_wants = wants & VGA_RSRC_LEGACY_MASK;
+	if (legacy_wants == 0)
+		goto enable_them;
+
+	/* Ok, we don't, let's find out how we need to kick off */
+	list_for_each_entry(conflict, &vga_list, list) {
+		unsigned int lwants = legacy_wants;
+		unsigned int change_bridge = 0;
+
+		/* Don't conflict with myself */
+		if (vgadev == conflict)
+			continue;
+
+		/* Check if the architecture allows a conflict between those
+		 * 2 devices or if they are on separate domains
+		 */
+		if (!vga_conflicts(vgadev->pdev, conflict->pdev))
+			continue;
+
+		/* We have a possible conflict. before we go further, we must
+		 * check if we sit on the same bus as the conflicting device.
+		 * if we don't, then we must tie both IO and MEM resources
+		 * together since there is only a single bit controlling
+		 * VGA forwarding on P2P bridges
+		 */
+		if (vgadev->pdev->bus != conflict->pdev->bus) {
+			change_bridge = 1;
+			lwants = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
+		}
+
+		/* Check if the guy has a lock on the resource. If he does,
+		 * return the conflicting entry
+		 */
+		if (conflict->locks & lwants)
+			return conflict;
+
+		/* Ok, now check if he owns the resource we want. We don't need
+		 * to check "decodes" since it should be impossible to own
+		 * own legacy resources you don't decode unless I have a bug
+		 * in this code...
+		 */
+		WARN_ON(conflict->owns & ~conflict->decodes);
+		match = lwants & conflict->owns;
+		if (!match)
+			continue;
+
+		/* looks like he doesn't have a lock, we can steal
+		 * them from him
+		 */
+		vga_irq_set_state(conflict, false);
+
+		pci_bits = 0;
+		if (lwants & (VGA_RSRC_LEGACY_MEM|VGA_RSRC_NORMAL_MEM))
+			pci_bits |= PCI_COMMAND_MEMORY;
+		if (lwants & (VGA_RSRC_LEGACY_IO|VGA_RSRC_NORMAL_IO))
+			pci_bits |= PCI_COMMAND_IO;
+
+		pci_set_vga_state(conflict->pdev, false, pci_bits,
+				  change_bridge);
+		conflict->owns &= ~lwants;
+		/* If he also owned non-legacy, that is no longer the case */
+		if (lwants & VGA_RSRC_LEGACY_MEM)
+			conflict->owns &= ~VGA_RSRC_NORMAL_MEM;
+		if (lwants & VGA_RSRC_LEGACY_IO)
+			conflict->owns &= ~VGA_RSRC_NORMAL_IO;
+	}
+
+enable_them:
+	/* ok dude, we got it, everybody conflicting has been disabled, let's
+	 * enable us. Make sure we don't mark a bit in "owns" that we don't
+	 * also have in "decodes". We can lock resources we don't decode but
+	 * not own them.
+	 */
+	pci_bits = 0;
+	if (wants & (VGA_RSRC_LEGACY_MEM|VGA_RSRC_NORMAL_MEM))
+		pci_bits |= PCI_COMMAND_MEMORY;
+	if (wants & (VGA_RSRC_LEGACY_IO|VGA_RSRC_NORMAL_IO))
+		pci_bits |= PCI_COMMAND_IO;
+	pci_set_vga_state(vgadev->pdev, true, pci_bits, !!(wants & VGA_RSRC_LEGACY_MASK));
+
+	vga_irq_set_state(vgadev, true);
+	vgadev->owns |= (wants & vgadev->decodes);
+lock_them:
+	vgadev->locks |= (rsrc & VGA_RSRC_LEGACY_MASK);
+	if (rsrc & VGA_RSRC_LEGACY_IO)
+		vgadev->io_lock_cnt++;
+	if (rsrc & VGA_RSRC_LEGACY_MEM)
+		vgadev->mem_lock_cnt++;
+	if (rsrc & VGA_RSRC_NORMAL_IO)
+		vgadev->io_norm_cnt++;
+	if (rsrc & VGA_RSRC_NORMAL_MEM)
+		vgadev->mem_norm_cnt++;
+
+	return NULL;
+}
+
+static void __vga_put(struct vga_device *vgadev, unsigned int rsrc)
+{
+	unsigned int old_locks = vgadev->locks;
+
+	pr_devel("%s\n", __func__);
+
+	/* Update our counters, and account for equivalent legacy resources
+	 * if we decode them
+	 */
+	if ((rsrc & VGA_RSRC_NORMAL_IO) && vgadev->io_norm_cnt > 0) {
+		vgadev->io_norm_cnt--;
+		if (vgadev->decodes & VGA_RSRC_LEGACY_IO)
+			rsrc |= VGA_RSRC_LEGACY_IO;
+	}
+	if ((rsrc & VGA_RSRC_NORMAL_MEM) && vgadev->mem_norm_cnt > 0) {
+		vgadev->mem_norm_cnt--;
+		if (vgadev->decodes & VGA_RSRC_LEGACY_MEM)
+			rsrc |= VGA_RSRC_LEGACY_MEM;
+	}
+	if ((rsrc & VGA_RSRC_LEGACY_IO) && vgadev->io_lock_cnt > 0)
+		vgadev->io_lock_cnt--;
+	if ((rsrc & VGA_RSRC_LEGACY_MEM) && vgadev->mem_lock_cnt > 0)
+		vgadev->mem_lock_cnt--;
+
+	/* Just clear lock bits, we do lazy operations so we don't really
+	 * have to bother about anything else at this point
+	 */
+	if (vgadev->io_lock_cnt == 0)
+		vgadev->locks &= ~VGA_RSRC_LEGACY_IO;
+	if (vgadev->mem_lock_cnt == 0)
+		vgadev->locks &= ~VGA_RSRC_LEGACY_MEM;
+
+	/* Kick the wait queue in case somebody was waiting if we actually
+	 * released something
+	 */
+	if (old_locks != vgadev->locks)
+		wake_up_all(&vga_wait_queue);
+}
+
+int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible)
+{
+	struct vga_device *vgadev, *conflict;
+	unsigned long flags;
+	wait_queue_t wait;
+	int rc = 0;
+
+	vga_check_first_use();
+	/* The one who calls us should check for this, but lets be sure... */
+	if (pdev == NULL)
+		pdev = vga_default_device();
+	if (pdev == NULL)
+		return 0;
+
+	for (;;) {
+		spin_lock_irqsave(&vga_lock, flags);
+		vgadev = vgadev_find(pdev);
+		if (vgadev == NULL) {
+			spin_unlock_irqrestore(&vga_lock, flags);
+			rc = -ENODEV;
+			break;
+		}
+		conflict = __vga_tryget(vgadev, rsrc);
+		spin_unlock_irqrestore(&vga_lock, flags);
+		if (conflict == NULL)
+			break;
+
+
+		/* We have a conflict, we wait until somebody kicks the
+		 * work queue. Currently we have one work queue that we
+		 * kick each time some resources are released, but it would
+		 * be fairly easy to have a per device one so that we only
+		 * need to attach to the conflicting device
+		 */
+		init_waitqueue_entry(&wait, current);
+		add_wait_queue(&vga_wait_queue, &wait);
+		set_current_state(interruptible ?
+				  TASK_INTERRUPTIBLE :
+				  TASK_UNINTERRUPTIBLE);
+		if (signal_pending(current)) {
+			rc = -EINTR;
+			break;
+		}
+		schedule();
+		remove_wait_queue(&vga_wait_queue, &wait);
+		set_current_state(TASK_RUNNING);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(vga_get);
+
+int vga_tryget(struct pci_dev *pdev, unsigned int rsrc)
+{
+	struct vga_device *vgadev;
+	unsigned long flags;
+	int rc = 0;
+
+	vga_check_first_use();
+
+	/* The one who calls us should check for this, but lets be sure... */
+	if (pdev == NULL)
+		pdev = vga_default_device();
+	if (pdev == NULL)
+		return 0;
+	spin_lock_irqsave(&vga_lock, flags);
+	vgadev = vgadev_find(pdev);
+	if (vgadev == NULL) {
+		rc = -ENODEV;
+		goto bail;
+	}
+	if (__vga_tryget(vgadev, rsrc))
+		rc = -EBUSY;
+bail:
+	spin_unlock_irqrestore(&vga_lock, flags);
+	return rc;
+}
+EXPORT_SYMBOL(vga_tryget);
+
+void vga_put(struct pci_dev *pdev, unsigned int rsrc)
+{
+	struct vga_device *vgadev;
+	unsigned long flags;
+
+	/* The one who calls us should check for this, but lets be sure... */
+	if (pdev == NULL)
+		pdev = vga_default_device();
+	if (pdev == NULL)
+		return;
+	spin_lock_irqsave(&vga_lock, flags);
+	vgadev = vgadev_find(pdev);
+	if (vgadev == NULL)
+		goto bail;
+	__vga_put(vgadev, rsrc);
+bail:
+	spin_unlock_irqrestore(&vga_lock, flags);
+}
+EXPORT_SYMBOL(vga_put);
+
+/*
+ * Currently, we assume that the "initial" setup of the system is
+ * not sane, that is we come up with conflicting devices and let
+ * the arbiter's client decides if devices decodes or not legacy
+ * things.
+ */
+static bool vga_arbiter_add_pci_device(struct pci_dev *pdev)
+{
+	struct vga_device *vgadev;
+	unsigned long flags;
+	struct pci_bus *bus;
+	struct pci_dev *bridge;
+	u16 cmd;
+
+	/* Only deal with VGA class devices */
+	if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
+		return false;
+
+	/* Allocate structure */
+	vgadev = kmalloc(sizeof(struct vga_device), GFP_KERNEL);
+	if (vgadev == NULL) {
+		pr_err("vgaarb: failed to allocate pci device\n");
+		/* What to do on allocation failure ? For now, let's
+		 * just do nothing, I'm not sure there is anything saner
+		 * to be done
+		 */
+		return false;
+	}
+
+	memset(vgadev, 0, sizeof(*vgadev));
+
+	/* Take lock & check for duplicates */
+	spin_lock_irqsave(&vga_lock, flags);
+	if (vgadev_find(pdev) != NULL) {
+		BUG_ON(1);
+		goto fail;
+	}
+	vgadev->pdev = pdev;
+
+	/* By default, assume we decode everything */
+	vgadev->decodes = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
+			  VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
+
+	/* by default mark it as decoding */
+	vga_decode_count++;
+	/* Mark that we "own" resources based on our enables, we will
+	 * clear that below if the bridge isn't forwarding
+	 */
+	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
+	if (cmd & PCI_COMMAND_IO)
+		vgadev->owns |= VGA_RSRC_LEGACY_IO;
+	if (cmd & PCI_COMMAND_MEMORY)
+		vgadev->owns |= VGA_RSRC_LEGACY_MEM;
+
+	/* Check if VGA cycles can get down to us */
+	bus = pdev->bus;
+	while (bus) {
+		bridge = bus->self;
+		if (bridge) {
+			u16 l;
+			pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
+					     &l);
+			if (!(l & PCI_BRIDGE_CTL_VGA)) {
+				vgadev->owns = 0;
+				break;
+			}
+		}
+		bus = bus->parent;
+	}
+
+	/* Deal with VGA default device. Use first enabled one
+	 * by default if arch doesn't have it's own hook
+	 */
+#ifndef __ARCH_HAS_VGA_DEFAULT_DEVICE
+	if (vga_default == NULL &&
+	    ((vgadev->owns & VGA_RSRC_LEGACY_MASK) == VGA_RSRC_LEGACY_MASK))
+		vga_default = pci_dev_get(pdev);
+#endif
+
+	/* Add to the list */
+	list_add(&vgadev->list, &vga_list);
+	vga_count++;
+	pr_info("vgaarb: device added: PCI:%s,decodes=%s,owns=%s,locks=%s\n",
+		pci_name(pdev),
+		vga_iostate_to_str(vgadev->decodes),
+		vga_iostate_to_str(vgadev->owns),
+		vga_iostate_to_str(vgadev->locks));
+
+	spin_unlock_irqrestore(&vga_lock, flags);
+	return true;
+fail:
+	spin_unlock_irqrestore(&vga_lock, flags);
+	kfree(vgadev);
+	return false;
+}
+
+static bool vga_arbiter_del_pci_device(struct pci_dev *pdev)
+{
+	struct vga_device *vgadev;
+	unsigned long flags;
+	bool ret = true;
+
+	spin_lock_irqsave(&vga_lock, flags);
+	vgadev = vgadev_find(pdev);
+	if (vgadev == NULL) {
+		ret = false;
+		goto bail;
+	}
+
+	if (vga_default == pdev) {
+		pci_dev_put(vga_default);
+		vga_default = NULL;
+	}
+
+	if (vgadev->decodes & (VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM))
+		vga_decode_count--;
+
+	/* Remove entry from list */
+	list_del(&vgadev->list);
+	vga_count--;
+	/* Notify userland driver that the device is gone so it discards
+	 * it's copies of the pci_dev pointer
+	 */
+	vga_arb_device_card_gone(pdev);
+
+	/* Wake up all possible waiters */
+	wake_up_all(&vga_wait_queue);
+bail:
+	spin_unlock_irqrestore(&vga_lock, flags);
+	kfree(vgadev);
+	return ret;
+}
+
+/* this is called with the lock */
+static inline void vga_update_device_decodes(struct vga_device *vgadev,
+					     int new_decodes)
+{
+	int old_decodes;
+	struct vga_device *new_vgadev, *conflict;
+
+	old_decodes = vgadev->decodes;
+	vgadev->decodes = new_decodes;
+
+	pr_info("vgaarb: device changed decodes: PCI:%s,olddecodes=%s,decodes=%s:owns=%s\n",
+		pci_name(vgadev->pdev),
+		vga_iostate_to_str(old_decodes),
+		vga_iostate_to_str(vgadev->decodes),
+		vga_iostate_to_str(vgadev->owns));
+
+
+	/* if we own the decodes we should move them along to
+	   another card */
+	if ((vgadev->owns & old_decodes) && (vga_count > 1)) {
+		/* set us to own nothing */
+		vgadev->owns &= ~old_decodes;
+		list_for_each_entry(new_vgadev, &vga_list, list) {
+			if ((new_vgadev != vgadev) &&
+			    (new_vgadev->decodes & VGA_RSRC_LEGACY_MASK)) {
+				pr_info("vgaarb: transferring owner from PCI:%s to PCI:%s\n", pci_name(vgadev->pdev), pci_name(new_vgadev->pdev));
+				conflict = __vga_tryget(new_vgadev, VGA_RSRC_LEGACY_MASK);
+				if (!conflict)
+					__vga_put(new_vgadev, VGA_RSRC_LEGACY_MASK);
+				break;
+			}
+		}
+	}
+
+	/* change decodes counter */
+	if (old_decodes != new_decodes) {
+		if (new_decodes & (VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM))
+			vga_decode_count++;
+		else
+			vga_decode_count--;
+	}
+}
+
+void __vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes, bool userspace)
+{
+	struct vga_device *vgadev;
+	unsigned long flags;
+
+	decodes &= VGA_RSRC_LEGACY_MASK;
+
+	spin_lock_irqsave(&vga_lock, flags);
+	vgadev = vgadev_find(pdev);
+	if (vgadev == NULL)
+		goto bail;
+
+	/* don't let userspace futz with kernel driver decodes */
+	if (userspace && vgadev->set_vga_decode)
+		goto bail;
+
+	/* update the device decodes + counter */
+	vga_update_device_decodes(vgadev, decodes);
+
+	/* XXX if somebody is going from "doesn't decode" to "decodes" state
+	 * here, additional care must be taken as we may have pending owner
+	 * ship of non-legacy region ...
+	 */
+bail:
+	spin_unlock_irqrestore(&vga_lock, flags);
+}
+
+void vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes)
+{
+	__vga_set_legacy_decoding(pdev, decodes, false);
+}
+EXPORT_SYMBOL(vga_set_legacy_decoding);
+
+/* return number of active VGA devices */
+/* call with NULL to unregister */
+int vga_client_register(struct pci_dev *pdev, void *cookie,
+			void (*irq_set_state)(void *cookie, bool state),
+			unsigned int (*set_vga_decode)(void *cookie, bool decode))
+{
+	int ret = -1;
+	struct vga_device *vgadev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vga_lock, flags);
+	vgadev = vgadev_find(pdev);
+	if (!vgadev)
+		goto bail;
+
+	vgadev->irq_set_state = irq_set_state;
+	vgadev->set_vga_decode = set_vga_decode;
+	vgadev->cookie = cookie;
+	ret = 0;
+
+bail:
+	spin_unlock_irqrestore(&vga_lock, flags);
+	return ret;
+
+}
+EXPORT_SYMBOL(vga_client_register);
+
+/*
+ * Char driver implementation
+ *
+ * Semantics is:
+ *
+ *  open       : open user instance of the arbitrer. by default, it's
+ *                attached to the default VGA device of the system.
+ *
+ *  close      : close user instance, release locks
+ *
+ *  read       : return a string indicating the status of the target.
+ *                an IO state string is of the form {io,mem,io+mem,none},
+ *                mc and ic are respectively mem and io lock counts (for
+ *                debugging/diagnostic only). "decodes" indicate what the
+ *                card currently decodes, "owns" indicates what is currently
+ *                enabled on it, and "locks" indicates what is locked by this
+ *                card. If the card is unplugged, we get "invalid" then for
+ *                card_ID and an -ENODEV error is returned for any command
+ *                until a new card is targeted
+ *
+ *   "<card_ID>,decodes=<io_state>,owns=<io_state>,locks=<io_state> (ic,mc)"
+ *
+ * write       : write a command to the arbiter. List of commands is:
+ *
+ *   target <card_ID>   : switch target to card <card_ID> (see below)
+ *   lock <io_state>    : acquires locks on target ("none" is invalid io_state)
+ *   trylock <io_state> : non-blocking acquire locks on target
+ *   unlock <io_state>  : release locks on target
+ *   unlock all         : release all locks on target held by this user
+ *   decodes <io_state> : set the legacy decoding attributes for the card
+ *
+ * poll         : event if something change on any card (not just the target)
+ *
+ * card_ID is of the form "PCI:domain:bus:dev.fn". It can be set to "default"
+ * to go back to the system default card (TODO: not implemented yet).
+ * Currently, only PCI is supported as a prefix, but the userland API may
+ * support other bus types in the future, even if the current kernel
+ * implementation doesn't.
+ *
+ * Note about locks:
+ *
+ * The driver keeps track of which user has what locks on which card. It
+ * supports stacking, like the kernel one. This complexifies the implementation
+ * a bit, but makes the arbiter more tolerant to userspace problems and able
+ * to properly cleanup in all cases when a process dies.
+ * Currently, a max of 16 cards simultaneously can have locks issued from
+ * userspace for a given user (file descriptor instance) of the arbiter.
+ *
+ * If the device is hot-unplugged, there is a hook inside the module to notify
+ * they being added/removed in the system and automatically added/removed in
+ * the arbiter.
+ */
+
+#define MAX_USER_CARDS         16
+#define PCI_INVALID_CARD       ((struct pci_dev *)-1UL)
+
+/*
+ * Each user has an array of these, tracking which cards have locks
+ */
+struct vga_arb_user_card {
+	struct pci_dev *pdev;
+	unsigned int mem_cnt;
+	unsigned int io_cnt;
+};
+
+struct vga_arb_private {
+	struct list_head list;
+	struct pci_dev *target;
+	struct vga_arb_user_card cards[MAX_USER_CARDS];
+	spinlock_t lock;
+};
+
+static LIST_HEAD(vga_user_list);
+static DEFINE_SPINLOCK(vga_user_lock);
+
+
+/*
+ * This function gets a string in the format: "PCI:domain:bus:dev.fn" and
+ * returns the respective values. If the string is not in this format,
+ * it returns 0.
+ */
+static int vga_pci_str_to_vars(char *buf, int count, unsigned int *domain,
+			       unsigned int *bus, unsigned int *devfn)
+{
+	int n;
+	unsigned int slot, func;
+
+
+	n = sscanf(buf, "PCI:%x:%x:%x.%x", domain, bus, &slot, &func);
+	if (n != 4)
+		return 0;
+
+	*devfn = PCI_DEVFN(slot, func);
+
+	return 1;
+}
+
+static ssize_t vga_arb_read(struct file *file, char __user * buf,
+			    size_t count, loff_t *ppos)
+{
+	struct vga_arb_private *priv = file->private_data;
+	struct vga_device *vgadev;
+	struct pci_dev *pdev;
+	unsigned long flags;
+	size_t len;
+	int rc;
+	char *lbuf;
+
+	lbuf = kmalloc(1024, GFP_KERNEL);
+	if (lbuf == NULL)
+		return -ENOMEM;
+
+	/* Shields against vga_arb_device_card_gone (pci_dev going
+	 * away), and allows access to vga list
+	 */
+	spin_lock_irqsave(&vga_lock, flags);
+
+	/* If we are targetting the default, use it */
+	pdev = priv->target;
+	if (pdev == NULL || pdev == PCI_INVALID_CARD) {
+		spin_unlock_irqrestore(&vga_lock, flags);
+		len = sprintf(lbuf, "invalid");
+		goto done;
+	}
+
+	/* Find card vgadev structure */
+	vgadev = vgadev_find(pdev);
+	if (vgadev == NULL) {
+		/* Wow, it's not in the list, that shouldn't happen,
+		 * let's fix us up and return invalid card
+		 */
+		if (pdev == priv->target)
+			vga_arb_device_card_gone(pdev);
+		spin_unlock_irqrestore(&vga_lock, flags);
+		len = sprintf(lbuf, "invalid");
+		goto done;
+	}
+
+	/* Fill the buffer with infos */
+	len = snprintf(lbuf, 1024,
+		       "count:%d,PCI:%s,decodes=%s,owns=%s,locks=%s(%d:%d)\n",
+		       vga_decode_count, pci_name(pdev),
+		       vga_iostate_to_str(vgadev->decodes),
+		       vga_iostate_to_str(vgadev->owns),
+		       vga_iostate_to_str(vgadev->locks),
+		       vgadev->io_lock_cnt, vgadev->mem_lock_cnt);
+
+	spin_unlock_irqrestore(&vga_lock, flags);
+done:
+
+	/* Copy that to user */
+	if (len > count)
+		len = count;
+	rc = copy_to_user(buf, lbuf, len);
+	kfree(lbuf);
+	if (rc)
+		return -EFAULT;
+	return len;
+}
+
+/*
+ * TODO: To avoid parsing inside kernel and to improve the speed we may
+ * consider use ioctl here
+ */
+static ssize_t vga_arb_write(struct file *file, const char __user * buf,
+			     size_t count, loff_t *ppos)
+{
+	struct vga_arb_private *priv = file->private_data;
+	struct vga_arb_user_card *uc = NULL;
+	struct pci_dev *pdev;
+
+	unsigned int io_state;
+
+	char *kbuf, *curr_pos;
+	size_t remaining = count;
+
+	int ret_val;
+	int i;
+
+
+	kbuf = kmalloc(count + 1, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, buf, count)) {
+		kfree(kbuf);
+		return -EFAULT;
+	}
+	curr_pos = kbuf;
+	kbuf[count] = '\0';	/* Just to make sure... */
+
+	if (strncmp(curr_pos, "lock ", 5) == 0) {
+		curr_pos += 5;
+		remaining -= 5;
+
+		pr_devel("client 0x%X called 'lock'\n", (int)priv);
+
+		if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) {
+			ret_val = -EPROTO;
+			goto done;
+		}
+		if (io_state == VGA_RSRC_NONE) {
+			ret_val = -EPROTO;
+			goto done;
+		}
+
+		pdev = priv->target;
+		if (priv->target == NULL) {
+			ret_val = -ENODEV;
+			goto done;
+		}
+
+		vga_get_uninterruptible(pdev, io_state);
+
+		/* Update the client's locks lists... */
+		for (i = 0; i < MAX_USER_CARDS; i++) {
+			if (priv->cards[i].pdev == pdev) {
+				if (io_state & VGA_RSRC_LEGACY_IO)
+					priv->cards[i].io_cnt++;
+				if (io_state & VGA_RSRC_LEGACY_MEM)
+					priv->cards[i].mem_cnt++;
+				break;
+			}
+		}
+
+		ret_val = count;
+		goto done;
+	} else if (strncmp(curr_pos, "unlock ", 7) == 0) {
+		curr_pos += 7;
+		remaining -= 7;
+
+		pr_devel("client 0x%X called 'unlock'\n", (int)priv);
+
+		if (strncmp(curr_pos, "all", 3) == 0)
+			io_state = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
+		else {
+			if (!vga_str_to_iostate
+			    (curr_pos, remaining, &io_state)) {
+				ret_val = -EPROTO;
+				goto done;
+			}
+			/* TODO: Add this?
+			   if (io_state == VGA_RSRC_NONE) {
+			   ret_val = -EPROTO;
+			   goto done;
+			   }
+			  */
+		}
+
+		pdev = priv->target;
+		if (priv->target == NULL) {
+			ret_val = -ENODEV;
+			goto done;
+		}
+		for (i = 0; i < MAX_USER_CARDS; i++) {
+			if (priv->cards[i].pdev == pdev)
+				uc = &priv->cards[i];
+		}
+
+		if (!uc)
+			return -EINVAL;
+
+		if (io_state & VGA_RSRC_LEGACY_IO && uc->io_cnt == 0)
+			return -EINVAL;
+
+		if (io_state & VGA_RSRC_LEGACY_MEM && uc->mem_cnt == 0)
+			return -EINVAL;
+
+		vga_put(pdev, io_state);
+
+		if (io_state & VGA_RSRC_LEGACY_IO)
+			uc->io_cnt--;
+		if (io_state & VGA_RSRC_LEGACY_MEM)
+			uc->mem_cnt--;
+
+		ret_val = count;
+		goto done;
+	} else if (strncmp(curr_pos, "trylock ", 8) == 0) {
+		curr_pos += 8;
+		remaining -= 8;
+
+		pr_devel("client 0x%X called 'trylock'\n", (int)priv);
+
+		if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) {
+			ret_val = -EPROTO;
+			goto done;
+		}
+		/* TODO: Add this?
+		   if (io_state == VGA_RSRC_NONE) {
+		   ret_val = -EPROTO;
+		   goto done;
+		   }
+		 */
+
+		pdev = priv->target;
+		if (priv->target == NULL) {
+			ret_val = -ENODEV;
+			goto done;
+		}
+
+		if (vga_tryget(pdev, io_state)) {
+			/* Update the client's locks lists... */
+			for (i = 0; i < MAX_USER_CARDS; i++) {
+				if (priv->cards[i].pdev == pdev) {
+					if (io_state & VGA_RSRC_LEGACY_IO)
+						priv->cards[i].io_cnt++;
+					if (io_state & VGA_RSRC_LEGACY_MEM)
+						priv->cards[i].mem_cnt++;
+					break;
+				}
+			}
+			ret_val = count;
+			goto done;
+		} else {
+			ret_val = -EBUSY;
+			goto done;
+		}
+
+	} else if (strncmp(curr_pos, "target ", 7) == 0) {
+		unsigned int domain, bus, devfn;
+		struct vga_device *vgadev;
+
+		curr_pos += 7;
+		remaining -= 7;
+		pr_devel("client 0x%X called 'target'\n", (int)priv);
+		/* if target is default */
+		if (!strncmp(buf, "default", 7))
+			pdev = pci_dev_get(vga_default_device());
+		else {
+			if (!vga_pci_str_to_vars(curr_pos, remaining,
+						 &domain, &bus, &devfn)) {
+				ret_val = -EPROTO;
+				goto done;
+			}
+
+			pdev = pci_get_bus_and_slot(bus, devfn);
+			if (!pdev) {
+				pr_info("vgaarb: invalid PCI address!\n");
+				ret_val = -ENODEV;
+				goto done;
+			}
+		}
+
+		vgadev = vgadev_find(pdev);
+		if (vgadev == NULL) {
+			pr_info("vgaarb: this pci device is not a vga device\n");
+			pci_dev_put(pdev);
+			ret_val = -ENODEV;
+			goto done;
+		}
+
+		priv->target = pdev;
+		for (i = 0; i < MAX_USER_CARDS; i++) {
+			if (priv->cards[i].pdev == pdev)
+				break;
+			if (priv->cards[i].pdev == NULL) {
+				priv->cards[i].pdev = pdev;
+				priv->cards[i].io_cnt = 0;
+				priv->cards[i].mem_cnt = 0;
+				break;
+			}
+		}
+		if (i == MAX_USER_CARDS) {
+			pr_err("vgaarb: maximum user cards number reached!\n");
+			pci_dev_put(pdev);
+			/* XXX: which value to return? */
+			ret_val =  -ENOMEM;
+			goto done;
+		}
+
+		ret_val = count;
+		pci_dev_put(pdev);
+		goto done;
+
+
+	} else if (strncmp(curr_pos, "decodes ", 8) == 0) {
+		curr_pos += 8;
+		remaining -= 8;
+		pr_devel("vgaarb: client 0x%X called 'decodes'\n", (int)priv);
+
+		if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) {
+			ret_val = -EPROTO;
+			goto done;
+		}
+		pdev = priv->target;
+		if (priv->target == NULL) {
+			ret_val = -ENODEV;
+			goto done;
+		}
+
+		__vga_set_legacy_decoding(pdev, io_state, true);
+		ret_val = count;
+		goto done;
+	}
+	/* If we got here, the message written is not part of the protocol! */
+	kfree(kbuf);
+	return -EPROTO;
+
+done:
+	kfree(kbuf);
+	return ret_val;
+}
+
+static unsigned int vga_arb_fpoll(struct file *file, poll_table * wait)
+{
+	struct vga_arb_private *priv = file->private_data;
+
+	pr_devel("%s\n", __func__);
+
+	if (priv == NULL)
+		return -ENODEV;
+	poll_wait(file, &vga_wait_queue, wait);
+	return POLLIN;
+}
+
+static int vga_arb_open(struct inode *inode, struct file *file)
+{
+	struct vga_arb_private *priv;
+	unsigned long flags;
+
+	pr_devel("%s\n", __func__);
+
+	priv = kmalloc(sizeof(struct vga_arb_private), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+	memset(priv, 0, sizeof(*priv));
+	spin_lock_init(&priv->lock);
+	file->private_data = priv;
+
+	spin_lock_irqsave(&vga_user_lock, flags);
+	list_add(&priv->list, &vga_user_list);
+	spin_unlock_irqrestore(&vga_user_lock, flags);
+
+	/* Set the client' lists of locks */
+	priv->target = vga_default_device(); /* Maybe this is still null! */
+	priv->cards[0].pdev = priv->target;
+	priv->cards[0].io_cnt = 0;
+	priv->cards[0].mem_cnt = 0;
+
+
+	return 0;
+}
+
+static int vga_arb_release(struct inode *inode, struct file *file)
+{
+	struct vga_arb_private *priv = file->private_data;
+	struct vga_arb_user_card *uc;
+	unsigned long flags;
+	int i;
+
+	pr_devel("%s\n", __func__);
+
+	if (priv == NULL)
+		return -ENODEV;
+
+	spin_lock_irqsave(&vga_user_lock, flags);
+	list_del(&priv->list);
+	for (i = 0; i < MAX_USER_CARDS; i++) {
+		uc = &priv->cards[i];
+		if (uc->pdev == NULL)
+			continue;
+		pr_devel("uc->io_cnt == %d, uc->mem_cnt == %d\n",
+			 uc->io_cnt, uc->mem_cnt);
+		while (uc->io_cnt--)
+			vga_put(uc->pdev, VGA_RSRC_LEGACY_IO);
+		while (uc->mem_cnt--)
+			vga_put(uc->pdev, VGA_RSRC_LEGACY_MEM);
+	}
+	spin_unlock_irqrestore(&vga_user_lock, flags);
+
+	kfree(priv);
+
+	return 0;
+}
+
+static void vga_arb_device_card_gone(struct pci_dev *pdev)
+{
+}
+
+/*
+ * callback any registered clients to let them know we have a
+ * change in VGA cards
+ */
+static void vga_arbiter_notify_clients(void)
+{
+	struct vga_device *vgadev;
+	unsigned long flags;
+	uint32_t new_decodes;
+	bool new_state;
+
+	if (!vga_arbiter_used)
+		return;
+
+	spin_lock_irqsave(&vga_lock, flags);
+	list_for_each_entry(vgadev, &vga_list, list) {
+		if (vga_count > 1)
+			new_state = false;
+		else
+			new_state = true;
+		if (vgadev->set_vga_decode) {
+			new_decodes = vgadev->set_vga_decode(vgadev->cookie, new_state);
+			vga_update_device_decodes(vgadev, new_decodes);
+		}
+	}
+	spin_unlock_irqrestore(&vga_lock, flags);
+}
+
+static int pci_notify(struct notifier_block *nb, unsigned long action,
+		      void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	bool notify = false;
+
+	pr_devel("%s\n", __func__);
+
+	/* For now we're only intereted in devices added and removed. I didn't
+	 * test this thing here, so someone needs to double check for the
+	 * cases of hotplugable vga cards. */
+	if (action == BUS_NOTIFY_ADD_DEVICE)
+		notify = vga_arbiter_add_pci_device(pdev);
+	else if (action == BUS_NOTIFY_DEL_DEVICE)
+		notify = vga_arbiter_del_pci_device(pdev);
+
+	if (notify)
+		vga_arbiter_notify_clients();
+	return 0;
+}
+
+static struct notifier_block pci_notifier = {
+	.notifier_call = pci_notify,
+};
+
+static const struct file_operations vga_arb_device_fops = {
+	.read = vga_arb_read,
+	.write = vga_arb_write,
+	.poll = vga_arb_fpoll,
+	.open = vga_arb_open,
+	.release = vga_arb_release,
+};
+
+static struct miscdevice vga_arb_device = {
+	MISC_DYNAMIC_MINOR, "vga_arbiter", &vga_arb_device_fops
+};
+
+static int __init vga_arb_device_init(void)
+{
+	int rc;
+	struct pci_dev *pdev;
+
+	rc = misc_register(&vga_arb_device);
+	if (rc < 0)
+		pr_err("vgaarb: error %d registering device\n", rc);
+
+	bus_register_notifier(&pci_bus_type, &pci_notifier);
+
+	/* We add all pci devices satisfying vga class in the arbiter by
+	 * default */
+	pdev = NULL;
+	while ((pdev =
+		pci_get_subsys(PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
+			       PCI_ANY_ID, pdev)) != NULL)
+		vga_arbiter_add_pci_device(pdev);
+
+	pr_info("vgaarb: loaded\n");
+	return rc;
+}
+subsys_initcall(vga_arb_device_init);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d55039ffa05..bd993351db45 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2520,6 +2520,50 @@ int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type)
 	return 0;
 }
 
+/**
+ * pci_set_vga_state - set VGA decode state on device and parents if requested
+ * @dev the PCI device
+ * @decode - true = enable decoding, false = disable decoding
+ * @command_bits PCI_COMMAND_IO and/or PCI_COMMAND_MEMORY
+ * @change_bridge - traverse ancestors and change bridges
+ */
+int pci_set_vga_state(struct pci_dev *dev, bool decode,
+		      unsigned int command_bits, bool change_bridge)
+{
+	struct pci_bus *bus;
+	struct pci_dev *bridge;
+	u16 cmd;
+
+	WARN_ON(command_bits & ~(PCI_COMMAND_IO|PCI_COMMAND_MEMORY));
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	if (decode == true)
+		cmd |= command_bits;
+	else
+		cmd &= ~command_bits;
+	pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+	if (change_bridge == false)
+		return 0;
+
+	bus = dev->bus;
+	while (bus) {
+		bridge = bus->self;
+		if (bridge) {
+			pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
+					     &cmd);
+			if (decode == true)
+				cmd |= PCI_BRIDGE_CTL_VGA;
+			else
+				cmd &= ~PCI_BRIDGE_CTL_VGA;
+			pci_write_config_word(bridge, PCI_BRIDGE_CONTROL,
+					      cmd);
+		}
+		bus = bus->parent;
+	}
+	return 0;
+}
+
 #define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE
 static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0};
 spinlock_t resource_alignment_lock = SPIN_LOCK_UNLOCKED;
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 3b54b3940178..a0d9ee1df349 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -7,6 +7,8 @@ menu "Graphics support"
 
 source "drivers/char/agp/Kconfig"
 
+source "drivers/gpu/vga/Kconfig"
+
 source "drivers/gpu/drm/Kconfig"
 
 config VGASTATE
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a90f94020798..6dbb1fd30e5a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -806,6 +806,8 @@ int pci_cfg_space_size_ext(struct pci_dev *dev);
 int pci_cfg_space_size(struct pci_dev *dev);
 unsigned char pci_bus_max_busnr(struct pci_bus *bus);
 
+int pci_set_vga_state(struct pci_dev *pdev, bool decode,
+		      unsigned int command_bits, bool change_bridge);
 /* kmem_cache style wrapper around pci_alloc_consistent() */
 
 #include <linux/dmapool.h>
diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
new file mode 100644
index 000000000000..68229ce80fde
--- /dev/null
+++ b/include/linux/vgaarb.h
@@ -0,0 +1,195 @@
+/*
+ * vgaarb.c
+ *
+ * (C) Copyright 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ * (C) Copyright 2007 Paulo R. Zanoni <przanoni@gmail.com>
+ * (C) Copyright 2007, 2009 Tiago Vignatti <vignatti@freedesktop.org>
+ */
+
+#ifndef LINUX_VGA_H
+
+#include <asm/vga.h>
+
+/* Legacy VGA regions */
+#define VGA_RSRC_NONE	       0x00
+#define VGA_RSRC_LEGACY_IO     0x01
+#define VGA_RSRC_LEGACY_MEM    0x02
+#define VGA_RSRC_LEGACY_MASK   (VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM)
+/* Non-legacy access */
+#define VGA_RSRC_NORMAL_IO     0x04
+#define VGA_RSRC_NORMAL_MEM    0x08
+
+/* Passing that instead of a pci_dev to use the system "default"
+ * device, that is the one used by vgacon. Archs will probably
+ * have to provide their own vga_default_device();
+ */
+#define VGA_DEFAULT_DEVICE     (NULL)
+
+/* For use by clients */
+
+/**
+ *     vga_set_legacy_decoding
+ *
+ *     @pdev: pci device of the VGA card
+ *     @decodes: bit mask of what legacy regions the card decodes
+ *
+ *     Indicates to the arbiter if the card decodes legacy VGA IOs,
+ *     legacy VGA Memory, both, or none. All cards default to both,
+ *     the card driver (fbdev for example) should tell the arbiter
+ *     if it has disabled legacy decoding, so the card can be left
+ *     out of the arbitration process (and can be safe to take
+ *     interrupts at any time.
+ */
+extern void vga_set_legacy_decoding(struct pci_dev *pdev,
+									unsigned int decodes);
+
+/**
+ *     vga_get         - acquire & locks VGA resources
+ *
+ *     pdev: pci device of the VGA card or NULL for the system default
+ *     rsrc: bit mask of resources to acquire and lock
+ *     interruptible: blocking should be interruptible by signals ?
+ *
+ *     This function acquires VGA resources for the given
+ *     card and mark those resources locked. If the resource requested
+ *     are "normal" (and not legacy) resources, the arbiter will first check
+ *     wether the card is doing legacy decoding for that type of resource. If
+ *     yes, the lock is "converted" into a legacy resource lock.
+ *     The arbiter will first look for all VGA cards that might conflict
+ *     and disable their IOs and/or Memory access, inlcuding VGA forwarding
+ *     on P2P bridges if necessary, so that the requested resources can
+ *     be used. Then, the card is marked as locking these resources and
+ *     the IO and/or Memory accesse are enabled on the card (including
+ *     VGA forwarding on parent P2P bridges if any).
+ *     This function will block if some conflicting card is already locking
+ *     one of the required resources (or any resource on a different bus
+ *     segment, since P2P bridges don't differenciate VGA memory and IO
+ *     afaik). You can indicate wether this blocking should be interruptible
+ *     by a signal (for userland interface) or not.
+ *     Must not be called at interrupt time or in atomic context.
+ *     If the card already owns the resources, the function succeeds.
+ *     Nested calls are supported (a per-resource counter is maintained)
+ */
+
+extern int vga_get(struct pci_dev *pdev, unsigned int rsrc,
+											int interruptible);
+
+/**
+ *     vga_get_interruptible
+ *
+ *     Shortcut to vga_get
+ */
+
+static inline int vga_get_interruptible(struct pci_dev *pdev,
+										unsigned int rsrc)
+{
+       return vga_get(pdev, rsrc, 1);
+}
+
+/**
+ *     vga_get_interruptible
+ *
+ *     Shortcut to vga_get
+ */
+
+static inline int vga_get_uninterruptible(struct pci_dev *pdev,
+											unsigned int rsrc)
+{
+       return vga_get(pdev, rsrc, 0);
+}
+
+/**
+ *     vga_tryget      - try to acquire & lock legacy VGA resources
+ *
+ *     @pdev: pci devivce of VGA card or NULL for system default
+ *     @rsrc: bit mask of resources to acquire and lock
+ *
+ *     This function performs the same operation as vga_get(), but
+ *     will return an error (-EBUSY) instead of blocking if the resources
+ *     are already locked by another card. It can be called in any context
+ */
+
+extern int vga_tryget(struct pci_dev *pdev, unsigned int rsrc);
+
+/**
+ *     vga_put         - release lock on legacy VGA resources
+ *
+ *     @pdev: pci device of VGA card or NULL for system default
+ *     @rsrc: but mask of resource to release
+ *
+ *     This function releases resources previously locked by vga_get()
+ *     or vga_tryget(). The resources aren't disabled right away, so
+ *     that a subsequence vga_get() on the same card will succeed
+ *     immediately. Resources have a counter, so locks are only
+ *     released if the counter reaches 0.
+ */
+
+extern void vga_put(struct pci_dev *pdev, unsigned int rsrc);
+
+
+/**
+ *     vga_default_device
+ *
+ *     This can be defined by the platform. The default implementation
+ *     is rather dumb and will probably only work properly on single
+ *     vga card setups and/or x86 platforms.
+ *
+ *     If your VGA default device is not PCI, you'll have to return
+ *     NULL here. In this case, I assume it will not conflict with
+ *     any PCI card. If this is not true, I'll have to define two archs
+ *     hooks for enabling/disabling the VGA default device if that is
+ *     possible. This may be a problem with real _ISA_ VGA cards, in
+ *     addition to a PCI one. I don't know at this point how to deal
+ *     with that card. Can theirs IOs be disabled at all ? If not, then
+ *     I suppose it's a matter of having the proper arch hook telling
+ *     us about it, so we basically never allow anybody to succeed a
+ *     vga_get()...
+ */
+
+#ifndef __ARCH_HAS_VGA_DEFAULT_DEVICE
+extern struct pci_dev *vga_default_device(void);
+#endif
+
+/**
+ *     vga_conflicts
+ *
+ *     Architectures should define this if they have several
+ *     independant PCI domains that can afford concurrent VGA
+ *     decoding
+ */
+
+#ifndef __ARCH_HAS_VGA_CONFLICT
+static inline int vga_conflicts(struct pci_dev *p1, struct pci_dev *p2)
+{
+       return 1;
+}
+#endif
+
+/*
+ * Register a client with the VGA arbitration logic
+ * return value: number of VGA devices in system.
+ *
+ * Clients have two callback mechanisms they can use.
+ * irq enable/disable callback -
+ *    If a client can't disable its GPUs VGA resources, then we
+ *    need to be able to ask it to turn off its irqs when we
+ *    turn off its mem and io decoding.
+ * set_vga_decode
+ *    If a client can disable its GPU VGA resource, it will
+ *    get a callback from this to set the encode/decode state
+ *
+ * Clients with disable abilities should check the return value
+ * of this function and if the VGA device count is > 1, should
+ * disable VGA decoding resources.
+ *
+ * Rationale: we cannot disable VGA decode resources unconditionally
+ * some single GPU laptops seem to require ACPI or BIOS access to the
+ * VGA registers to control things like backlights etc.
+ * Hopefully newer multi-GPU laptops do something saner, and desktops
+ * won't have any special ACPI for this.
+ */
+int vga_client_register(struct pci_dev *pdev, void *cookie,
+			void (*irq_set_state)(void *cookie, bool state),
+			unsigned int (*set_vga_decode)(void *cookie, bool state));
+
+#endif /* LINUX_VGA_H */
-- 
cgit v1.2.3-71-gd317


From 260d703adc5f275e3ba7ddff6e2e0217bc613b35 Mon Sep 17 00:00:00 2001
From: Mike Mason <mmlnx@us.ibm.com>
Date: Thu, 30 Jul 2009 15:33:21 -0700
Subject: PCI: support for PCI Express fundamental reset

This is the first of three patches that implement a bit field that PCI
Express device drivers can use to indicate they need a fundamental reset
during error recovery.

By default, the EEH framework on powerpc does what's known as a "hot
reset" during recovery of a PCI Express device.  We've found a case
where the device needs a "fundamental reset" to recover properly.  The
current PCI error recovery and EEH frameworks do not support this
distinction.

The attached patch (courtesy of Richard Lary) adds a bit field to
pci_dev that indicates whether the device requires a fundamental reset
during recovery.

These patches supersede the previously submitted patch that implemented
a fundamental reset bit field.

Signed-off-by: Mike Mason <mmlnx@us.ibm.com>
Signed-off-by: Richard Lary <rlary@us.ibm.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 6dbb1fd30e5a..da90217a7b0e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -273,6 +273,7 @@ struct pci_dev {
 	unsigned int	ari_enabled:1;	/* ARI forwarding */
 	unsigned int	is_managed:1;
 	unsigned int	is_pcie:1;
+	unsigned int    needs_freset:1; /* Dev requires fundamental reset */
 	unsigned int	state_saved:1;
 	unsigned int	is_physfn:1;
 	unsigned int	is_virtfn:1;
-- 
cgit v1.2.3-71-gd317


From 6ac3bd527007eeecb148b67ca47b21731fd8a503 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@linux.ie>
Date: Wed, 19 Aug 2009 15:21:57 +1000
Subject: PCI/vgaarb: cleanup some warnings + cleanup some comments.

Fix some warnings reported in linux-next + also cleanup some
comment errors noticed by Pekka Paalanen.

Signed-off-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/gpu/vga/vgaarb.c | 11 +++++------
 include/linux/vgaarb.h   | 49 ++++++++++++++++++++++++++----------------------
 2 files changed, 32 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index 199138f241e0..1ac0c93603c9 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -609,7 +609,6 @@ void vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes)
 }
 EXPORT_SYMBOL(vga_set_legacy_decoding);
 
-/* return number of active VGA devices */
 /* call with NULL to unregister */
 int vga_client_register(struct pci_dev *pdev, void *cookie,
 			void (*irq_set_state)(void *cookie, bool state),
@@ -831,7 +830,7 @@ static ssize_t vga_arb_write(struct file *file, const char __user * buf,
 		curr_pos += 5;
 		remaining -= 5;
 
-		pr_devel("client 0x%X called 'lock'\n", (int)priv);
+		pr_devel("client 0x%p called 'lock'\n", priv);
 
 		if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) {
 			ret_val = -EPROTO;
@@ -867,7 +866,7 @@ static ssize_t vga_arb_write(struct file *file, const char __user * buf,
 		curr_pos += 7;
 		remaining -= 7;
 
-		pr_devel("client 0x%X called 'unlock'\n", (int)priv);
+		pr_devel("client 0x%p called 'unlock'\n", priv);
 
 		if (strncmp(curr_pos, "all", 3) == 0)
 			io_state = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
@@ -917,7 +916,7 @@ static ssize_t vga_arb_write(struct file *file, const char __user * buf,
 		curr_pos += 8;
 		remaining -= 8;
 
-		pr_devel("client 0x%X called 'trylock'\n", (int)priv);
+		pr_devel("client 0x%p called 'trylock'\n", priv);
 
 		if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) {
 			ret_val = -EPROTO;
@@ -960,7 +959,7 @@ static ssize_t vga_arb_write(struct file *file, const char __user * buf,
 
 		curr_pos += 7;
 		remaining -= 7;
-		pr_devel("client 0x%X called 'target'\n", (int)priv);
+		pr_devel("client 0x%p called 'target'\n", priv);
 		/* if target is default */
 		if (!strncmp(buf, "default", 7))
 			pdev = pci_dev_get(vga_default_device());
@@ -1014,7 +1013,7 @@ static ssize_t vga_arb_write(struct file *file, const char __user * buf,
 	} else if (strncmp(curr_pos, "decodes ", 8) == 0) {
 		curr_pos += 8;
 		remaining -= 8;
-		pr_devel("vgaarb: client 0x%X called 'decodes'\n", (int)priv);
+		pr_devel("vgaarb: client 0x%p called 'decodes'\n", priv);
 
 		if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) {
 			ret_val = -EPROTO;
diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index 68229ce80fde..e81c64af80c1 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -46,9 +46,9 @@ extern void vga_set_legacy_decoding(struct pci_dev *pdev,
 /**
  *     vga_get         - acquire & locks VGA resources
  *
- *     pdev: pci device of the VGA card or NULL for the system default
- *     rsrc: bit mask of resources to acquire and lock
- *     interruptible: blocking should be interruptible by signals ?
+ *     @pdev: pci device of the VGA card or NULL for the system default
+ *     @rsrc: bit mask of resources to acquire and lock
+ *     @interruptible: blocking should be interruptible by signals ?
  *
  *     This function acquires VGA resources for the given
  *     card and mark those resources locked. If the resource requested
@@ -81,19 +81,19 @@ extern int vga_get(struct pci_dev *pdev, unsigned int rsrc,
  */
 
 static inline int vga_get_interruptible(struct pci_dev *pdev,
-										unsigned int rsrc)
+					unsigned int rsrc)
 {
        return vga_get(pdev, rsrc, 1);
 }
 
 /**
- *     vga_get_interruptible
+ *     vga_get_uninterruptible
  *
  *     Shortcut to vga_get
  */
 
 static inline int vga_get_uninterruptible(struct pci_dev *pdev,
-											unsigned int rsrc)
+					  unsigned int rsrc)
 {
        return vga_get(pdev, rsrc, 0);
 }
@@ -165,28 +165,33 @@ static inline int vga_conflicts(struct pci_dev *p1, struct pci_dev *p2)
 }
 #endif
 
-/*
- * Register a client with the VGA arbitration logic
- * return value: number of VGA devices in system.
- *
- * Clients have two callback mechanisms they can use.
- * irq enable/disable callback -
- *    If a client can't disable its GPUs VGA resources, then we
- *    need to be able to ask it to turn off its irqs when we
- *    turn off its mem and io decoding.
- * set_vga_decode
- *    If a client can disable its GPU VGA resource, it will
- *    get a callback from this to set the encode/decode state
- *
- * Clients with disable abilities should check the return value
- * of this function and if the VGA device count is > 1, should
- * disable VGA decoding resources.
+/**
+ *	vga_client_register
+ *
+ *	@pdev: pci device of the VGA client
+ *	@cookie: client cookie to be used in callbacks
+ *	@irq_set_state: irq state change callback
+ *	@set_vga_decode: vga decode change callback
+ *
+ * 	return value: 0 on success, -1 on failure
+ * 	Register a client with the VGA arbitration logic
+ *
+ *	Clients have two callback mechanisms they can use.
+ *	irq enable/disable callback -
+ *		If a client can't disable its GPUs VGA resources, then we
+ *		need to be able to ask it to turn off its irqs when we
+ *		turn off its mem and io decoding.
+ *	set_vga_decode
+ *		If a client can disable its GPU VGA resource, it will
+ *		get a callback from this to set the encode/decode state
  *
  * Rationale: we cannot disable VGA decode resources unconditionally
  * some single GPU laptops seem to require ACPI or BIOS access to the
  * VGA registers to control things like backlights etc.
  * Hopefully newer multi-GPU laptops do something saner, and desktops
  * won't have any special ACPI for this.
+ * They driver will get a callback when VGA arbitration is first used
+ * by userspace since we some older X servers have issues.
  */
 int vga_client_register(struct pci_dev *pdev, void *cookie,
 			void (*irq_set_state)(void *cookie, bool state),
-- 
cgit v1.2.3-71-gd317


From 825c423a35a80a8fd66398a3f9bde7f0b0187a76 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Wed, 29 Jul 2009 14:39:58 +0900
Subject: PCI hotplug: add support for 5.0G link speed

Add support for PCI-E 5.0 GT/s in max_bus_speed and cur_bus_speed.

Reviewed-by: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pci_hotplug_core.c |  1 +
 drivers/pci/hotplug/pciehp_hpc.c       | 10 ++++++++--
 include/linux/pci_hotplug.h            |  3 ++-
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c
index 9d44669bf098..0325d989bb46 100644
--- a/drivers/pci/hotplug/pci_hotplug_core.c
+++ b/drivers/pci/hotplug/pci_hotplug_core.c
@@ -87,6 +87,7 @@ static char *pci_bus_speed_strings[] = {
 	"100 MHz PCIX 533",	/* 0x12 */
 	"133 MHz PCIX 533",	/* 0x13 */
 	"2.5 GT/s PCI-E",	/* 0x14 */
+	"5.0 GT/s PCI-E",	/* 0x15 */
 };
 
 #ifdef CONFIG_HOTPLUG_PCI_CPCI
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 52813257e5bf..271f917b6f2c 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -693,7 +693,10 @@ static int hpc_get_max_lnk_speed(struct slot *slot, enum pci_bus_speed *value)
 
 	switch (lnk_cap & 0x000F) {
 	case 1:
-		lnk_speed = PCIE_2PT5GB;
+		lnk_speed = PCIE_2_5GB;
+		break;
+	case 2:
+		lnk_speed = PCIE_5_0GB;
 		break;
 	default:
 		lnk_speed = PCIE_LNK_SPEED_UNKNOWN;
@@ -772,7 +775,10 @@ static int hpc_get_cur_lnk_speed(struct slot *slot, enum pci_bus_speed *value)
 
 	switch (lnk_status & PCI_EXP_LNKSTA_CLS) {
 	case 1:
-		lnk_speed = PCIE_2PT5GB;
+		lnk_speed = PCIE_2_5GB;
+		break;
+	case 2:
+		lnk_speed = PCIE_5_0GB;
 		break;
 	default:
 		lnk_speed = PCIE_LNK_SPEED_UNKNOWN;
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 4391741b99dc..1b00cc3177fc 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -62,7 +62,8 @@ enum pcie_link_width {
 };
 
 enum pcie_link_speed {
-	PCIE_2PT5GB		= 0x14,
+	PCIE_2_5GB		= 0x14,
+	PCIE_5_0GB		= 0x15,
 	PCIE_LNK_SPEED_UNKNOWN	= 0xFF,
 };
 
-- 
cgit v1.2.3-71-gd317


From 9dba910e9de2c4aa15ec1286f10052c107ef48ca Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 3 Sep 2009 15:26:36 +0900
Subject: PCI: separate out pci_add_dynid()

Separate out pci_add_dynid() from store_new_id() and export it so that
in-kernel code can add PCI IDs dynamically.  As the function will be
available regardless of HOTPLUG, put it and pull pci_free_dynids()
outside of CONFIG_HOTPLUG.

This will be used by pci-stub to initialize initial IDs via module
param.

While at it, remove bogus get_driver() failure check.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci-driver.c | 119 +++++++++++++++++++++++++++++------------------
 include/linux/pci.h      |   5 ++
 2 files changed, 79 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index f99bc7f089f1..c66dc4341fa0 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -19,37 +19,98 @@
 #include <linux/cpu.h>
 #include "pci.h"
 
-/*
- * Dynamic device IDs are disabled for !CONFIG_HOTPLUG
- */
-
 struct pci_dynid {
 	struct list_head node;
 	struct pci_device_id id;
 };
 
-#ifdef CONFIG_HOTPLUG
+/**
+ * pci_add_dynid - add a new PCI device ID to this driver and re-probe devices
+ * @drv: target pci driver
+ * @vendor: PCI vendor ID
+ * @device: PCI device ID
+ * @subvendor: PCI subvendor ID
+ * @subdevice: PCI subdevice ID
+ * @class: PCI class
+ * @class_mask: PCI class mask
+ * @driver_data: private driver data
+ *
+ * Adds a new dynamic pci device ID to this driver and causes the
+ * driver to probe for all devices again.  @drv must have been
+ * registered prior to calling this function.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int pci_add_dynid(struct pci_driver *drv,
+		  unsigned int vendor, unsigned int device,
+		  unsigned int subvendor, unsigned int subdevice,
+		  unsigned int class, unsigned int class_mask,
+		  unsigned long driver_data)
+{
+	struct pci_dynid *dynid;
+	int retval;
+
+	dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
+	if (!dynid)
+		return -ENOMEM;
+
+	dynid->id.vendor = vendor;
+	dynid->id.device = device;
+	dynid->id.subvendor = subvendor;
+	dynid->id.subdevice = subdevice;
+	dynid->id.class = class;
+	dynid->id.class_mask = class_mask;
+	dynid->id.driver_data = driver_data;
 
+	spin_lock(&drv->dynids.lock);
+	list_add_tail(&dynid->node, &drv->dynids.list);
+	spin_unlock(&drv->dynids.lock);
+
+	get_driver(&drv->driver);
+	retval = driver_attach(&drv->driver);
+	put_driver(&drv->driver);
+
+	return retval;
+}
+
+static void pci_free_dynids(struct pci_driver *drv)
+{
+	struct pci_dynid *dynid, *n;
+
+	spin_lock(&drv->dynids.lock);
+	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
+		list_del(&dynid->node);
+		kfree(dynid);
+	}
+	spin_unlock(&drv->dynids.lock);
+}
+
+/*
+ * Dynamic device ID manipulation via sysfs is disabled for !CONFIG_HOTPLUG
+ */
+#ifdef CONFIG_HOTPLUG
 /**
- * store_new_id - add a new PCI device ID to this driver and re-probe devices
+ * store_new_id - sysfs frontend to pci_add_dynid()
  * @driver: target device driver
  * @buf: buffer for scanning device ID data
  * @count: input size
  *
- * Adds a new dynamic pci device ID to this driver,
- * and causes the driver to probe for all devices again.
+ * Allow PCI IDs to be added to an existing driver via sysfs.
  */
 static ssize_t
 store_new_id(struct device_driver *driver, const char *buf, size_t count)
 {
-	struct pci_dynid *dynid;
 	struct pci_driver *pdrv = to_pci_driver(driver);
 	const struct pci_device_id *ids = pdrv->id_table;
 	__u32 vendor, device, subvendor=PCI_ANY_ID,
 		subdevice=PCI_ANY_ID, class=0, class_mask=0;
 	unsigned long driver_data=0;
 	int fields=0;
-	int retval=0;
+	int retval;
 
 	fields = sscanf(buf, "%x %x %x %x %x %x %lx",
 			&vendor, &device, &subvendor, &subdevice,
@@ -72,27 +133,8 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count)
 			return retval;
 	}
 
-	dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
-	if (!dynid)
-		return -ENOMEM;
-
-	dynid->id.vendor = vendor;
-	dynid->id.device = device;
-	dynid->id.subvendor = subvendor;
-	dynid->id.subdevice = subdevice;
-	dynid->id.class = class;
-	dynid->id.class_mask = class_mask;
-	dynid->id.driver_data = driver_data;
-
-	spin_lock(&pdrv->dynids.lock);
-	list_add_tail(&dynid->node, &pdrv->dynids.list);
-	spin_unlock(&pdrv->dynids.lock);
-
-	if (get_driver(&pdrv->driver)) {
-		retval = driver_attach(&pdrv->driver);
-		put_driver(&pdrv->driver);
-	}
-
+	retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice,
+			       class, class_mask, driver_data);
 	if (retval)
 		return retval;
 	return count;
@@ -145,19 +187,6 @@ store_remove_id(struct device_driver *driver, const char *buf, size_t count)
 }
 static DRIVER_ATTR(remove_id, S_IWUSR, NULL, store_remove_id);
 
-static void
-pci_free_dynids(struct pci_driver *drv)
-{
-	struct pci_dynid *dynid, *n;
-
-	spin_lock(&drv->dynids.lock);
-	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
-		list_del(&dynid->node);
-		kfree(dynid);
-	}
-	spin_unlock(&drv->dynids.lock);
-}
-
 static int
 pci_create_newid_file(struct pci_driver *drv)
 {
@@ -186,7 +215,6 @@ static void pci_remove_removeid_file(struct pci_driver *drv)
 	driver_remove_file(&drv->driver, &driver_attr_remove_id);
 }
 #else /* !CONFIG_HOTPLUG */
-static inline void pci_free_dynids(struct pci_driver *drv) {}
 static inline int pci_create_newid_file(struct pci_driver *drv)
 {
 	return 0;
@@ -1106,6 +1134,7 @@ static int __init pci_driver_init(void)
 
 postcore_initcall(pci_driver_init);
 
+EXPORT_SYMBOL_GPL(pci_add_dynid);
 EXPORT_SYMBOL(pci_match_id);
 EXPORT_SYMBOL(__pci_register_driver);
 EXPORT_SYMBOL(pci_unregister_driver);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index da90217a7b0e..d75668317705 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -796,6 +796,11 @@ int __must_check __pci_register_driver(struct pci_driver *, struct module *,
 void pci_unregister_driver(struct pci_driver *dev);
 void pci_remove_behind_bridge(struct pci_dev *dev);
 struct pci_driver *pci_dev_driver(const struct pci_dev *dev);
+int pci_add_dynid(struct pci_driver *drv,
+		  unsigned int vendor, unsigned int device,
+		  unsigned int subvendor, unsigned int subdevice,
+		  unsigned int class, unsigned int class_mask,
+		  unsigned long driver_data);
 const struct pci_device_id *pci_match_id(const struct pci_device_id *ids,
 					 struct pci_dev *dev);
 int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
-- 
cgit v1.2.3-71-gd317


From 28760489a3f1e136c5ae8581c0fa8f63511f2f4c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Wed, 9 Sep 2009 14:09:24 -0700
Subject: PCI: pcie: Ensure hotplug ports have a minimum number of resources

In general a BIOS may goof or we may hotplug in a hotplug controller.
In either case the kernel needs to reserve resources for plugging
in more devices in the future instead of creating a minimal resource
assignment.

We already do this for cardbus bridges I am just adding a variant
for pcie bridges.

v2: Make testing for pcie hotplug bridges based on a flag.

    So far we only set the flag for pcie but a header_quirk
    could easily be added for the non-standard pci hotplug
    bridges.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c       | 10 ++++++++++
 drivers/pci/probe.c     | 18 ++++++++++++++++++
 drivers/pci/setup-bus.c | 22 +++++++++++++++++-----
 include/linux/pci.h     |  4 ++++
 4 files changed, 49 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index bd993351db45..8c663d628d03 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -41,6 +41,12 @@ int pci_domains_supported = 1;
 unsigned long pci_cardbus_io_size = DEFAULT_CARDBUS_IO_SIZE;
 unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
 
+#define DEFAULT_HOTPLUG_IO_SIZE		(256)
+#define DEFAULT_HOTPLUG_MEM_SIZE	(2*1024*1024)
+/* pci=hpmemsize=nnM,hpiosize=nn can override this */
+unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
+unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
+
 /**
  * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children
  * @bus: pointer to PCI bus structure to search
@@ -2732,6 +2738,10 @@ static int __init pci_setup(char *str)
 							strlen(str + 19));
 			} else if (!strncmp(str, "ecrc=", 5)) {
 				pcie_ecrc_get_policy(str + 5);
+			} else if (!strncmp(str, "hpiosize=", 9)) {
+				pci_hotplug_io_size = memparse(str + 9, &str);
+			} else if (!strncmp(str, "hpmemsize=", 10)) {
+				pci_hotplug_mem_size = memparse(str + 10, &str);
 			} else {
 				printk(KERN_ERR "PCI: Unknown option `%s'\n",
 						str);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ab52840f4753..882383b61d30 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -697,6 +697,23 @@ static void set_pcie_port_type(struct pci_dev *pdev)
 	pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
 }
 
+static void set_pcie_hotplug_bridge(struct pci_dev *pdev)
+{
+	int pos;
+	u16 reg16;
+	u32 reg32;
+
+	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (!pos)
+		return;
+	pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
+	if (!(reg16 & PCI_EXP_FLAGS_SLOT))
+		return;
+	pci_read_config_dword(pdev, pos + PCI_EXP_SLTCAP, &reg32);
+	if (reg32 & PCI_EXP_SLTCAP_HPC)
+		pdev->is_hotplug_bridge = 1;
+}
+
 #define LEGACY_IO_RESOURCE	(IORESOURCE_IO | IORESOURCE_PCI_FIXED)
 
 /**
@@ -804,6 +821,7 @@ int pci_setup_device(struct pci_dev *dev)
 		pci_read_irq(dev);
 		dev->transparent = ((dev->class & 0xff) == 1);
 		pci_read_bases(dev, 2, PCI_ROM_ADDRESS1);
+		set_pcie_hotplug_bridge(dev);
 		break;
 
 	case PCI_HEADER_TYPE_CARDBUS:		    /* CardBus bridge header */
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 7c443b4583ab..cb1a027eb552 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -309,7 +309,7 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned lon
    since these windows have 4K granularity and the IO ranges
    of non-bridge PCI devices are limited to 256 bytes.
    We must be careful with the ISA aliasing though. */
-static void pbus_size_io(struct pci_bus *bus)
+static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
 {
 	struct pci_dev *dev;
 	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
@@ -336,6 +336,8 @@ static void pbus_size_io(struct pci_bus *bus)
 				size1 += r_size;
 		}
 	}
+	if (size < min_size)
+		size = min_size;
 /* To be fixed in 2.5: we should have sort of HAVE_ISA
    flag in the struct pci_bus. */
 #if defined(CONFIG_ISA) || defined(CONFIG_EISA)
@@ -354,7 +356,8 @@ static void pbus_size_io(struct pci_bus *bus)
 
 /* Calculate the size of the bus and minimal alignment which
    guarantees that all child resources fit in this size. */
-static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long type)
+static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
+			 unsigned long type, resource_size_t min_size)
 {
 	struct pci_dev *dev;
 	resource_size_t min_align, align, size;
@@ -404,6 +407,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long
 			mem64_mask &= r->flags & IORESOURCE_MEM_64;
 		}
 	}
+	if (size < min_size)
+		size = min_size;
 
 	align = 0;
 	min_align = 0;
@@ -483,6 +488,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 	unsigned long mask, prefmask;
+	resource_size_t min_mem_size = 0, min_io_size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		struct pci_bus *b = dev->subordinate;
@@ -512,8 +518,12 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 
 	case PCI_CLASS_BRIDGE_PCI:
 		pci_bridge_check_ranges(bus);
+		if (bus->self->is_hotplug_bridge) {
+			min_io_size  = pci_hotplug_io_size;
+			min_mem_size = pci_hotplug_mem_size;
+		}
 	default:
-		pbus_size_io(bus);
+		pbus_size_io(bus, min_io_size);
 		/* If the bridge supports prefetchable range, size it
 		   separately. If it doesn't, or its prefetchable window
 		   has already been allocated by arch code, try
@@ -521,9 +531,11 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 		   resources. */
 		mask = IORESOURCE_MEM;
 		prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
-		if (pbus_size_mem(bus, prefmask, prefmask))
+		if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
 			mask = prefmask; /* Success, size non-prefetch only. */
-		pbus_size_mem(bus, mask, IORESOURCE_MEM);
+		else
+			min_mem_size += min_mem_size;
+		pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
 		break;
 	}
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d75668317705..d80ed872a3c4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -278,6 +278,7 @@ struct pci_dev {
 	unsigned int	is_physfn:1;
 	unsigned int	is_virtfn:1;
 	unsigned int	reset_fn:1;
+	unsigned int    is_hotplug_bridge:1;
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
@@ -1245,6 +1246,9 @@ extern int pci_pci_problems;
 extern unsigned long pci_cardbus_io_size;
 extern unsigned long pci_cardbus_mem_size;
 
+extern unsigned long pci_hotplug_io_size;
+extern unsigned long pci_hotplug_mem_size;
+
 int pcibios_add_platform_entries(struct pci_dev *dev);
 void pcibios_disable_device(struct pci_dev *dev);
 int pcibios_set_pcie_reset_state(struct pci_dev *dev,
-- 
cgit v1.2.3-71-gd317


From e80bb09d2c73d76a2a4cd79e4a83802dd901c642 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 8 Sep 2009 23:14:49 +0200
Subject: PCI PM: Introduce device flag wakeup_prepared

Introduce a new PCI device flag, wakeup_prepared, to prevent PCI
wake-up preparation code from being executed twice in a row for the
same device and for the same purpose.

Reviewed-by: Matthew Garrett <mjg59@srcf.ucam.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c   | 8 ++++++++
 include/linux/pci.h | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 84a6d6d30c5a..dcdfb2212ca3 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1225,6 +1225,10 @@ int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable)
 	if (enable && !device_may_wakeup(&dev->dev))
 		return -EINVAL;
 
+	/* Don't do the same thing twice in a row for one device. */
+	if (!!enable == !!dev->wakeup_prepared)
+		return 0;
+
 	/*
 	 * According to "PCI System Architecture" 4th ed. by Tom Shanley & Don
 	 * Anderson we should be doing PME# wake enable followed by ACPI wake
@@ -1241,9 +1245,12 @@ int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable)
 		error = platform_pci_sleep_wake(dev, true);
 		if (ret)
 			ret = error;
+		if (!ret)
+			dev->wakeup_prepared = true;
 	} else {
 		platform_pci_sleep_wake(dev, false);
 		pci_pme_active(dev, false);
+		dev->wakeup_prepared = false;
 	}
 
 	return ret;
@@ -1365,6 +1372,7 @@ void pci_pm_init(struct pci_dev *dev)
 	int pm;
 	u16 pmc;
 
+	dev->wakeup_prepared = false;
 	dev->pm_cap = 0;
 
 	/* find PCI PM capability in list */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d80ed872a3c4..f5c7cd343e56 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -241,6 +241,7 @@ struct pci_dev {
 	unsigned int	d1_support:1;	/* Low power state D1 is supported */
 	unsigned int	d2_support:1;	/* Low power state D2 is supported */
 	unsigned int	no_d1d2:1;	/* Only allow D0 and D3 */
+	unsigned int	wakeup_prepared:1;
 
 #ifdef CONFIG_PCIEASPM
 	struct pcie_link_state	*link_state;	/* ASPM link state. */
-- 
cgit v1.2.3-71-gd317


From 890ca9aefa78f7831f8f633cab9e4803636dffe4 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 11 May 2009 16:48:15 +0800
Subject: KVM: Add MCE support

The related MSRs are emulated. MCE capability is exported via
extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED.  A new
vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation
such as the mcg_cap. MCE is injected via vcpu ioctl command
KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
not implemented.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm.h      |   1 +
 arch/x86/include/asm/kvm_host.h |   5 +
 arch/x86/kvm/x86.c              | 220 +++++++++++++++++++++++++++++++++++-----
 include/linux/kvm.h             |  20 ++++
 4 files changed, 222 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 125be8b19568..708b9c32a5da 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -17,6 +17,7 @@
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 79561752af97..81c68f630b14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -373,6 +373,11 @@ struct kvm_vcpu_arch {
 	unsigned long dr6;
 	unsigned long dr7;
 	unsigned long eff_db[KVM_NR_DB_REGS];
+
+	u64 mcg_cap;
+	u64 mcg_status;
+	u64 mcg_ctl;
+	u64 *mce_banks;
 };
 
 struct kvm_mem_alias {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6d46079a901c..55a9dd182de8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
+#include <asm/mce.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -55,6 +56,10 @@
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
  * - enable LME and LMA per default on 64 bit KVM
@@ -777,23 +782,43 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	return 0;
 }
 
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned bank_num = mcg_cap & 0xff;
+
 	switch (msr) {
-	case MSR_EFER:
-		set_efer(vcpu, data);
-		break;
-	case MSR_IA32_MC0_STATUS:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-		       __func__, data);
-		break;
 	case MSR_IA32_MCG_STATUS:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-			__func__, data);
+		vcpu->arch.mcg_status = data;
 		break;
 	case MSR_IA32_MCG_CTL:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-			__func__, data);
+		if (!(mcg_cap & MCG_CTL_P))
+			return 1;
+		if (data != 0 && data != ~(u64)0)
+			return -1;
+		vcpu->arch.mcg_ctl = data;
+		break;
+	default:
+		if (msr >= MSR_IA32_MC0_CTL &&
+		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+			u32 offset = msr - MSR_IA32_MC0_CTL;
+			/* only 0 or all 1s can be written to IA32_MCi_CTL */
+			if ((offset & 0x3) == 0 &&
+			    data != 0 && data != ~(u64)0)
+				return -1;
+			vcpu->arch.mce_banks[offset] = data;
+			break;
+		}
+		return 1;
+	}
+	return 0;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	switch (msr) {
+	case MSR_EFER:
+		set_efer(vcpu, data);
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!data) {
@@ -849,6 +874,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		kvm_request_guest_time_update(vcpu);
 		break;
 	}
+	case MSR_IA32_MCG_CTL:
+	case MSR_IA32_MCG_STATUS:
+	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+		return set_msr_mce(vcpu, msr, data);
 	default:
 		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 		return 1;
@@ -904,26 +933,49 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	return 0;
 }
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
 	u64 data;
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned bank_num = mcg_cap & 0xff;
 
 	switch (msr) {
-	case 0xc0010010: /* SYSCFG */
-	case 0xc0010015: /* HWCR */
-	case MSR_IA32_PLATFORM_ID:
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
-	case MSR_IA32_MC0_CTL:
-	case MSR_IA32_MCG_STATUS:
+		data = 0;
+		break;
 	case MSR_IA32_MCG_CAP:
+		data = vcpu->arch.mcg_cap;
+		break;
 	case MSR_IA32_MCG_CTL:
-	case MSR_IA32_MC0_MISC:
-	case MSR_IA32_MC0_MISC+4:
-	case MSR_IA32_MC0_MISC+8:
-	case MSR_IA32_MC0_MISC+12:
-	case MSR_IA32_MC0_MISC+16:
-	case MSR_IA32_MC0_MISC+20:
+		if (!(mcg_cap & MCG_CTL_P))
+			return 1;
+		data = vcpu->arch.mcg_ctl;
+		break;
+	case MSR_IA32_MCG_STATUS:
+		data = vcpu->arch.mcg_status;
+		break;
+	default:
+		if (msr >= MSR_IA32_MC0_CTL &&
+		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+			u32 offset = msr - MSR_IA32_MC0_CTL;
+			data = vcpu->arch.mce_banks[offset];
+			break;
+		}
+		return 1;
+	}
+	*pdata = data;
+	return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+	u64 data;
+
+	switch (msr) {
+	case 0xc0010010: /* SYSCFG */
+	case 0xc0010015: /* HWCR */
+	case MSR_IA32_PLATFORM_ID:
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_DEBUGCTLMSR:
@@ -966,6 +1018,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_SYSTEM_TIME:
 		data = vcpu->arch.time;
 		break;
+	case MSR_IA32_P5_MC_ADDR:
+	case MSR_IA32_P5_MC_TYPE:
+	case MSR_IA32_MCG_CAP:
+	case MSR_IA32_MCG_CTL:
+	case MSR_IA32_MCG_STATUS:
+	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+		return get_msr_mce(vcpu, msr, pdata);
 	default:
 		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -1087,6 +1146,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_IOMMU:
 		r = iommu_found();
 		break;
+	case KVM_CAP_MCE:
+		r = KVM_MAX_MCE_BANKS;
+		break;
 	default:
 		r = 0;
 		break;
@@ -1146,6 +1208,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+		u64 mce_cap;
+
+		mce_cap = KVM_MCE_CAP_SUPPORTED;
+		r = -EFAULT;
+		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -1502,6 +1574,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+					u64 mcg_cap)
+{
+	int r;
+	unsigned bank_num = mcg_cap & 0xff, bank;
+
+	r = -EINVAL;
+	if (!bank_num)
+		goto out;
+	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+		goto out;
+	r = 0;
+	vcpu->arch.mcg_cap = mcg_cap;
+	/* Init IA32_MCG_CTL to all 1s */
+	if (mcg_cap & MCG_CTL_P)
+		vcpu->arch.mcg_ctl = ~(u64)0;
+	/* Init IA32_MCi_CTL to all 1s */
+	for (bank = 0; bank < bank_num; bank++)
+		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+	return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+				      struct kvm_x86_mce *mce)
+{
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned bank_num = mcg_cap & 0xff;
+	u64 *banks = vcpu->arch.mce_banks;
+
+	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+		return -EINVAL;
+	/*
+	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
+	 * reporting is disabled
+	 */
+	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+	    vcpu->arch.mcg_ctl != ~(u64)0)
+		return 0;
+	banks += 4 * mce->bank;
+	/*
+	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
+	 * reporting is disabled for the bank
+	 */
+	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+		return 0;
+	if (mce->status & MCI_STATUS_UC) {
+		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+		    !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+			printk(KERN_DEBUG "kvm: set_mce: "
+			       "injects mce exception while "
+			       "previous one is in progress!\n");
+			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+			return 0;
+		}
+		if (banks[1] & MCI_STATUS_VAL)
+			mce->status |= MCI_STATUS_OVER;
+		banks[2] = mce->addr;
+		banks[3] = mce->misc;
+		vcpu->arch.mcg_status = mce->mcg_status;
+		banks[1] = mce->status;
+		kvm_queue_exception(vcpu, MC_VECTOR);
+	} else if (!(banks[1] & MCI_STATUS_VAL)
+		   || !(banks[1] & MCI_STATUS_UC)) {
+		if (banks[1] & MCI_STATUS_VAL)
+			mce->status |= MCI_STATUS_OVER;
+		banks[2] = mce->addr;
+		banks[3] = mce->misc;
+		banks[1] = mce->status;
+	} else
+		banks[1] |= MCI_STATUS_OVER;
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -1635,6 +1781,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
 		break;
 	}
+	case KVM_X86_SETUP_MCE: {
+		u64 mcg_cap;
+
+		r = -EFAULT;
+		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+			goto out;
+		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+		break;
+	}
+	case KVM_X86_SET_MCE: {
+		struct kvm_x86_mce mce;
+
+		r = -EFAULT;
+		if (copy_from_user(&mce, argp, sizeof mce))
+			goto out;
+		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -4440,6 +4604,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 			goto fail_mmu_destroy;
 	}
 
+	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+				       GFP_KERNEL);
+	if (!vcpu->arch.mce_banks) {
+		r = -ENOMEM;
+		goto fail_mmu_destroy;
+	}
+	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
 	return 0;
 
 fail_mmu_destroy:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3db5d8d37485..7b17141c47c9 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -415,6 +415,9 @@ struct kvm_trace_rec {
 #define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#ifdef __KVM_HAVE_MCE
+#define KVM_CAP_MCE 31
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -454,6 +457,19 @@ struct kvm_irq_routing {
 
 #endif
 
+#ifdef KVM_CAP_MCE
+/* x86 MCE */
+struct kvm_x86_mce {
+	__u64 status;
+	__u64 addr;
+	__u64 misc;
+	__u64 mcg_status;
+	__u8 bank;
+	__u8 pad1[7];
+	__u64 pad2[3];
+};
+#endif
+
 /*
  * ioctls for VM fds
  */
@@ -541,6 +557,10 @@ struct kvm_irq_routing {
 #define KVM_NMI                   _IO(KVMIO,  0x9a)
 /* Available with KVM_CAP_SET_GUEST_DEBUG */
 #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE         _IOW(KVMIO,  0x9c, __u64)
+#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO,  0x9d, __u64)
+#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9e, struct kvm_x86_mce)
 
 /*
  * Deprecated interfaces
-- 
cgit v1.2.3-71-gd317


From 721eecbf4fe995ca94a9edec0c9843b1cc0eaaf3 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 20 May 2009 10:30:49 -0400
Subject: KVM: irqfd

KVM provides a complete virtual system environment for guests, including
support for injecting interrupts modeled after the real exception/interrupt
facilities present on the native platform (such as the IDT on x86).
Virtual interrupts can come from a variety of sources (emulated devices,
pass-through devices, etc) but all must be injected to the guest via
the KVM infrastructure.  This patch adds a new mechanism to inject a specific
interrupt to a guest using a decoupled eventfd mechnanism:  Any legal signal
on the irqfd (using eventfd semantics from either userspace or kernel) will
translate into an injected interrupt in the guest at the next available
interrupt window.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/Kconfig     |   1 +
 arch/x86/kvm/Makefile    |   2 +-
 arch/x86/kvm/x86.c       |   1 +
 include/linux/kvm.h      |  11 ++
 include/linux/kvm_host.h |  24 ++++
 virt/kvm/Kconfig         |   4 +
 virt/kvm/eventfd.c       | 329 +++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      |  12 ++
 8 files changed, 383 insertions(+), 1 deletion(-)
 create mode 100644 virt/kvm/eventfd.c

(limited to 'include/linux')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 939b1cb9c5b3..8cd2a4efe238 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -26,6 +26,7 @@ config KVM
 	select MMU_NOTIFIER
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_EVENTFD
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index bee9512cd60d..01e3c61f749a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,7 @@
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-				coalesced_mmio.o irq_comm.o)
+				coalesced_mmio.o irq_comm.o eventfd.o)
 kvm-$(CONFIG_KVM_TRACE)	+= $(addprefix ../../../virt/kvm/, kvm_trace.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0642396783f..15f39fc08ece 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1126,6 +1126,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
+	case KVM_CAP_IRQFD:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 7b17141c47c9..8f53f24e5274 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -418,6 +418,7 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_MCE
 #define KVM_CAP_MCE 31
 #endif
+#define KVM_CAP_IRQFD 32
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -470,6 +471,15 @@ struct kvm_x86_mce {
 };
 #endif
 
+#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+
+struct kvm_irqfd {
+	__u32 fd;
+	__u32 gsi;
+	__u32 flags;
+	__u8  pad[20];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -514,6 +524,7 @@ struct kvm_x86_mce {
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
 			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3060bdc35ffe..7724dcb6ff76 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -136,6 +136,12 @@ struct kvm {
 	struct list_head vm_list;
 	struct kvm_io_bus mmio_bus;
 	struct kvm_io_bus pio_bus;
+#ifdef CONFIG_HAVE_KVM_EVENTFD
+	struct {
+		spinlock_t        lock;
+		struct list_head  items;
+	} irqfds;
+#endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
 	atomic_t users_count;
@@ -525,4 +531,22 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 #endif
 
+#ifdef CONFIG_HAVE_KVM_EVENTFD
+
+void kvm_irqfd_init(struct kvm *kvm);
+int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
+void kvm_irqfd_release(struct kvm *kvm);
+
+#else
+
+static inline void kvm_irqfd_init(struct kvm *kvm) {}
+static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+{
+	return -EINVAL;
+}
+
+static inline void kvm_irqfd_release(struct kvm *kvm) {}
+
+#endif /* CONFIG_HAVE_KVM_EVENTFD */
+
 #endif
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 79e83fc00dc7..56c6848d2df8 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -5,3 +5,7 @@ config HAVE_KVM
 
 config HAVE_KVM_IRQCHIP
        bool
+
+config HAVE_KVM_EVENTFD
+       bool
+       select EVENTFD
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
new file mode 100644
index 000000000000..314012323afe
--- /dev/null
+++ b/virt/kvm/eventfd.c
@@ -0,0 +1,329 @@
+/*
+ * kvm eventfd support - use eventfd objects to signal various KVM events
+ *
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *	Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/workqueue.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/eventfd.h>
+
+/*
+ * --------------------------------------------------------------------
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest
+ *
+ * Credit goes to Avi Kivity for the original idea.
+ * --------------------------------------------------------------------
+ */
+
+struct _irqfd {
+	struct kvm               *kvm;
+	struct eventfd_ctx       *eventfd;
+	int                       gsi;
+	struct list_head          list;
+	poll_table                pt;
+	wait_queue_head_t        *wqh;
+	wait_queue_t              wait;
+	struct work_struct        inject;
+	struct work_struct        shutdown;
+};
+
+static struct workqueue_struct *irqfd_cleanup_wq;
+
+static void
+irqfd_inject(struct work_struct *work)
+{
+	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+	struct kvm *kvm = irqfd->kvm;
+
+	mutex_lock(&kvm->lock);
+	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+	mutex_unlock(&kvm->lock);
+}
+
+/*
+ * Race-free decouple logic (ordering is critical)
+ */
+static void
+irqfd_shutdown(struct work_struct *work)
+{
+	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+
+	/*
+	 * Synchronize with the wait-queue and unhook ourselves to prevent
+	 * further events.
+	 */
+	remove_wait_queue(irqfd->wqh, &irqfd->wait);
+
+	/*
+	 * We know no new events will be scheduled at this point, so block
+	 * until all previously outstanding events have completed
+	 */
+	flush_work(&irqfd->inject);
+
+	/*
+	 * It is now safe to release the object's resources
+	 */
+	eventfd_ctx_put(irqfd->eventfd);
+	kfree(irqfd);
+}
+
+
+/* assumes kvm->irqfds.lock is held */
+static bool
+irqfd_is_active(struct _irqfd *irqfd)
+{
+	return list_empty(&irqfd->list) ? false : true;
+}
+
+/*
+ * Mark the irqfd as inactive and schedule it for removal
+ *
+ * assumes kvm->irqfds.lock is held
+ */
+static void
+irqfd_deactivate(struct _irqfd *irqfd)
+{
+	BUG_ON(!irqfd_is_active(irqfd));
+
+	list_del_init(&irqfd->list);
+
+	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
+}
+
+/*
+ * Called with wqh->lock held and interrupts disabled
+ */
+static int
+irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+	unsigned long flags = (unsigned long)key;
+
+	if (flags & POLLIN)
+		/* An event has been signaled, inject an interrupt */
+		schedule_work(&irqfd->inject);
+
+	if (flags & POLLHUP) {
+		/* The eventfd is closing, detach from KVM */
+		struct kvm *kvm = irqfd->kvm;
+		unsigned long flags;
+
+		spin_lock_irqsave(&kvm->irqfds.lock, flags);
+
+		/*
+		 * We must check if someone deactivated the irqfd before
+		 * we could acquire the irqfds.lock since the item is
+		 * deactivated from the KVM side before it is unhooked from
+		 * the wait-queue.  If it is already deactivated, we can
+		 * simply return knowing the other side will cleanup for us.
+		 * We cannot race against the irqfd going away since the
+		 * other side is required to acquire wqh->lock, which we hold
+		 */
+		if (irqfd_is_active(irqfd))
+			irqfd_deactivate(irqfd);
+
+		spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
+	}
+
+	return 0;
+}
+
+static void
+irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
+			poll_table *pt)
+{
+	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+
+	irqfd->wqh = wqh;
+	add_wait_queue(wqh, &irqfd->wait);
+}
+
+static int
+kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
+{
+	struct _irqfd *irqfd;
+	struct file *file = NULL;
+	struct eventfd_ctx *eventfd = NULL;
+	int ret;
+	unsigned int events;
+
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	if (!irqfd)
+		return -ENOMEM;
+
+	irqfd->kvm = kvm;
+	irqfd->gsi = gsi;
+	INIT_LIST_HEAD(&irqfd->list);
+	INIT_WORK(&irqfd->inject, irqfd_inject);
+	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	eventfd = eventfd_ctx_fileget(file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	irqfd->eventfd = eventfd;
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via
+	 * a callback whenever someone signals the underlying eventfd
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
+
+	events = file->f_op->poll(file, &irqfd->pt);
+
+	spin_lock_irq(&kvm->irqfds.lock);
+	list_add_tail(&irqfd->list, &kvm->irqfds.items);
+	spin_unlock_irq(&kvm->irqfds.lock);
+
+	/*
+	 * Check if there was an event already pending on the eventfd
+	 * before we registered, and trigger it as if we didn't miss it.
+	 */
+	if (events & POLLIN)
+		schedule_work(&irqfd->inject);
+
+	/*
+	 * do not drop the file until the irqfd is fully initialized, otherwise
+	 * we might race against the POLLHUP
+	 */
+	fput(file);
+
+	return 0;
+
+fail:
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
+	if (file && !IS_ERR(file))
+		fput(file);
+
+	kfree(irqfd);
+	return ret;
+}
+
+void
+kvm_irqfd_init(struct kvm *kvm)
+{
+	spin_lock_init(&kvm->irqfds.lock);
+	INIT_LIST_HEAD(&kvm->irqfds.items);
+}
+
+/*
+ * shutdown any irqfd's that match fd+gsi
+ */
+static int
+kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
+{
+	struct _irqfd *irqfd, *tmp;
+	struct eventfd_ctx *eventfd;
+
+	eventfd = eventfd_ctx_fdget(fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
+		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
+			irqfd_deactivate(irqfd);
+	}
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+	eventfd_ctx_put(eventfd);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed
+	 * so that we guarantee there will not be any more interrupts on this
+	 * gsi once this deassign function returns.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	return 0;
+}
+
+int
+kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+{
+	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
+		return kvm_irqfd_deassign(kvm, fd, gsi);
+
+	return kvm_irqfd_assign(kvm, fd, gsi);
+}
+
+/*
+ * This function is called as the kvm VM fd is being released. Shutdown all
+ * irqfds that still remain open
+ */
+void
+kvm_irqfd_release(struct kvm *kvm)
+{
+	struct _irqfd *irqfd, *tmp;
+
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
+		irqfd_deactivate(irqfd);
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed
+	 * since we do not take a kvm* reference.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+}
+
+/*
+ * create a host-wide workqueue for issuing deferred shutdown requests
+ * aggregated from all vm* instances. We need our own isolated single-thread
+ * queue to prevent deadlock against flushing the normal work-queue.
+ */
+static int __init irqfd_module_init(void)
+{
+	irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
+	if (!irqfd_cleanup_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit irqfd_module_exit(void)
+{
+	destroy_workqueue(irqfd_cleanup_wq);
+}
+
+module_init(irqfd_module_init);
+module_exit(irqfd_module_exit);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2884baf1d5f9..dee321e58894 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -986,6 +986,7 @@ static struct kvm *kvm_create_vm(void)
 	spin_lock_init(&kvm->mmu_lock);
 	spin_lock_init(&kvm->requests_lock);
 	kvm_io_bus_init(&kvm->pio_bus);
+	kvm_irqfd_init(kvm);
 	mutex_init(&kvm->lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
 	init_rwsem(&kvm->slots_lock);
@@ -1071,6 +1072,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
 	struct kvm *kvm = filp->private_data;
 
+	kvm_irqfd_release(kvm);
+
 	kvm_put_kvm(kvm);
 	return 0;
 }
@@ -2222,6 +2225,15 @@ static long kvm_vm_ioctl(struct file *filp,
 	}
 #endif
 #endif /* KVM_CAP_IRQ_ROUTING */
+	case KVM_IRQFD: {
+		struct kvm_irqfd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
-- 
cgit v1.2.3-71-gd317


From c5ff41ce66382d657a76bc06ba252d848826950f Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Thu, 14 May 2009 22:42:53 +0200
Subject: KVM: Allow PIT emulation without speaker port

The in-kernel speaker emulation is only a dummy and also unneeded from
the performance point of view. Rather, it takes user space support to
generate sound output on the host, e.g. console beeps.

To allow this, introduce KVM_CREATE_PIT2 which controls in-kernel
speaker port emulation via a flag passed along the new IOCTL. It also
leaves room for future extensions of the PIT configuration interface.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 14 ++++++++------
 arch/x86/kvm/i8254.h |  2 +-
 arch/x86/kvm/x86.c   | 12 +++++++++++-
 include/linux/kvm.h  | 12 ++++++++++++
 4 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 21f68e00524f..0990bc9aac1f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -563,7 +563,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
 	}
 }
 
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
 	struct kvm_pit *pit;
 	struct kvm_kpit_state *pit_state;
@@ -589,11 +589,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	pit->dev.private = pit;
 	kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
 
-	pit->speaker_dev.read = speaker_ioport_read;
-	pit->speaker_dev.write = speaker_ioport_write;
-	pit->speaker_dev.in_range = speaker_in_range;
-	pit->speaker_dev.private = pit;
-	kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+	if (flags & KVM_PIT_SPEAKER_DUMMY) {
+		pit->speaker_dev.read = speaker_ioport_read;
+		pit->speaker_dev.write = speaker_ioport_write;
+		pit->speaker_dev.in_range = speaker_in_range;
+		pit->speaker_dev.private = pit;
+		kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+	}
 
 	kvm->arch.vpit = pit;
 	pit->kvm = kvm;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863ff60b7..b2670180f225 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -50,7 +50,7 @@ struct kvm_pit {
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
 void kvm_free_pit(struct kvm *kvm);
 void kvm_pit_reset(struct kvm_pit *pit);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15f39fc08ece..5eb3b8dd74b8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1127,6 +1127,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_IRQFD:
+	case KVM_CAP_PIT2:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2038,6 +2039,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	union {
 		struct kvm_pit_state ps;
 		struct kvm_memory_alias alias;
+		struct kvm_pit_config pit_config;
 	} u;
 
 	switch (ioctl) {
@@ -2098,12 +2100,20 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		}
 		break;
 	case KVM_CREATE_PIT:
+		u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+		goto create_pit;
+	case KVM_CREATE_PIT2:
+		r = -EFAULT;
+		if (copy_from_user(&u.pit_config, argp,
+				   sizeof(struct kvm_pit_config)))
+			goto out;
+	create_pit:
 		mutex_lock(&kvm->lock);
 		r = -EEXIST;
 		if (kvm->arch.vpit)
 			goto create_pit_unlock;
 		r = -ENOMEM;
-		kvm->arch.vpit = kvm_create_pit(kvm);
+		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
 		if (kvm->arch.vpit)
 			r = 0;
 	create_pit_unlock:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8f53f24e5274..65df6f5e2b98 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -70,6 +70,14 @@ struct kvm_irqchip {
 	} chip;
 };
 
+/* for KVM_CREATE_PIT2 */
+struct kvm_pit_config {
+	__u32 flags;
+	__u32 pad[15];
+};
+
+#define KVM_PIT_SPEAKER_DUMMY     1
+
 #define KVM_EXIT_UNKNOWN          0
 #define KVM_EXIT_EXCEPTION        1
 #define KVM_EXIT_IO               2
@@ -419,6 +427,9 @@ struct kvm_trace_rec {
 #define KVM_CAP_MCE 31
 #endif
 #define KVM_CAP_IRQFD 32
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_PIT2 33
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -525,6 +536,7 @@ struct kvm_irqfd {
 			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 #define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
+#define KVM_CREATE_PIT2		   _IOW(KVMIO, 0x77, struct kvm_pit_config)
 
 /*
  * ioctls for vcpu fds
-- 
cgit v1.2.3-71-gd317


From e7333391403b31feb27a05bc0dcd052a471f1276 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 21 May 2009 13:50:13 +0800
Subject: KVM: Downsize max support MSI-X entry to 256

We only trap one page for MSI-X entry now, so it's 4k/(128/8) = 256 entries at
most.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 65df6f5e2b98..632a8560dbcf 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -676,7 +676,7 @@ struct kvm_assigned_msix_nr {
 	__u16 padding;
 };
 
-#define KVM_MAX_MSIX_PER_DEV		512
+#define KVM_MAX_MSIX_PER_DEV		256
 struct kvm_assigned_msix_entry {
 	__u32 assigned_dev_id;
 	__u32 gsi;
-- 
cgit v1.2.3-71-gd317


From b188d2d365702cfc660a56d66f4bf77ebf14a5c2 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Fri, 29 May 2009 12:58:50 +0200
Subject: KVM: remove redundant declarations

Changing s390 code in kvm_arch_vcpu_load/put come across this header
declarations. They are complete duplicates, not even useful forward
declarations as nothing using it is in between (maybe it was that in
the past).

This patch removes the two dispensable lines.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7724dcb6ff76..19240feefe6f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -249,8 +249,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg);
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
 
 int kvm_dev_ioctl_check_extension(long ext);
 
-- 
cgit v1.2.3-71-gd317


From 60eead79ad8750f80384cbe48fc44edcc78a0305 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 4 Jun 2009 15:08:23 -0300
Subject: KVM: introduce irq_lock, use it to protect ioapic

Introduce irq_lock, and use to protect ioapic data structures.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 3 ++-
 virt/kvm/ioapic.c        | 5 +++++
 virt/kvm/kvm_main.c      | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 19240feefe6f..0c71688b1ee3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -124,7 +124,6 @@ struct kvm_kernel_irq_routing_entry {
 };
 
 struct kvm {
-	struct mutex lock; /* protects the vcpus array and APIC accesses */
 	spinlock_t mmu_lock;
 	spinlock_t requests_lock;
 	struct rw_semaphore slots_lock;
@@ -134,6 +133,7 @@ struct kvm {
 					KVM_PRIVATE_MEM_SLOTS];
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	struct list_head vm_list;
+	struct mutex lock;
 	struct kvm_io_bus mmio_bus;
 	struct kvm_io_bus pio_bus;
 #ifdef CONFIG_HAVE_KVM_EVENTFD
@@ -150,6 +150,7 @@ struct kvm {
 	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 #endif
 
+	struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */
 	struct hlist_head mask_notifier_list;
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 469b9faae665..2a5667173995 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -243,6 +243,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 	ioapic_debug("addr %lx\n", (unsigned long)addr);
 	ASSERT(!(addr & 0xf));	/* check alignment */
 
+	mutex_lock(&ioapic->kvm->irq_lock);
 	addr &= 0xff;
 	switch (addr) {
 	case IOAPIC_REG_SELECT:
@@ -269,6 +270,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 	default:
 		printk(KERN_WARNING "ioapic: wrong length %d\n", len);
 	}
+	mutex_unlock(&ioapic->kvm->irq_lock);
 }
 
 static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
@@ -280,6 +282,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 	ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
 		     (void*)addr, len, val);
 	ASSERT(!(addr & 0xf));	/* check alignment */
+
+	mutex_lock(&ioapic->kvm->irq_lock);
 	if (len == 4 || len == 8)
 		data = *(u32 *) val;
 	else {
@@ -305,6 +309,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 	default:
 		break;
 	}
+	mutex_unlock(&ioapic->kvm->irq_lock);
 }
 
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d1f9c87daa87..d47e660fb709 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -982,6 +982,7 @@ static struct kvm *kvm_create_vm(void)
 	kvm_io_bus_init(&kvm->pio_bus);
 	kvm_irqfd_init(kvm);
 	mutex_init(&kvm->lock);
+	mutex_init(&kvm->irq_lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
 	init_rwsem(&kvm->slots_lock);
 	atomic_set(&kvm->users_count, 1);
-- 
cgit v1.2.3-71-gd317


From fa40a8214bb9bcae8d49c234c19d8b4a6c1f37ff Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 4 Jun 2009 15:08:24 -0300
Subject: KVM: switch irq injection/acking data structures to irq_lock

Protect irq injection/acking data structures with a separate irq_lock
mutex. This fixes the following deadlock:

CPU A                               CPU B
kvm_vm_ioctl_deassign_dev_irq()
  mutex_lock(&kvm->lock);            worker_thread()
  -> kvm_deassign_irq()                -> kvm_assigned_dev_interrupt_work_handler()
    -> deassign_host_irq()               mutex_lock(&kvm->lock);
      -> cancel_work_sync() [blocked]

[gleb: fix ia64 path]

Reported-by: Alex Williamson <alex.williamson@hp.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c |  4 ++--
 arch/x86/kvm/i8254.c     |  4 ++--
 arch/x86/kvm/lapic.c     |  4 ++++
 arch/x86/kvm/x86.c       | 19 +++++++++----------
 include/linux/kvm_host.h |  3 ++-
 virt/kvm/eventfd.c       |  4 ++--
 virt/kvm/irq_comm.c      | 34 ++++++++++++++++++++++++++++------
 virt/kvm/kvm_main.c      | 16 +++++++++-------
 8 files changed, 58 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 319922137fdd..8dde36953af3 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1000,10 +1000,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		if (irqchip_in_kernel(kvm)) {
 			__s32 status;
-			mutex_lock(&kvm->lock);
+			mutex_lock(&kvm->irq_lock);
 			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
 				    irq_event.irq, irq_event.level);
-			mutex_unlock(&kvm->lock);
+			mutex_unlock(&kvm->irq_lock);
 			if (ioctl == KVM_IRQ_LINE_STATUS) {
 				irq_event.status = status;
 				if (copy_to_user(argp, &irq_event,
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 977af7ab8193..3837db65d33e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -654,10 +654,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
 	struct kvm_vcpu *vcpu;
 	int i;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->irq_lock);
 	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
 	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->irq_lock);
 
 	/*
 	 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a23f42e550af..44f20cdb5709 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -424,7 +424,9 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 		trigger_mode = IOAPIC_LEVEL_TRIG;
 	else
 		trigger_mode = IOAPIC_EDGE_TRIG;
+	mutex_lock(&apic->vcpu->kvm->irq_lock);
 	kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	mutex_unlock(&apic->vcpu->kvm->irq_lock);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -448,7 +450,9 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 		   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
 		   irq.vector);
 
+	mutex_lock(&apic->vcpu->kvm->irq_lock);
 	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+	mutex_unlock(&apic->vcpu->kvm->irq_lock);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2ad8c97f58cc..05cbe83c74e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2136,10 +2136,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		if (irqchip_in_kernel(kvm)) {
 			__s32 status;
-			mutex_lock(&kvm->lock);
+			mutex_lock(&kvm->irq_lock);
 			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
 					irq_event.irq, irq_event.level);
-			mutex_unlock(&kvm->lock);
+			mutex_unlock(&kvm->irq_lock);
 			if (ioctl == KVM_IRQ_LINE_STATUS) {
 				irq_event.status = status;
 				if (copy_to_user(argp, &irq_event,
@@ -2385,12 +2385,11 @@ mmio:
 	 */
 	mutex_lock(&vcpu->kvm->lock);
 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
+	mutex_unlock(&vcpu->kvm->lock);
 	if (mmio_dev) {
 		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
-		mutex_unlock(&vcpu->kvm->lock);
 		return X86EMUL_CONTINUE;
 	}
-	mutex_unlock(&vcpu->kvm->lock);
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -2440,12 +2439,11 @@ mmio:
 	 */
 	mutex_lock(&vcpu->kvm->lock);
 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
+	mutex_unlock(&vcpu->kvm->lock);
 	if (mmio_dev) {
 		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
-		mutex_unlock(&vcpu->kvm->lock);
 		return X86EMUL_CONTINUE;
 	}
-	mutex_unlock(&vcpu->kvm->lock);
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -2768,7 +2766,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
 {
 	/* TODO: String I/O for in kernel device */
 
-	mutex_lock(&vcpu->kvm->lock);
 	if (vcpu->arch.pio.in)
 		kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
 				  vcpu->arch.pio.size,
@@ -2777,7 +2774,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
 		kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
 				   vcpu->arch.pio.size,
 				   pd);
-	mutex_unlock(&vcpu->kvm->lock);
 }
 
 static void pio_string_write(struct kvm_io_device *pio_dev,
@@ -2787,14 +2783,12 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
 	void *pd = vcpu->arch.pio_data;
 	int i;
 
-	mutex_lock(&vcpu->kvm->lock);
 	for (i = 0; i < io->cur_count; i++) {
 		kvm_iodevice_write(pio_dev, io->port,
 				   io->size,
 				   pd);
 		pd += io->size;
 	}
-	mutex_unlock(&vcpu->kvm->lock);
 }
 
 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
@@ -2831,7 +2825,9 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	memcpy(vcpu->arch.pio_data, &val, 4);
 
+	mutex_lock(&vcpu->kvm->lock);
 	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
+	mutex_unlock(&vcpu->kvm->lock);
 	if (pio_dev) {
 		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
 		complete_pio(vcpu);
@@ -2895,9 +2891,12 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
 	vcpu->arch.pio.guest_gva = address;
 
+	mutex_lock(&vcpu->kvm->lock);
 	pio_dev = vcpu_find_pio_dev(vcpu, port,
 				    vcpu->arch.pio.cur_count,
 				    !vcpu->arch.pio.in);
+	mutex_unlock(&vcpu->kvm->lock);
+
 	if (!vcpu->arch.pio.in) {
 		/* string PIO write */
 		ret = pio_copy_data(vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0c71688b1ee3..a29ea030dd8e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -371,7 +371,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
-void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 314012323afe..4092b8dcd510 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -57,10 +57,10 @@ irqfd_inject(struct work_struct *work)
 	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
 	struct kvm *kvm = irqfd->kvm;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->irq_lock);
 	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
 	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->irq_lock);
 }
 
 /*
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index ddc17f0e2f35..08a9a49481b2 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -62,6 +62,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	int i, r = -1;
 	struct kvm_vcpu *vcpu, *lowest = NULL;
 
+	WARN_ON(!mutex_is_locked(&kvm->irq_lock));
+
 	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
 			kvm_is_dm_lowest_prio(irq))
 		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
@@ -113,7 +115,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
-/* This should be called with the kvm->lock mutex held
+/* This should be called with the kvm->irq_lock mutex held
  * Return value:
  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
  *  = 0   Interrupt was coalesced (previous irq is still pending)
@@ -125,6 +127,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
 	unsigned long *irq_state, sig_level;
 	int ret = -1;
 
+	WARN_ON(!mutex_is_locked(&kvm->irq_lock));
+
 	if (irq < KVM_IOAPIC_NUM_PINS) {
 		irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
 
@@ -175,19 +179,26 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian)
 {
+	mutex_lock(&kvm->irq_lock);
 	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+	mutex_unlock(&kvm->irq_lock);
 }
 
-void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian)
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				    struct kvm_irq_ack_notifier *kian)
 {
+	mutex_lock(&kvm->irq_lock);
 	hlist_del_init(&kian->link);
+	mutex_unlock(&kvm->irq_lock);
 }
 
-/* The caller must hold kvm->lock mutex */
 int kvm_request_irq_source_id(struct kvm *kvm)
 {
 	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
-	int irq_source_id = find_first_zero_bit(bitmap,
+	int irq_source_id;
+
+	mutex_lock(&kvm->irq_lock);
+	irq_source_id = find_first_zero_bit(bitmap,
 				sizeof(kvm->arch.irq_sources_bitmap));
 
 	if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
@@ -197,6 +208,7 @@ int kvm_request_irq_source_id(struct kvm *kvm)
 
 	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
 	set_bit(irq_source_id, bitmap);
+	mutex_unlock(&kvm->irq_lock);
 
 	return irq_source_id;
 }
@@ -207,6 +219,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 
 	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
 
+	mutex_lock(&kvm->irq_lock);
 	if (irq_source_id < 0 ||
 	    irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
 		printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
@@ -215,19 +228,24 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
 		clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
 	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+	mutex_unlock(&kvm->irq_lock);
 }
 
 void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
 				    struct kvm_irq_mask_notifier *kimn)
 {
+	mutex_lock(&kvm->irq_lock);
 	kimn->irq = irq;
 	hlist_add_head(&kimn->link, &kvm->mask_notifier_list);
+	mutex_unlock(&kvm->irq_lock);
 }
 
 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn)
 {
+	mutex_lock(&kvm->irq_lock);
 	hlist_del(&kimn->link);
+	mutex_unlock(&kvm->irq_lock);
 }
 
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -235,6 +253,8 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
 	struct kvm_irq_mask_notifier *kimn;
 	struct hlist_node *n;
 
+	WARN_ON(!mutex_is_locked(&kvm->irq_lock));
+
 	hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
 		if (kimn->irq == irq)
 			kimn->func(kimn, mask);
@@ -250,7 +270,9 @@ static void __kvm_free_irq_routing(struct list_head *irq_routing)
 
 void kvm_free_irq_routing(struct kvm *kvm)
 {
+	mutex_lock(&kvm->irq_lock);
 	__kvm_free_irq_routing(&kvm->irq_routing);
+	mutex_unlock(&kvm->irq_lock);
 }
 
 static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
@@ -325,13 +347,13 @@ int kvm_set_irq_routing(struct kvm *kvm,
 		e = NULL;
 	}
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->irq_lock);
 	list_splice(&kvm->irq_routing, &tmp);
 	INIT_LIST_HEAD(&kvm->irq_routing);
 	list_splice(&irq_list, &kvm->irq_routing);
 	INIT_LIST_HEAD(&irq_list);
 	list_splice(&tmp, &irq_list);
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->irq_lock);
 
 	r = 0;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d47e660fb709..0d481b282448 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -62,6 +62,12 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+/*
+ * Ordering of locks:
+ *
+ * 		kvm->lock --> kvm->irq_lock
+ */
+
 DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
@@ -126,11 +132,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 				    interrupt_work);
 	kvm = assigned_dev->kvm;
 
-	/* This is taken to safely inject irq inside the guest. When
-	 * the interrupt injection (or the ioapic code) uses a
-	 * finer-grained lock, update this
-	 */
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->irq_lock);
 	spin_lock_irq(&assigned_dev->assigned_dev_lock);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		struct kvm_guest_msix_entry *guest_entries =
@@ -149,7 +151,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 			    assigned_dev->guest_irq, 1);
 
 	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
-	mutex_unlock(&assigned_dev->kvm->lock);
+	mutex_unlock(&assigned_dev->kvm->irq_lock);
 }
 
 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
@@ -207,7 +209,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 static void deassign_guest_irq(struct kvm *kvm,
 			       struct kvm_assigned_dev_kernel *assigned_dev)
 {
-	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
+	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
 	assigned_dev->ack_notifier.gsi = -1;
 
 	if (assigned_dev->irq_source_id != -1)
-- 
cgit v1.2.3-71-gd317


From 6a4a98397331723dce25a7537270548d91523431 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 9 Jun 2009 11:33:36 +0300
Subject: KVM: Reorder ioctls in kvm.h

Somehow the VM ioctls got unsorted; resort.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 632a8560dbcf..f1dada0519eb 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -495,11 +495,6 @@ struct kvm_irqfd {
  * ioctls for VM fds
  */
 #define KVM_SET_MEMORY_REGION     _IOW(KVMIO, 0x40, struct kvm_memory_region)
-#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO, 0x44)
-#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO, 0x45)
-#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
-					struct kvm_userspace_memory_region)
-#define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
@@ -507,6 +502,11 @@ struct kvm_irqfd {
 #define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO, 0x44)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO, 0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
+					struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
 /* Device model IOC */
 #define KVM_CREATE_IRQCHIP	  _IO(KVMIO,  0x60)
 #define KVM_IRQ_LINE		  _IOW(KVMIO, 0x61, struct kvm_irq_level)
-- 
cgit v1.2.3-71-gd317


From c5af89b68abb26eea5e745f33228f4d672f115e5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 9 Jun 2009 15:56:26 +0300
Subject: KVM: Introduce kvm_vcpu_is_bsp() function.

Use it instead of open code "vcpu_id zero is BSP" assumption.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 2 +-
 arch/ia64/kvm/vcpu.c     | 2 +-
 arch/x86/kvm/i8254.c     | 4 ++--
 arch/x86/kvm/i8259.c     | 6 +++---
 arch/x86/kvm/lapic.c     | 7 ++++---
 arch/x86/kvm/svm.c       | 4 ++--
 arch/x86/kvm/vmx.c       | 6 +++---
 arch/x86/kvm/x86.c       | 4 ++--
 include/linux/kvm_host.h | 5 +++++
 virt/kvm/ioapic.c        | 4 +++-
 virt/kvm/kvm_main.c      | 2 ++
 11 files changed, 28 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 8dde36953af3..4082665ace0a 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1216,7 +1216,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (IS_ERR(vmm_vcpu))
 		return PTR_ERR(vmm_vcpu);
 
-	if (vcpu->vcpu_id == 0) {
+	if (kvm_vcpu_is_bsp(vcpu)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
 		/*Set entry address for first run.*/
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index cc406d064a09..61a3320b62c1 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -830,7 +830,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
 
 	kvm = (struct kvm *)KVM_VM_BASE;
 
-	if (vcpu->vcpu_id == 0) {
+	if (kvm_vcpu_is_bsp(vcpu)) {
 		for (i = 0; i < kvm->arch.online_vcpus; i++) {
 			v = (struct kvm_vcpu *)((char *)vcpu +
 					sizeof(struct kvm_vcpu_data) * i);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3837db65d33e..008a83185067 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
 
-	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
+	if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
 		return atomic_read(&pit->pit_state.pit_timer.pending);
 	return 0;
 }
@@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
 	struct hrtimer *timer;
 
-	if (vcpu->vcpu_id != 0 || !pit)
+	if (!kvm_vcpu_is_bsp(vcpu) || !pit)
 		return;
 
 	timer = &pit->pit_state.pit_timer.timer;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index bf94a45f4f86..148c52a608d6 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -57,7 +57,7 @@ static void pic_unlock(struct kvm_pic *s)
 	}
 
 	if (wakeup) {
-		vcpu = s->kvm->vcpus[0];
+		vcpu = s->kvm->bsp_vcpu;
 		if (vcpu)
 			kvm_vcpu_kick(vcpu);
 	}
@@ -254,7 +254,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 {
 	int irq, irqbase, n;
 	struct kvm *kvm = s->pics_state->irq_request_opaque;
-	struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+	struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
 
 	if (s == &s->pics_state->pics[0])
 		irqbase = 0;
@@ -512,7 +512,7 @@ static void picdev_read(struct kvm_io_device *this,
 static void pic_irq_request(void *opaque, int level)
 {
 	struct kvm *kvm = opaque;
-	struct kvm_vcpu *vcpu = kvm->vcpus[0];
+	struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
 	struct kvm_pic *s = pic_irqchip(kvm);
 	int irq = pic_get_irq(&s->pics[0]);
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 44f20cdb5709..b0661300eb28 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -793,7 +793,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 		vcpu->arch.apic_base = value;
 		return;
 	}
-	if (apic->vcpu->vcpu_id)
+
+	if (!kvm_vcpu_is_bsp(apic->vcpu))
 		value &= ~MSR_IA32_APICBASE_BSP;
 
 	vcpu->arch.apic_base = value;
@@ -844,7 +845,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	}
 	update_divide_count(apic);
 	atomic_set(&apic->lapic_timer.pending, 0);
-	if (vcpu->vcpu_id == 0)
+	if (kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	apic_update_ppr(apic);
 
@@ -985,7 +986,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
 	int r = 0;
 
-	if (vcpu->vcpu_id == 0) {
+	if (kvm_vcpu_is_bsp(vcpu)) {
 		if (!apic_hw_enabled(vcpu->arch.apic))
 			r = 1;
 		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7749b0692cb2..28b981409a8f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -642,7 +642,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	init_vmcb(svm);
 
-	if (vcpu->vcpu_id != 0) {
+	if (!kvm_vcpu_is_bsp(vcpu)) {
 		kvm_rip_write(vcpu, 0);
 		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
 		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
@@ -706,7 +706,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	fx_init(&svm->vcpu);
 	svm->vcpu.fpu_active = 1;
 	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-	if (svm->vcpu.vcpu_id == 0)
+	if (kvm_vcpu_is_bsp(&svm->vcpu))
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
 	return &svm->vcpu;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ae682929a642..c08bb4cf372e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2411,7 +2411,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	kvm_set_cr8(&vmx->vcpu, 0);
 	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-	if (vmx->vcpu.vcpu_id == 0)
+	if (kvm_vcpu_is_bsp(&vmx->vcpu))
 		msr |= MSR_IA32_APICBASE_BSP;
 	kvm_set_apic_base(&vmx->vcpu, msr);
 
@@ -2422,7 +2422,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
 	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
 	 */
-	if (vmx->vcpu.vcpu_id == 0) {
+	if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
 		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
 		vmcs_writel(GUEST_CS_BASE, 0x000f0000);
 	} else {
@@ -2451,7 +2451,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
-	if (vmx->vcpu.vcpu_id == 0)
+	if (kvm_vcpu_is_bsp(&vmx->vcpu))
 		kvm_rip_write(vcpu, 0xfff0);
 	else
 		kvm_rip_write(vcpu, 0);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e877efa37620..d8adc1da76dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4330,7 +4330,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
 	/* Older userspace won't unhalt the vcpu on reset. */
-	if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
 	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
 	    !(vcpu->arch.cr0 & X86_CR0_PE))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4601,7 +4601,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm = vcpu->kvm;
 
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
 		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a29ea030dd8e..a5bd429e9bd3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -131,6 +131,7 @@ struct kvm {
 	int nmemslots;
 	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
 					KVM_PRIVATE_MEM_SLOTS];
+	struct kvm_vcpu *bsp_vcpu;
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	struct list_head vm_list;
 	struct mutex lock;
@@ -549,4 +550,8 @@ static inline void kvm_irqfd_release(struct kvm *kvm) {}
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
+static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->bsp_vcpu == vcpu;
+}
 #endif
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 2a5667173995..0532fa68f5d1 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -164,7 +164,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 	/* Always delivery PIT interrupt to vcpu 0 */
 	if (irq == 0) {
 		irqe.dest_mode = 0; /* Physical mode. */
-		irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id;
+		/* need to read apic_id from apic regiest since
+		 * it can be rewritten */
+		irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
 	}
 #endif
 	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0d481b282448..0d54edecbc70 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1747,6 +1747,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 		goto vcpu_destroy;
 	}
 	kvm->vcpus[n] = vcpu;
+	if (n == 0)
+		kvm->bsp_vcpu = vcpu;
 	mutex_unlock(&kvm->lock);
 
 	/* Now it's all set up, let userspace reach it */
-- 
cgit v1.2.3-71-gd317


From 73880c80aa9c8dc353cd0ad26579023213cd5314 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 9 Jun 2009 15:56:28 +0300
Subject: KVM: Break dependency between vcpu index in vcpus array and vcpu_id.

Archs are free to use vcpu_id as they see fit. For x86 it is used as
vcpu's apic id. New ioctl is added to configure boot vcpu id that was
assumed to be 0 till now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |  1 -
 arch/ia64/kvm/Kconfig            |  1 +
 arch/ia64/kvm/kvm-ia64.c         |  8 ++---
 arch/ia64/kvm/vcpu.c             |  2 +-
 arch/x86/kvm/Kconfig             |  1 +
 include/linux/kvm.h              |  2 ++
 include/linux/kvm_host.h         |  6 ++++
 virt/kvm/Kconfig                 |  3 ++
 virt/kvm/kvm_main.c              | 64 ++++++++++++++++++++++++----------------
 9 files changed, 55 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 5f43697aed30..9cf1c4b1f92f 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -465,7 +465,6 @@ struct kvm_arch {
 	unsigned long	metaphysical_rr4;
 	unsigned long	vmm_init_rr;
 
-	int		online_vcpus;
 	int		is_sn2;
 
 	struct kvm_ioapic *vioapic;
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index f922bbba3797..cbadd8a65233 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -25,6 +25,7 @@ config KVM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
+	select KVM_APIC_ARCHITECTURE
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 4082665ace0a..d1f7bcda2c7f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -338,7 +338,7 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
 	union ia64_lid lid;
 	int i;
 
-	for (i = 0; i < kvm->arch.online_vcpus; i++) {
+	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
 		if (kvm->vcpus[i]) {
 			lid.val = VCPU_LID(kvm->vcpus[i]);
 			if (lid.id == id && lid.eid == eid)
@@ -412,7 +412,7 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	call_data.ptc_g_data = p->u.ptc_g_data;
 
-	for (i = 0; i < kvm->arch.online_vcpus; i++) {
+	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
 		if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state ==
 						KVM_MP_STATE_UNINITIALIZED ||
 					vcpu == kvm->vcpus[i])
@@ -852,8 +852,6 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	kvm_init_vm(kvm);
 
-	kvm->arch.online_vcpus = 0;
-
 	return kvm;
 
 }
@@ -1356,8 +1354,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 		goto fail;
 	}
 
-	kvm->arch.online_vcpus++;
-
 	return vcpu;
 fail:
 	return ERR_PTR(r);
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index 61a3320b62c1..dce75b70cdd5 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -831,7 +831,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
 	kvm = (struct kvm *)KVM_VM_BASE;
 
 	if (kvm_vcpu_is_bsp(vcpu)) {
-		for (i = 0; i < kvm->arch.online_vcpus; i++) {
+		for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
 			v = (struct kvm_vcpu *)((char *)vcpu +
 					sizeof(struct kvm_vcpu_data) * i);
 			VMX(v, itc_offset) = itc_offset;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8cd2a4efe238..7fbedfd34d6c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_EVENTFD
+	select KVM_APIC_ARCHITECTURE
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index f1dada0519eb..5037e170a70d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -430,6 +430,7 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_PIT
 #define KVM_CAP_PIT2 33
 #endif
+#define KVM_CAP_SET_BOOT_CPU_ID 34
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -537,6 +538,7 @@ struct kvm_irqfd {
 #define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 #define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
 #define KVM_CREATE_PIT2		   _IOW(KVMIO, 0x77, struct kvm_pit_config)
+#define KVM_SET_BOOT_CPU_ID        _IO(KVMIO, 0x78)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a5bd429e9bd3..d3fdf1a738c9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -131,8 +131,12 @@ struct kvm {
 	int nmemslots;
 	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
 					KVM_PRIVATE_MEM_SLOTS];
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	u32 bsp_vcpu_id;
 	struct kvm_vcpu *bsp_vcpu;
+#endif
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	atomic_t online_vcpus;
 	struct list_head vm_list;
 	struct mutex lock;
 	struct kvm_io_bus mmio_bus;
@@ -550,8 +554,10 @@ static inline void kvm_irqfd_release(struct kvm *kvm) {}
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
 static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->bsp_vcpu == vcpu;
 }
 #endif
+#endif
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 56c6848d2df8..daece36c0a57 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -9,3 +9,6 @@ config HAVE_KVM_IRQCHIP
 config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
+
+config KVM_APIC_ARCHITECTURE
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0d54edecbc70..25e1f9c97b1a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -689,11 +689,6 @@ out:
 }
 #endif
 
-static inline int valid_vcpu(int n)
-{
-	return likely(n >= 0 && n < KVM_MAX_VCPUS);
-}
-
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -1714,24 +1709,18 @@ static struct file_operations kvm_vcpu_fops = {
  */
 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 {
-	int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
-	if (fd < 0)
-		kvm_put_kvm(vcpu->kvm);
-	return fd;
+	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
 }
 
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 {
 	int r;
 	struct kvm_vcpu *vcpu;
 
-	if (!valid_vcpu(n))
-		return -EINVAL;
-
-	vcpu = kvm_arch_vcpu_create(kvm, n);
+	vcpu = kvm_arch_vcpu_create(kvm, id);
 	if (IS_ERR(vcpu))
 		return PTR_ERR(vcpu);
 
@@ -1742,25 +1731,38 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 		return r;
 
 	mutex_lock(&kvm->lock);
-	if (kvm->vcpus[n]) {
-		r = -EEXIST;
+	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
+		r = -EINVAL;
 		goto vcpu_destroy;
 	}
-	kvm->vcpus[n] = vcpu;
-	if (n == 0)
-		kvm->bsp_vcpu = vcpu;
-	mutex_unlock(&kvm->lock);
+
+	for (r = 0; r < atomic_read(&kvm->online_vcpus); r++)
+		if (kvm->vcpus[r]->vcpu_id == id) {
+			r = -EEXIST;
+			goto vcpu_destroy;
+		}
+
+	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
 
 	/* Now it's all set up, let userspace reach it */
 	kvm_get_kvm(kvm);
 	r = create_vcpu_fd(vcpu);
-	if (r < 0)
-		goto unlink;
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		goto vcpu_destroy;
+	}
+
+	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+	smp_wmb();
+	atomic_inc(&kvm->online_vcpus);
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	if (kvm->bsp_vcpu_id == id)
+		kvm->bsp_vcpu = vcpu;
+#endif
+	mutex_unlock(&kvm->lock);
 	return r;
 
-unlink:
-	mutex_lock(&kvm->lock);
-	kvm->vcpus[n] = NULL;
 vcpu_destroy:
 	mutex_unlock(&kvm->lock);
 	kvm_arch_vcpu_destroy(vcpu);
@@ -2233,6 +2235,15 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
 		break;
 	}
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	case KVM_SET_BOOT_CPU_ID:
+		r = 0;
+		if (atomic_read(&kvm->online_vcpus) != 0)
+			r = -EBUSY;
+		else
+			kvm->bsp_vcpu_id = arg;
+		break;
+#endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
@@ -2299,6 +2310,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
 	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	case KVM_CAP_SET_BOOT_CPU_ID:
+#endif
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQ_ROUTING:
-- 
cgit v1.2.3-71-gd317


From 988a2cae6a3c0dea6df59808a935a9a697bfc28c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 9 Jun 2009 15:56:29 +0300
Subject: KVM: Use macro to iterate over vcpus.

[christian: remove unused variables on s390]

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c   | 29 ++++++++++++++---------------
 arch/powerpc/kvm/powerpc.c | 16 ++++++++++------
 arch/s390/kvm/kvm-s390.c   | 31 +++++++++++++++----------------
 arch/x86/kvm/i8254.c       |  7 ++-----
 arch/x86/kvm/mmu.c         |  6 +++---
 arch/x86/kvm/x86.c         | 25 ++++++++++++-------------
 include/linux/kvm_host.h   | 11 +++++++++++
 virt/kvm/irq_comm.c        |  6 ++----
 virt/kvm/kvm_main.c        | 19 +++++++------------
 9 files changed, 76 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d1f7bcda2c7f..5c766bd82b05 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -337,13 +337,12 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
 {
 	union ia64_lid lid;
 	int i;
+	struct kvm_vcpu *vcpu;
 
-	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
-		if (kvm->vcpus[i]) {
-			lid.val = VCPU_LID(kvm->vcpus[i]);
-			if (lid.id == id && lid.eid == eid)
-				return kvm->vcpus[i];
-		}
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		lid.val = VCPU_LID(vcpu);
+		if (lid.id == id && lid.eid == eid)
+			return vcpu;
 	}
 
 	return NULL;
@@ -409,21 +408,21 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct kvm *kvm = vcpu->kvm;
 	struct call_data call_data;
 	int i;
+	struct kvm_vcpu *vcpui;
 
 	call_data.ptc_g_data = p->u.ptc_g_data;
 
-	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
-		if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state ==
-						KVM_MP_STATE_UNINITIALIZED ||
-					vcpu == kvm->vcpus[i])
+	kvm_for_each_vcpu(i, vcpui, kvm) {
+		if (vcpui->arch.mp_state == KVM_MP_STATE_UNINITIALIZED ||
+				vcpu == vcpui)
 			continue;
 
-		if (waitqueue_active(&kvm->vcpus[i]->wq))
-			wake_up_interruptible(&kvm->vcpus[i]->wq);
+		if (waitqueue_active(&vcpui->wq))
+			wake_up_interruptible(&vcpui->wq);
 
-		if (kvm->vcpus[i]->cpu != -1) {
-			call_data.vcpu = kvm->vcpus[i];
-			smp_call_function_single(kvm->vcpus[i]->cpu,
+		if (vcpui->cpu != -1) {
+			call_data.vcpu = vcpui;
+			smp_call_function_single(vcpui->cpu,
 					vcpu_global_purge, &call_data, 1);
 		} else
 			printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n");
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2cf915e51e7e..7ad30e0a1b9a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -122,13 +122,17 @@ struct kvm *kvm_arch_create_vm(void)
 static void kvmppc_free_vcpus(struct kvm *kvm)
 {
 	unsigned int i;
+	struct kvm_vcpu *vcpu;
 
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			kvm_arch_vcpu_free(kvm->vcpus[i]);
-			kvm->vcpus[i] = NULL;
-		}
-	}
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_arch_vcpu_free(vcpu);
+
+	mutex_lock(&kvm->lock);
+	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+		kvm->vcpus[i] = NULL;
+
+	atomic_set(&kvm->online_vcpus, 0);
+	mutex_unlock(&kvm->lock);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 098bfa6fbdf6..07ced89740d7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -211,13 +211,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 static void kvm_free_vcpus(struct kvm *kvm)
 {
 	unsigned int i;
+	struct kvm_vcpu *vcpu;
 
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			kvm_arch_vcpu_destroy(kvm->vcpus[i]);
-			kvm->vcpus[i] = NULL;
-		}
-	}
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_arch_vcpu_destroy(vcpu);
+
+	mutex_lock(&kvm->lock);
+	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+		kvm->vcpus[i] = NULL;
+
+	atomic_set(&kvm->online_vcpus, 0);
+	mutex_unlock(&kvm->lock);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -314,8 +318,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	BUG_ON(!kvm->arch.sca);
 	if (!kvm->arch.sca->cpu[id].sda)
 		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
-	else
-		BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */
 	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
 	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
 
@@ -683,6 +685,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				int user_alloc)
 {
 	int i;
+	struct kvm_vcpu *vcpu;
 
 	/* A few sanity checks. We can have exactly one memory slot which has
 	   to start at guest virtual zero and which has to be located at a
@@ -707,14 +710,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		return -EINVAL;
 
 	/* request update of sie control block for all available vcpus */
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			if (test_and_set_bit(KVM_REQ_MMU_RELOAD,
-						&kvm->vcpus[i]->requests))
-				continue;
-			kvm_s390_inject_sigp_stop(kvm->vcpus[i],
-						  ACTION_RELOADVCPU_ON_STOP);
-		}
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+			continue;
+		kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP);
 	}
 
 	return 0;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 06d8f84ae8a2..15fc95b2fc05 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -669,11 +669,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
 	 * VCPU0, and only if its LVT0 is in EXTINT mode.
 	 */
 	if (kvm->arch.vapics_in_nmi_mode > 0)
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (vcpu)
-				kvm_apic_nmi_wd_deliver(vcpu);
-		}
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvm_apic_nmi_wd_deliver(vcpu);
 }
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d443a421ca3e..5f97dbd24291 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1347,10 +1347,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
 {
 	int i;
+	struct kvm_vcpu *vcpu;
 
-	for (i = 0; i < KVM_MAX_VCPUS; ++i)
-		if (kvm->vcpus[i])
-			kvm->vcpus[i]->arch.last_pte_updated = NULL;
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		vcpu->arch.last_pte_updated = NULL;
 }
 
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d8adc1da76dd..89862a80e32c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2946,10 +2946,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 
 	spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
+		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != freq->cpu)
 				continue;
 			if (!kvm_request_guest_time_update(vcpu))
@@ -4678,20 +4675,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 static void kvm_free_vcpus(struct kvm *kvm)
 {
 	unsigned int i;
+	struct kvm_vcpu *vcpu;
 
 	/*
 	 * Unpin any mmu pages first.
 	 */
-	for (i = 0; i < KVM_MAX_VCPUS; ++i)
-		if (kvm->vcpus[i])
-			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			kvm_arch_vcpu_free(kvm->vcpus[i]);
-			kvm->vcpus[i] = NULL;
-		}
-	}
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_unload_vcpu_mmu(vcpu);
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_arch_vcpu_free(vcpu);
+
+	mutex_lock(&kvm->lock);
+	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+		kvm->vcpus[i] = NULL;
 
+	atomic_set(&kvm->online_vcpus, 0);
+	mutex_unlock(&kvm->lock);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d3fdf1a738c9..c6e4d02067fe 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -179,6 +179,17 @@ struct kvm {
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
 
+static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
+{
+	smp_rmb();
+	return kvm->vcpus[i];
+}
+
+#define kvm_for_each_vcpu(idx, vcpup, kvm) \
+	for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \
+	     idx < atomic_read(&kvm->online_vcpus) && vcpup; \
+	     vcpup = kvm_get_vcpu(kvm, ++idx))
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 08a9a49481b2..bb8a1b5e41c1 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -68,10 +68,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 			kvm_is_dm_lowest_prio(irq))
 		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
 
-	for (i = 0; i < KVM_MAX_VCPUS; i++) {
-		vcpu = kvm->vcpus[i];
-
-		if (!vcpu || !kvm_apic_present(vcpu))
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!kvm_apic_present(vcpu))
 			continue;
 
 		if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25e1f9c97b1a..777fe533cfe7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -738,10 +738,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 	me = get_cpu();
 	spin_lock(&kvm->requests_lock);
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		vcpu = kvm->vcpus[i];
-		if (!vcpu)
-			continue;
+	kvm_for_each_vcpu(i, vcpu, kvm) {
 		if (test_and_set_bit(req, &vcpu->requests))
 			continue;
 		cpu = vcpu->cpu;
@@ -1718,7 +1715,7 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 {
 	int r;
-	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu *vcpu, *v;
 
 	vcpu = kvm_arch_vcpu_create(kvm, id);
 	if (IS_ERR(vcpu))
@@ -1736,8 +1733,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		goto vcpu_destroy;
 	}
 
-	for (r = 0; r < atomic_read(&kvm->online_vcpus); r++)
-		if (kvm->vcpus[r]->vcpu_id == id) {
+	kvm_for_each_vcpu(r, v, kvm)
+		if (v->vcpu_id == id) {
 			r = -EEXIST;
 			goto vcpu_destroy;
 		}
@@ -2526,11 +2523,9 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	*val = 0;
 	spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (vcpu)
-				*val += *(u32 *)((void *)vcpu + offset);
-		}
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			*val += *(u32 *)((void *)vcpu + offset);
+
 	spin_unlock(&kvm_lock);
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From 3f5d18a96577fd78277e08c467041573b9a65eaf Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 11 Jun 2009 15:43:28 +0300
Subject: KVM: Return to userspace on emulation failure

Instead of mindlessly retrying to execute the instruction, report the
failure to userspace.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c  | 5 +++--
 include/linux/kvm.h | 7 +++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5f97dbd24291..b6e4cda77047 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2673,8 +2673,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 		++vcpu->stat.mmio_exits;
 		return 0;
 	case EMULATE_FAIL:
-		kvm_report_emulation_failure(vcpu, "pagetable");
-		return 1;
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		return 0;
 	default:
 		BUG();
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 5037e170a70d..671051829da6 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -95,6 +95,10 @@ struct kvm_pit_config {
 #define KVM_EXIT_S390_RESET       14
 #define KVM_EXIT_DCR              15
 #define KVM_EXIT_NMI              16
+#define KVM_EXIT_INTERNAL_ERROR   17
+
+/* For KVM_EXIT_INTERNAL_ERROR */
+#define KVM_INTERNAL_ERROR_EMULATION 1
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
@@ -181,6 +185,9 @@ struct kvm_run {
 			__u32 data;
 			__u8  is_write;
 		} dcr;
+		struct {
+			__u32 suberror;
+		} internal;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
cgit v1.2.3-71-gd317


From 54dee9933e8d93589ad63ec3d6be39f1921b0767 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 11 Jun 2009 12:07:44 -0300
Subject: KVM: VMX: conditionally disable 2M pages

Disable usage of 2M pages if VMX_EPT_2MB_PAGE_BIT (bit 16) is clear
in MSR_IA32_VMX_EPT_VPID_CAP and EPT is enabled.

[avi: s/largepages_disabled/largepages_enabled/ to avoid negative logic]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c       |  3 +++
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 14 ++++++++++++--
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 94c07ada103e..fc8d49c6bc51 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1381,6 +1381,9 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_tpr_shadow())
 		kvm_x86_ops->update_cr8_intercept = NULL;
 
+	if (enable_ept && !cpu_has_vmx_ept_2m_page())
+		kvm_disable_largepages();
+
 	return alloc_kvm_area();
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c6e4d02067fe..6988858dc56e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -224,6 +224,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
 				int user_alloc);
+void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 777fe533cfe7..48d5e697bf44 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -85,6 +85,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 
 static bool kvm_rebooting;
 
+static bool largepages_enabled = true;
+
 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -1174,9 +1176,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		ugfn = new.userspace_addr >> PAGE_SHIFT;
 		/*
 		 * If the gfn and userspace address are not aligned wrt each
-		 * other, disable large page support for this slot
+		 * other, or if explicitly asked to, disable large page
+		 * support for this slot
 		 */
-		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1))
+		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1) ||
+		    !largepages_enabled)
 			for (i = 0; i < largepages; ++i)
 				new.lpage_info[i].write_count = 1;
 	}
@@ -1291,6 +1295,12 @@ out:
 	return r;
 }
 
+void kvm_disable_largepages(void)
+{
+	largepages_enabled = false;
+}
+EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+
 int is_error_page(struct page *page)
 {
 	return page == bad_page;
-- 
cgit v1.2.3-71-gd317


From ec04b2604c3707a46db1d26d98f82b11d0844669 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Jun 2009 15:16:23 +0200
Subject: KVM: Prepare memslot data structures for multiple hugepage sizes

[avi: fix build on non-x86]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h    |  3 +-
 arch/powerpc/include/asm/kvm_host.h |  3 +-
 arch/s390/include/asm/kvm_host.h    |  6 +++-
 arch/x86/include/asm/kvm_host.h     | 12 ++++----
 arch/x86/kvm/mmu.c                  | 30 ++++++++++----------
 arch/x86/kvm/paging_tmpl.h          |  3 +-
 include/linux/kvm_host.h            |  2 +-
 virt/kvm/kvm_main.c                 | 56 ++++++++++++++++++++++++++-----------
 8 files changed, 73 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 9cf1c4b1f92f..d9b6325a9328 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -235,7 +235,8 @@ struct kvm_vm_data {
 #define KVM_REQ_PTC_G		32
 #define KVM_REQ_RESUME		33
 
-#define KVM_PAGES_PER_HPAGE	1
+#define KVM_NR_PAGE_SIZES	1
+#define KVM_PAGES_PER_HPAGE(x)	1
 
 struct kvm;
 struct kvm_vcpu;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d4caa6127f55..c9c930ed11d7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -34,7 +34,8 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
 /* We don't currently support large pages. */
-#define KVM_PAGES_PER_HPAGE (1UL << 31)
+#define KVM_NR_PAGE_SIZES	1
+#define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
 struct kvm;
 struct kvm_run;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 75535d4d7a05..78e07a622b45 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -40,7 +40,11 @@ struct sca_block {
 	struct sca_entry cpu[64];
 } __attribute__((packed));
 
-#define KVM_PAGES_PER_HPAGE 256
+#define KVM_NR_PAGE_SIZES 2
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
 #define CPUSTAT_HOST       0x80000000
 #define CPUSTAT_WAIT       0x10000000
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 19027ab20412..30b625d8e5f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -54,12 +54,12 @@
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
-/* shadow tables are PAE even on non-PAE hosts */
-#define KVM_HPAGE_SHIFT 21
-#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
-#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
-
-#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES	2
+#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
 #define DE_VECTOR 0
 #define DB_VECTOR 1
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 12974de88aa5..b67585c1ef08 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -394,9 +394,9 @@ static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
 {
 	unsigned long idx;
 
-	idx = (gfn / KVM_PAGES_PER_HPAGE) -
-	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
-	return &slot->lpage_info[idx].write_count;
+	idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL));
+	return &slot->lpage_info[0][idx].write_count;
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -485,10 +485,10 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 	if (!lpage)
 		return &slot->rmap[gfn - slot->base_gfn];
 
-	idx = (gfn / KVM_PAGES_PER_HPAGE) -
-	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+	idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL));
 
-	return &slot->lpage_info[idx].rmap_pde;
+	return &slot->lpage_info[0][idx].rmap_pde;
 }
 
 /*
@@ -731,11 +731,11 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 		end = start + (memslot->npages << PAGE_SHIFT);
 		if (hva >= start && hva < end) {
 			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			int idx = gfn_offset /
+			          KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL);
 			retval |= handler(kvm, &memslot->rmap[gfn_offset]);
 			retval |= handler(kvm,
-					  &memslot->lpage_info[
-						  gfn_offset /
-						  KVM_PAGES_PER_HPAGE].rmap_pde);
+					&memslot->lpage_info[0][idx].rmap_pde);
 		}
 	}
 
@@ -1876,8 +1876,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn_t pfn;
 	unsigned long mmu_seq;
 
-	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
-		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+	if (is_largepage_backed(vcpu, gfn &
+			~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
 		largepage = 1;
 	}
 
@@ -2082,8 +2083,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	if (r)
 		return r;
 
-	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
-		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+	if (is_largepage_backed(vcpu, gfn &
+			~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
 		largepage = 1;
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
@@ -2485,7 +2487,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
 	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
-		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
 	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 322e8113aeea..53e129cec5fd 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -401,7 +401,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	if (walker.level == PT_DIRECTORY_LEVEL) {
 		gfn_t large_gfn;
-		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+		large_gfn = walker.gfn &
+			~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1);
 		if (is_largepage_backed(vcpu, large_gfn)) {
 			walker.gfn = large_gfn;
 			largepage = 1;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6988858dc56e..06af936a250a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -103,7 +103,7 @@ struct kvm_memory_slot {
 	struct {
 		unsigned long rmap_pde;
 		int write_count;
-	} *lpage_info;
+	} *lpage_info[KVM_NR_PAGE_SIZES - 1];
 	unsigned long userspace_addr;
 	int user_alloc;
 };
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1da8072d61b1..8361662e7e0a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1001,19 +1001,25 @@ out:
 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 				  struct kvm_memory_slot *dont)
 {
+	int i;
+
 	if (!dont || free->rmap != dont->rmap)
 		vfree(free->rmap);
 
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 		vfree(free->dirty_bitmap);
 
-	if (!dont || free->lpage_info != dont->lpage_info)
-		vfree(free->lpage_info);
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
+			vfree(free->lpage_info[i]);
+			free->lpage_info[i] = NULL;
+		}
+	}
 
 	free->npages = 0;
 	free->dirty_bitmap = NULL;
 	free->rmap = NULL;
-	free->lpage_info = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -1087,7 +1093,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	int r;
 	gfn_t base_gfn;
 	unsigned long npages, ugfn;
-	unsigned long largepages, i;
+	int lpages;
+	unsigned long i, j;
 	struct kvm_memory_slot *memslot;
 	struct kvm_memory_slot old, new;
 
@@ -1161,33 +1168,48 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		else
 			new.userspace_addr = 0;
 	}
-	if (npages && !new.lpage_info) {
-		largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE;
-		largepages -= base_gfn / KVM_PAGES_PER_HPAGE;
+	if (!npages)
+		goto skip_lpage;
 
-		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		int level = i + 2;
 
-		if (!new.lpage_info)
+		/* Avoid unused variable warning if no large pages */
+		(void)level;
+
+		if (new.lpage_info[i])
+			continue;
+
+		lpages = 1 + (base_gfn + npages - 1) /
+			     KVM_PAGES_PER_HPAGE(level);
+		lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
+
+		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
+
+		if (!new.lpage_info[i])
 			goto out_free;
 
-		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
+		memset(new.lpage_info[i], 0,
+		       lpages * sizeof(*new.lpage_info[i]));
 
-		if (base_gfn % KVM_PAGES_PER_HPAGE)
-			new.lpage_info[0].write_count = 1;
-		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
-			new.lpage_info[largepages-1].write_count = 1;
+		if (base_gfn % KVM_PAGES_PER_HPAGE(level))
+			new.lpage_info[i][0].write_count = 1;
+		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
+			new.lpage_info[i][lpages - 1].write_count = 1;
 		ugfn = new.userspace_addr >> PAGE_SHIFT;
 		/*
 		 * If the gfn and userspace address are not aligned wrt each
 		 * other, or if explicitly asked to, disable large page
 		 * support for this slot
 		 */
-		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1) ||
+		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
 		    !largepages_enabled)
-			for (i = 0; i < largepages; ++i)
-				new.lpage_info[i].write_count = 1;
+			for (j = 0; j < lpages; ++j)
+				new.lpage_info[i][j].write_count = 1;
 	}
 
+skip_lpage:
+
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
 		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
-- 
cgit v1.2.3-71-gd317


From 2023a29cbe34139afcea8f65f8aef78c325c5dc0 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 18 Jun 2009 11:47:28 -0300
Subject: KVM: remove old KVMTRACE support code

Return EOPNOTSUPP for KVM_TRACE_ENABLE/PAUSE/DISABLE ioctls.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/Kconfig     |   3 -
 arch/powerpc/kvm/Kconfig  |  11 --
 arch/powerpc/kvm/Makefile |   2 -
 arch/s390/kvm/Kconfig     |   3 -
 arch/x86/kvm/Kconfig      |  12 --
 arch/x86/kvm/Makefile     |   1 -
 include/linux/kvm.h       |  31 +----
 include/linux/kvm_host.h  |  31 -----
 virt/kvm/kvm_main.c       |   3 +-
 virt/kvm/kvm_trace.c      | 285 ----------------------------------------------
 10 files changed, 2 insertions(+), 380 deletions(-)
 delete mode 100644 virt/kvm/kvm_trace.c

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index cbadd8a65233..ef3e7be29caf 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -47,9 +47,6 @@ config KVM_INTEL
 	  Provides support for KVM on Itanium 2 processors equipped with the VT
 	  extensions.
 
-config KVM_TRACE
-       bool
-
 source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 46019dccce1c..c29926846613 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -58,17 +58,6 @@ config KVM_E500
 
 	  If unsure, say N.
 
-config KVM_TRACE
-	bool "KVM trace support"
-	depends on KVM && MARKERS && SYSFS
-	select RELAY
-	select DEBUG_FS
-	default n
-	---help---
-	  This option allows reading a trace of kvm-related events through
-	  relayfs.  Note the ABI is not considered stable and will be
-	  modified in future updates.
-
 source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4f407f2662f7..37655fe19f2f 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -8,8 +8,6 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
 
 common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
-common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
-
 CFLAGS_44x_tlb.o  := -I.
 CFLAGS_e500_tlb.o := -I.
 CFLAGS_emulate.o  := -I.
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index ad75ce33be12..bf164fc21864 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -34,9 +34,6 @@ config KVM
 
 	  If unsure, say N.
 
-config KVM_TRACE
-       bool
-
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/virtio/Kconfig
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7fbedfd34d6c..b84e571f4175 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -62,18 +62,6 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
-config KVM_TRACE
-	bool "KVM trace support"
-	depends on KVM && SYSFS
-	select MARKERS
-	select RELAY
-	select DEBUG_FS
-	default n
-	---help---
-	  This option allows reading a trace of kvm-related events through
-	  relayfs.  Note the ABI is not considered stable and will be
-	  modified in future updates.
-
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 7c56850b82cb..afaaa7627d95 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,6 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o)
-kvm-$(CONFIG_KVM_TRACE)	+= $(addprefix ../../../virt/kvm/, kvm_trace.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 
 kvm-y			+= x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 671051829da6..76c640834ea6 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -14,7 +14,7 @@
 
 #define KVM_API_VERSION 12
 
-/* for KVM_TRACE_ENABLE */
+/* for KVM_TRACE_ENABLE, deprecated */
 struct kvm_user_trace_setup {
 	__u32 buf_size; /* sub_buffer size of each per-cpu */
 	__u32 buf_nr; /* the number of sub_buffers of each per-cpu */
@@ -325,35 +325,6 @@ struct kvm_guest_debug {
 #define KVM_TRC_CYCLE_SIZE      8
 #define KVM_TRC_EXTRA_MAX       7
 
-/* This structure represents a single trace buffer record. */
-struct kvm_trace_rec {
-	/* variable rec_val
-	 * is split into:
-	 * bits 0 - 27  -> event id
-	 * bits 28 -30  -> number of extra data args of size u32
-	 * bits 31      -> binary indicator for if tsc is in record
-	 */
-	__u32 rec_val;
-	__u32 pid;
-	__u32 vcpu_id;
-	union {
-		struct {
-			__u64 timestamp;
-			__u32 extra_u32[KVM_TRC_EXTRA_MAX];
-		} __attribute__((packed)) timestamp;
-		struct {
-			__u32 extra_u32[KVM_TRC_EXTRA_MAX];
-		} notimestamp;
-	} u;
-};
-
-#define TRACE_REC_EVENT_ID(val) \
-		(0x0fffffff & (val))
-#define TRACE_REC_NUM_DATA_ARGS(val) \
-		(0x70000000 & ((val) << 28))
-#define TRACE_REC_TCS(val) \
-		(0x80000000 & ((val) << 31))
-
 #define KVMIO 0xAE
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 06af936a250a..0604d56f6eed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -482,37 +482,6 @@ struct kvm_stats_debugfs_item {
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
-#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 5, d1, d2, d3, d4, d5)
-#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 4, d1, d2, d3, d4, 0)
-#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 3, d1, d2, d3, 0, 0)
-#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 2, d1, d2, 0, 0, 0)
-#define KVMTRACE_1D(evt, vcpu, d1, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 1, d1, 0, 0, 0, 0)
-#define KVMTRACE_0D(evt, vcpu, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 0, 0, 0, 0, 0, 0)
-
-#ifdef CONFIG_KVM_TRACE
-int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
-void kvm_trace_cleanup(void);
-#else
-static inline
-int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
-{
-	return -EINVAL;
-}
-#define kvm_trace_cleanup() ((void)0)
-#endif
-
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
 static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f1e2e8c373c6..bbb4029d7c4d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2398,7 +2398,7 @@ static long kvm_dev_ioctl(struct file *filp,
 	case KVM_TRACE_ENABLE:
 	case KVM_TRACE_PAUSE:
 	case KVM_TRACE_DISABLE:
-		r = kvm_trace_ioctl(ioctl, arg);
+		r = -EOPNOTSUPP;
 		break;
 	default:
 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
@@ -2748,7 +2748,6 @@ EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
-	kvm_trace_cleanup();
 	tracepoint_synchronize_unregister();
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
deleted file mode 100644
index f59874446440..000000000000
--- a/virt/kvm/kvm_trace.c
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * kvm trace
- *
- * It is designed to allow debugging traces of kvm to be generated
- * on UP / SMP machines.  Each trace entry can be timestamped so that
- * it's possible to reconstruct a chronological record of trace events.
- * The implementation refers to blktrace kernel support.
- *
- * Copyright (c) 2008 Intel Corporation
- * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
- *
- * Authors: Feng(Eric) Liu, eric.e.liu@intel.com
- *
- * Date:    Feb 2008
- */
-
-#include <linux/module.h>
-#include <linux/relay.h>
-#include <linux/debugfs.h>
-#include <linux/ktime.h>
-
-#include <linux/kvm_host.h>
-
-#define KVM_TRACE_STATE_RUNNING 	(1 << 0)
-#define KVM_TRACE_STATE_PAUSE 		(1 << 1)
-#define KVM_TRACE_STATE_CLEARUP 	(1 << 2)
-
-struct kvm_trace {
-	int trace_state;
-	struct rchan *rchan;
-	struct dentry *lost_file;
-	atomic_t lost_records;
-};
-static struct kvm_trace *kvm_trace;
-
-struct kvm_trace_probe {
-	const char *name;
-	const char *format;
-	u32 timestamp_in;
-	marker_probe_func *probe_func;
-};
-
-static inline int calc_rec_size(int timestamp, int extra)
-{
-	int rec_size = KVM_TRC_HEAD_SIZE;
-
-	rec_size += extra;
-	return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
-}
-
-static void kvm_add_trace(void *probe_private, void *call_data,
-			  const char *format, va_list *args)
-{
-	struct kvm_trace_probe *p = probe_private;
-	struct kvm_trace *kt = kvm_trace;
-	struct kvm_trace_rec rec;
-	struct kvm_vcpu *vcpu;
-	int    i, size;
-	u32    extra;
-
-	if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING))
-		return;
-
-	rec.rec_val	= TRACE_REC_EVENT_ID(va_arg(*args, u32));
-	vcpu		= va_arg(*args, struct kvm_vcpu *);
-	rec.pid		= current->tgid;
-	rec.vcpu_id	= vcpu->vcpu_id;
-
-	extra   	= va_arg(*args, u32);
-	WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX));
-	extra 		= min_t(u32, extra, KVM_TRC_EXTRA_MAX);
-
-	rec.rec_val |= TRACE_REC_TCS(p->timestamp_in)
-			| TRACE_REC_NUM_DATA_ARGS(extra);
-
-	if (p->timestamp_in) {
-		rec.u.timestamp.timestamp = ktime_to_ns(ktime_get());
-
-		for (i = 0; i < extra; i++)
-			rec.u.timestamp.extra_u32[i] = va_arg(*args, u32);
-	} else {
-		for (i = 0; i < extra; i++)
-			rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32);
-	}
-
-	size = calc_rec_size(p->timestamp_in, extra * sizeof(u32));
-	relay_write(kt->rchan, &rec, size);
-}
-
-static struct kvm_trace_probe kvm_trace_probes[] = {
-	{ "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace },
-	{ "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace },
-};
-
-static int lost_records_get(void *data, u64 *val)
-{
-	struct kvm_trace *kt = data;
-
-	*val = atomic_read(&kt->lost_records);
-	return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n");
-
-/*
- *  The relay channel is used in "no-overwrite" mode, it keeps trace of how
- *  many times we encountered a full subbuffer, to tell user space app the
- *  lost records there were.
- */
-static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
-				     void *prev_subbuf, size_t prev_padding)
-{
-	struct kvm_trace *kt;
-
-	if (!relay_buf_full(buf)) {
-		if (!prev_subbuf) {
-			/*
-			 * executed only once when the channel is opened
-			 * save metadata as first record
-			 */
-			subbuf_start_reserve(buf, sizeof(u32));
-			*(u32 *)subbuf = 0x12345678;
-		}
-
-		return 1;
-	}
-
-	kt = buf->chan->private_data;
-	atomic_inc(&kt->lost_records);
-
-	return 0;
-}
-
-static struct dentry *kvm_create_buf_file_callack(const char *filename,
-						 struct dentry *parent,
-						 int mode,
-						 struct rchan_buf *buf,
-						 int *is_global)
-{
-	return debugfs_create_file(filename, mode, parent, buf,
-				   &relay_file_operations);
-}
-
-static int kvm_remove_buf_file_callback(struct dentry *dentry)
-{
-	debugfs_remove(dentry);
-	return 0;
-}
-
-static struct rchan_callbacks kvm_relay_callbacks = {
-	.subbuf_start 		= kvm_subbuf_start_callback,
-	.create_buf_file 	= kvm_create_buf_file_callack,
-	.remove_buf_file 	= kvm_remove_buf_file_callback,
-};
-
-static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts)
-{
-	struct kvm_trace *kt;
-	int i, r = -ENOMEM;
-
-	if (!kuts->buf_size || !kuts->buf_nr)
-		return -EINVAL;
-
-	kt = kzalloc(sizeof(*kt), GFP_KERNEL);
-	if (!kt)
-		goto err;
-
-	r = -EIO;
-	atomic_set(&kt->lost_records, 0);
-	kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir,
-					    kt, &kvm_trace_lost_ops);
-	if (!kt->lost_file)
-		goto err;
-
-	kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size,
-				kuts->buf_nr, &kvm_relay_callbacks, kt);
-	if (!kt->rchan)
-		goto err;
-
-	kvm_trace = kt;
-
-	for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) {
-		struct kvm_trace_probe *p = &kvm_trace_probes[i];
-
-		r = marker_probe_register(p->name, p->format, p->probe_func, p);
-		if (r)
-			printk(KERN_INFO "Unable to register probe %s\n",
-			       p->name);
-	}
-
-	kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING;
-
-	return 0;
-err:
-	if (kt) {
-		if (kt->lost_file)
-			debugfs_remove(kt->lost_file);
-		if (kt->rchan)
-			relay_close(kt->rchan);
-		kfree(kt);
-	}
-	return r;
-}
-
-static int kvm_trace_enable(char __user *arg)
-{
-	struct kvm_user_trace_setup kuts;
-	int ret;
-
-	ret = copy_from_user(&kuts, arg, sizeof(kuts));
-	if (ret)
-		return -EFAULT;
-
-	ret = do_kvm_trace_enable(&kuts);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
-static int kvm_trace_pause(void)
-{
-	struct kvm_trace *kt = kvm_trace;
-	int r = -EINVAL;
-
-	if (kt == NULL)
-		return r;
-
-	if (kt->trace_state == KVM_TRACE_STATE_RUNNING) {
-		kt->trace_state = KVM_TRACE_STATE_PAUSE;
-		relay_flush(kt->rchan);
-		r = 0;
-	}
-
-	return r;
-}
-
-void kvm_trace_cleanup(void)
-{
-	struct kvm_trace *kt = kvm_trace;
-	int i;
-
-	if (kt == NULL)
-		return;
-
-	if (kt->trace_state == KVM_TRACE_STATE_RUNNING ||
-	    kt->trace_state == KVM_TRACE_STATE_PAUSE) {
-
-		kt->trace_state = KVM_TRACE_STATE_CLEARUP;
-
-		for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) {
-			struct kvm_trace_probe *p = &kvm_trace_probes[i];
-			marker_probe_unregister(p->name, p->probe_func, p);
-		}
-		marker_synchronize_unregister();
-
-		relay_close(kt->rchan);
-		debugfs_remove(kt->lost_file);
-		kfree(kt);
-	}
-}
-
-int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	long r = -EINVAL;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	switch (ioctl) {
-	case KVM_TRACE_ENABLE:
-		r = kvm_trace_enable(argp);
-		break;
-	case KVM_TRACE_PAUSE:
-		r = kvm_trace_pause();
-		break;
-	case KVM_TRACE_DISABLE:
-		r = 0;
-		kvm_trace_cleanup();
-		break;
-	}
-
-	return r;
-}
-- 
cgit v1.2.3-71-gd317


From d3efc8efdbaa81e9db3089810fcd526fccba234d Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 17 Jun 2009 10:07:59 -0300
Subject: KVM: use vcpu_id instead of bsp_vcpu pointer in kvm_vcpu_is_bsp

Change kvm_vcpu_is_bsp to use vcpu_id instead of bsp_vcpu pointer, which
is only initialized at the end of kvm_vm_ioctl_create_vcpu.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0604d56f6eed..4ea42c950539 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -538,7 +538,7 @@ static inline void kvm_irqfd_release(struct kvm *kvm) {}
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
 static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 {
-	return vcpu->kvm->bsp_vcpu == vcpu;
+	return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 }
 #endif
 #endif
-- 
cgit v1.2.3-71-gd317


From 6c474694530f377507f9aca438c17206e051e6e7 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 29 Jun 2009 22:24:26 +0300
Subject: KVM: convert bus to slots_lock

Use slots_lock to protect device list on the bus.  slots_lock is already
taken for read everywhere, so we only need to take it for write when
registering devices.  This is in preparation to removing in_range and
kvm->lock around it.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c      |  5 +++--
 arch/x86/kvm/i8259.c      |  2 +-
 include/linux/kvm_host.h  |  5 ++++-
 virt/kvm/coalesced_mmio.c |  2 +-
 virt/kvm/ioapic.c         |  2 +-
 virt/kvm/kvm_main.c       | 12 +++++++++++-
 6 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index bcd00c76d69e..4082cdd468ed 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -583,6 +583,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
 	.in_range = speaker_in_range,
 };
 
+/* Caller must have writers lock on slots_lock */
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
 	struct kvm_pit *pit;
@@ -621,11 +622,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 
 	kvm_iodevice_init(&pit->dev, &pit_dev_ops);
-	kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+	__kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
 
 	if (flags & KVM_PIT_SPEAKER_DUMMY) {
 		kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
-		kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+		__kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
 	}
 
 	return pit;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 148c52a608d6..1851aec8a7da 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -548,6 +548,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	 * Initialize PIO device
 	 */
 	kvm_iodevice_init(&s->dev, &picdev_ops);
-	kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
+	kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
 	return s;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ea42c950539..96c8c0b01929 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -42,6 +42,7 @@
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
+struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
 
@@ -61,7 +62,9 @@ void kvm_io_bus_init(struct kvm_io_bus *bus);
 void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
 					  gpa_t addr, int len, int is_write);
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+void __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+			       struct kvm_io_device *dev);
+void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
 			     struct kvm_io_device *dev);
 
 struct kvm_vcpu {
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index b40946c1237d..7b7cc9fe5ee3 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -102,7 +102,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
 	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
 	dev->kvm = kvm;
 	kvm->coalesced_mmio_dev = dev;
-	kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev);
+	kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev);
 
 	return 0;
 }
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 0532fa68f5d1..0eca54e06326 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -343,7 +343,7 @@ int kvm_ioapic_init(struct kvm *kvm)
 	kvm_ioapic_reset(ioapic);
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
 	ioapic->kvm = kvm;
-	kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
+	kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bbb4029d7c4d..0edc366ecf89 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2527,7 +2527,17 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
 	return NULL;
 }
 
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
+			     struct kvm_io_device *dev)
+{
+	down_write(&kvm->slots_lock);
+	__kvm_io_bus_register_dev(bus, dev);
+	up_write(&kvm->slots_lock);
+}
+
+/* An unlocked version. Caller must have write lock on slots_lock. */
+void __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+			     struct kvm_io_device *dev)
 {
 	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
 
-- 
cgit v1.2.3-71-gd317


From bda9020e2463ec94db9f97e8615f3bae22069838 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 29 Jun 2009 22:24:32 +0300
Subject: KVM: remove in_range from io devices

This changes bus accesses to use high-level kvm_io_bus_read/kvm_io_bus_write
functions. in_range now becomes unused so it is removed from device ops in
favor of read/write callbacks performing range checks internally.

This allows aliasing (mostly for in-kernel virtio), as well as better error
handling by making it possible to pass errors up to userspace.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c  |  28 ++++--------
 arch/x86/kvm/i8254.c      |  49 +++++++++++----------
 arch/x86/kvm/i8259.c      |  20 +++++----
 arch/x86/kvm/lapic.c      |  44 +++++++++----------
 arch/x86/kvm/x86.c        | 110 ++++++++++++++--------------------------------
 include/linux/kvm_host.h  |   6 ++-
 virt/kvm/coalesced_mmio.c |  16 +++----
 virt/kvm/ioapic.c         |  22 +++++-----
 virt/kvm/iodev.h          |  39 +++++++---------
 virt/kvm/kvm_main.c       |  26 ++++++-----
 10 files changed, 152 insertions(+), 208 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5c766bd82b05..d7aa6bb8f477 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -210,16 +210,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 
 }
 
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-					gpa_t addr, int len, int is_write)
-{
-	struct kvm_io_device *dev;
-
-	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write);
-
-	return dev;
-}
-
 static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -231,6 +221,7 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct kvm_mmio_req *p;
 	struct kvm_io_device *mmio_dev;
+	int r;
 
 	p = kvm_get_vcpu_ioreq(vcpu);
 
@@ -247,16 +238,13 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_run->exit_reason = KVM_EXIT_MMIO;
 	return 0;
 mmio:
-	mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size, !p->dir);
-	if (mmio_dev) {
-		if (!p->dir)
-			kvm_iodevice_write(mmio_dev, p->addr, p->size,
-						&p->data);
-		else
-			kvm_iodevice_read(mmio_dev, p->addr, p->size,
-						&p->data);
-
-	} else
+	if (p->dir)
+		r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr,
+				    p->size, &p->data);
+	else
+		r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr,
+				     p->size, &p->data);
+	if (r)
 		printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr);
 	p->state = STATE_IORESP_READY;
 
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4082cdd468ed..8c3ac30ef9bd 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -358,8 +358,14 @@ static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
 	return container_of(dev, struct kvm_pit, speaker_dev);
 }
 
-static void pit_ioport_write(struct kvm_io_device *this,
-			     gpa_t addr, int len, const void *data)
+static inline int pit_in_range(gpa_t addr)
+{
+	return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+		(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_io_device *this,
+			    gpa_t addr, int len, const void *data)
 {
 	struct kvm_pit *pit = dev_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
@@ -367,6 +373,8 @@ static void pit_ioport_write(struct kvm_io_device *this,
 	int channel, access;
 	struct kvm_kpit_channel_state *s;
 	u32 val = *(u32 *) data;
+	if (!pit_in_range(addr))
+		return -EOPNOTSUPP;
 
 	val  &= 0xff;
 	addr &= KVM_PIT_CHANNEL_MASK;
@@ -429,16 +437,19 @@ static void pit_ioport_write(struct kvm_io_device *this,
 	}
 
 	mutex_unlock(&pit_state->lock);
+	return 0;
 }
 
-static void pit_ioport_read(struct kvm_io_device *this,
-			    gpa_t addr, int len, void *data)
+static int pit_ioport_read(struct kvm_io_device *this,
+			   gpa_t addr, int len, void *data)
 {
 	struct kvm_pit *pit = dev_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
 	struct kvm *kvm = pit->kvm;
 	int ret, count;
 	struct kvm_kpit_channel_state *s;
+	if (!pit_in_range(addr))
+		return -EOPNOTSUPP;
 
 	addr &= KVM_PIT_CHANNEL_MASK;
 	s = &pit_state->channels[addr];
@@ -493,37 +504,36 @@ static void pit_ioport_read(struct kvm_io_device *this,
 	memcpy(data, (char *)&ret, len);
 
 	mutex_unlock(&pit_state->lock);
+	return 0;
 }
 
-static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
-			int len, int is_write)
-{
-	return ((addr >= KVM_PIT_BASE_ADDRESS) &&
-		(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
-}
-
-static void speaker_ioport_write(struct kvm_io_device *this,
-				 gpa_t addr, int len, const void *data)
+static int speaker_ioport_write(struct kvm_io_device *this,
+				gpa_t addr, int len, const void *data)
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
 	struct kvm *kvm = pit->kvm;
 	u32 val = *(u32 *) data;
+	if (addr != KVM_SPEAKER_BASE_ADDRESS)
+		return -EOPNOTSUPP;
 
 	mutex_lock(&pit_state->lock);
 	pit_state->speaker_data_on = (val >> 1) & 1;
 	pit_set_gate(kvm, 2, val & 1);
 	mutex_unlock(&pit_state->lock);
+	return 0;
 }
 
-static void speaker_ioport_read(struct kvm_io_device *this,
-				gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_io_device *this,
+			       gpa_t addr, int len, void *data)
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
 	struct kvm *kvm = pit->kvm;
 	unsigned int refresh_clock;
 	int ret;
+	if (addr != KVM_SPEAKER_BASE_ADDRESS)
+		return -EOPNOTSUPP;
 
 	/* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
 	refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
@@ -535,12 +545,7 @@ static void speaker_ioport_read(struct kvm_io_device *this,
 		len = sizeof(ret);
 	memcpy(data, (char *)&ret, len);
 	mutex_unlock(&pit_state->lock);
-}
-
-static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
-			    int len, int is_write)
-{
-	return (addr == KVM_SPEAKER_BASE_ADDRESS);
+	return 0;
 }
 
 void kvm_pit_reset(struct kvm_pit *pit)
@@ -574,13 +579,11 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
 static const struct kvm_io_device_ops pit_dev_ops = {
 	.read     = pit_ioport_read,
 	.write    = pit_ioport_write,
-	.in_range = pit_in_range,
 };
 
 static const struct kvm_io_device_ops speaker_dev_ops = {
 	.read     = speaker_ioport_read,
 	.write    = speaker_ioport_write,
-	.in_range = speaker_in_range,
 };
 
 /* Caller must have writers lock on slots_lock */
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1851aec8a7da..1d1bb75dc7bc 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -430,8 +430,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
 	return s->elcr;
 }
 
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
-			   int len, int is_write)
+static int picdev_in_range(gpa_t addr)
 {
 	switch (addr) {
 	case 0x20:
@@ -451,16 +450,18 @@ static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
 	return container_of(dev, struct kvm_pic, dev);
 }
 
-static void picdev_write(struct kvm_io_device *this,
+static int picdev_write(struct kvm_io_device *this,
 			 gpa_t addr, int len, const void *val)
 {
 	struct kvm_pic *s = to_pic(this);
 	unsigned char data = *(unsigned char *)val;
+	if (!picdev_in_range(addr))
+		return -EOPNOTSUPP;
 
 	if (len != 1) {
 		if (printk_ratelimit())
 			printk(KERN_ERR "PIC: non byte write\n");
-		return;
+		return 0;
 	}
 	pic_lock(s);
 	switch (addr) {
@@ -476,18 +477,21 @@ static void picdev_write(struct kvm_io_device *this,
 		break;
 	}
 	pic_unlock(s);
+	return 0;
 }
 
-static void picdev_read(struct kvm_io_device *this,
-			gpa_t addr, int len, void *val)
+static int picdev_read(struct kvm_io_device *this,
+		       gpa_t addr, int len, void *val)
 {
 	struct kvm_pic *s = to_pic(this);
 	unsigned char data = 0;
+	if (!picdev_in_range(addr))
+		return -EOPNOTSUPP;
 
 	if (len != 1) {
 		if (printk_ratelimit())
 			printk(KERN_ERR "PIC: non byte read\n");
-		return;
+		return 0;
 	}
 	pic_lock(s);
 	switch (addr) {
@@ -504,6 +508,7 @@ static void picdev_read(struct kvm_io_device *this,
 	}
 	*(unsigned char *)val = data;
 	pic_unlock(s);
+	return 0;
 }
 
 /*
@@ -526,7 +531,6 @@ static void pic_irq_request(void *opaque, int level)
 static const struct kvm_io_device_ops picdev_ops = {
 	.read     = picdev_read,
 	.write    = picdev_write,
-	.in_range = picdev_in_range,
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2e0286596387..265a765f038f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -546,18 +546,27 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
 	return container_of(dev, struct kvm_lapic, dev);
 }
 
-static void apic_mmio_read(struct kvm_io_device *this,
-			   gpa_t address, int len, void *data)
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+	return apic_hw_enabled(apic) &&
+	    addr >= apic->base_address &&
+	    addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_io_device *this,
+			  gpa_t address, int len, void *data)
 {
 	struct kvm_lapic *apic = to_lapic(this);
 	unsigned int offset = address - apic->base_address;
 	unsigned char alignment = offset & 0xf;
 	u32 result;
+	if (!apic_mmio_in_range(apic, address))
+		return -EOPNOTSUPP;
 
 	if ((alignment + len) > 4) {
 		printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
 		       (unsigned long)address, len);
-		return;
+		return 0;
 	}
 	result = __apic_read(apic, offset & ~0xf);
 
@@ -574,6 +583,7 @@ static void apic_mmio_read(struct kvm_io_device *this,
 		       "should be 1,2, or 4 instead\n", len);
 		break;
 	}
+	return 0;
 }
 
 static void update_divide_count(struct kvm_lapic *apic)
@@ -629,13 +639,15 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
 		apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
 }
 
-static void apic_mmio_write(struct kvm_io_device *this,
-			    gpa_t address, int len, const void *data)
+static int apic_mmio_write(struct kvm_io_device *this,
+			   gpa_t address, int len, const void *data)
 {
 	struct kvm_lapic *apic = to_lapic(this);
 	unsigned int offset = address - apic->base_address;
 	unsigned char alignment = offset & 0xf;
 	u32 val;
+	if (!apic_mmio_in_range(apic, address))
+		return -EOPNOTSUPP;
 
 	/*
 	 * APIC register must be aligned on 128-bits boundary.
@@ -646,7 +658,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		/* Don't shout loud, $infamous_os would cause only noise. */
 		apic_debug("apic write: bad size=%d %lx\n",
 			   len, (long)address);
-		return;
+		return 0;
 	}
 
 	val = *(u32 *) data;
@@ -729,7 +741,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		hrtimer_cancel(&apic->lapic_timer.timer);
 		apic_set_reg(apic, APIC_TMICT, val);
 		start_apic_timer(apic);
-		return;
+		return 0;
 
 	case APIC_TDCR:
 		if (val & 4)
@@ -743,22 +755,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 			   offset);
 		break;
 	}
-
-}
-
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
-			   int len, int size)
-{
-	struct kvm_lapic *apic = to_lapic(this);
-	int ret = 0;
-
-
-	if (apic_hw_enabled(apic) &&
-	    (addr >= apic->base_address) &&
-	    (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
-		ret = 1;
-
-	return ret;
+	return 0;
 }
 
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
@@ -938,7 +935,6 @@ static struct kvm_timer_ops lapic_timer_ops = {
 static const struct kvm_io_device_ops apic_mmio_ops = {
 	.read     = apic_mmio_read,
 	.write    = apic_mmio_write,
-	.in_range = apic_mmio_range,
 };
 
 int kvm_create_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ce6367c1976..96f0ae7d97b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2333,35 +2333,23 @@ static void kvm_init_msr_list(void)
 	num_msrs_to_save = j;
 }
 
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-						gpa_t addr, int len,
-						int is_write)
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+			   const void *v)
 {
-	struct kvm_io_device *dev;
+	if (vcpu->arch.apic &&
+	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+		return 0;
 
-	if (vcpu->arch.apic) {
-		dev = &vcpu->arch.apic->dev;
-		if (kvm_iodevice_in_range(dev, addr, len, is_write))
-			return dev;
-	}
-	return NULL;
+	return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-						gpa_t addr, int len,
-						int is_write)
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 {
-	struct kvm_io_device *dev;
+	if (vcpu->arch.apic &&
+	    !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+		return 0;
 
-	dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
-	if (dev == NULL)
-		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
-					  is_write);
-	return dev;
+	return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@ -2430,7 +2418,6 @@ static int emulator_read_emulated(unsigned long addr,
 				  unsigned int bytes,
 				  struct kvm_vcpu *vcpu)
 {
-	struct kvm_io_device *mmio_dev;
 	gpa_t                 gpa;
 
 	if (vcpu->mmio_read_completed) {
@@ -2455,13 +2442,8 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
-	mutex_unlock(&vcpu->kvm->lock);
-	if (mmio_dev) {
-		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+	if (!vcpu_mmio_read(vcpu, gpa, bytes, val))
 		return X86EMUL_CONTINUE;
-	}
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -2488,7 +2470,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   unsigned int bytes,
 					   struct kvm_vcpu *vcpu)
 {
-	struct kvm_io_device *mmio_dev;
 	gpa_t                 gpa;
 
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
@@ -2509,13 +2490,8 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	mutex_lock(&vcpu->kvm->lock);
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
-	mutex_unlock(&vcpu->kvm->lock);
-	if (mmio_dev) {
-		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
 		return X86EMUL_CONTINUE;
-	}
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -2850,48 +2826,40 @@ int complete_pio(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void kernel_pio(struct kvm_io_device *pio_dev,
-		       struct kvm_vcpu *vcpu,
-		       void *pd)
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
 	/* TODO: String I/O for in kernel device */
+	int r;
 
 	if (vcpu->arch.pio.in)
-		kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
-				  vcpu->arch.pio.size,
-				  pd);
+		r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+				    vcpu->arch.pio.size, pd);
 	else
-		kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
-				   vcpu->arch.pio.size,
-				   pd);
+		r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+				     vcpu->arch.pio.size, pd);
+	return r;
 }
 
-static void pio_string_write(struct kvm_io_device *pio_dev,
-			     struct kvm_vcpu *vcpu)
+static int pio_string_write(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pio_request *io = &vcpu->arch.pio;
 	void *pd = vcpu->arch.pio_data;
-	int i;
+	int i, r = 0;
 
 	for (i = 0; i < io->cur_count; i++) {
-		kvm_iodevice_write(pio_dev, io->port,
-				   io->size,
-				   pd);
+		if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+				     io->port, io->size, pd)) {
+			r = -EOPNOTSUPP;
+			break;
+		}
 		pd += io->size;
 	}
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-					       gpa_t addr, int len,
-					       int is_write)
-{
-	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+	return r;
 }
 
 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		  int size, unsigned port)
 {
-	struct kvm_io_device *pio_dev;
 	unsigned long val;
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2911,11 +2879,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	memcpy(vcpu->arch.pio_data, &val, 4);
 
-	mutex_lock(&vcpu->kvm->lock);
-	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
-	mutex_unlock(&vcpu->kvm->lock);
-	if (pio_dev) {
-		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 		complete_pio(vcpu);
 		return 1;
 	}
@@ -2929,7 +2893,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 {
 	unsigned now, in_page;
 	int ret = 0;
-	struct kvm_io_device *pio_dev;
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2973,12 +2936,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
 	vcpu->arch.pio.guest_gva = address;
 
-	mutex_lock(&vcpu->kvm->lock);
-	pio_dev = vcpu_find_pio_dev(vcpu, port,
-				    vcpu->arch.pio.cur_count,
-				    !vcpu->arch.pio.in);
-	mutex_unlock(&vcpu->kvm->lock);
-
 	if (!vcpu->arch.pio.in) {
 		/* string PIO write */
 		ret = pio_copy_data(vcpu);
@@ -2986,16 +2943,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 			kvm_inject_gp(vcpu, 0);
 			return 1;
 		}
-		if (ret == 0 && pio_dev) {
-			pio_string_write(pio_dev, vcpu);
+		if (ret == 0 && !pio_string_write(vcpu)) {
 			complete_pio(vcpu);
 			if (vcpu->arch.pio.count == 0)
 				ret = 1;
 		}
-	} else if (pio_dev)
-		pr_unimpl(vcpu, "no string pio read support yet, "
-		       "port %x size %d count %ld\n",
-			port, size, count);
+	}
+	/* no string PIO read support yet */
 
 	return ret;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 96c8c0b01929..077e8bb875a9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -60,8 +60,10 @@ struct kvm_io_bus {
 
 void kvm_io_bus_init(struct kvm_io_bus *bus);
 void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
-					  gpa_t addr, int len, int is_write);
+int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len,
+		     const void *val);
+int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len,
+		    void *val);
 void __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
 			       struct kvm_io_device *dev);
 void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 7b7cc9fe5ee3..0352f81ecc0b 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -19,18 +19,14 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
 	return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
 }
 
-static int coalesced_mmio_in_range(struct kvm_io_device *this,
-				   gpa_t addr, int len, int is_write)
+static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
+				   gpa_t addr, int len)
 {
-	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 	struct kvm_coalesced_mmio_zone *zone;
 	struct kvm_coalesced_mmio_ring *ring;
 	unsigned avail;
 	int i;
 
-	if (!is_write)
-		return 0;
-
 	/* Are we able to batch it ? */
 
 	/* last is the first free entry
@@ -60,11 +56,13 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this,
 	return 0;
 }
 
-static void coalesced_mmio_write(struct kvm_io_device *this,
-				 gpa_t addr, int len, const void *val)
+static int coalesced_mmio_write(struct kvm_io_device *this,
+				gpa_t addr, int len, const void *val)
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+	if (!coalesced_mmio_in_range(dev, addr, len))
+		return -EOPNOTSUPP;
 
 	spin_lock(&dev->lock);
 
@@ -76,6 +74,7 @@ static void coalesced_mmio_write(struct kvm_io_device *this,
 	smp_wmb();
 	ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
 	spin_unlock(&dev->lock);
+	return 0;
 }
 
 static void coalesced_mmio_destructor(struct kvm_io_device *this)
@@ -87,7 +86,6 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
 
 static const struct kvm_io_device_ops coalesced_mmio_ops = {
 	.write      = coalesced_mmio_write,
-	.in_range   = coalesced_mmio_in_range,
 	.destructor = coalesced_mmio_destructor,
 };
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 0eca54e06326..ddf6aa998b18 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -227,20 +227,19 @@ static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
 	return container_of(dev, struct kvm_ioapic, dev);
 }
 
-static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr,
-			   int len, int is_write)
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
 {
-	struct kvm_ioapic *ioapic = to_ioapic(this);
-
 	return ((addr >= ioapic->base_address &&
 		 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
 }
 
-static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
-			     void *val)
+static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+			    void *val)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
 	u32 result;
+	if (!ioapic_in_range(ioapic, addr))
+		return -EOPNOTSUPP;
 
 	ioapic_debug("addr %lx\n", (unsigned long)addr);
 	ASSERT(!(addr & 0xf));	/* check alignment */
@@ -273,13 +272,16 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 		printk(KERN_WARNING "ioapic: wrong length %d\n", len);
 	}
 	mutex_unlock(&ioapic->kvm->irq_lock);
+	return 0;
 }
 
-static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-			      const void *val)
+static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+			     const void *val)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
 	u32 data;
+	if (!ioapic_in_range(ioapic, addr))
+		return -EOPNOTSUPP;
 
 	ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
 		     (void*)addr, len, val);
@@ -290,7 +292,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 		data = *(u32 *) val;
 	else {
 		printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
-		return;
+		return 0;
 	}
 
 	addr &= 0xff;
@@ -312,6 +314,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 		break;
 	}
 	mutex_unlock(&ioapic->kvm->irq_lock);
+	return 0;
 }
 
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
@@ -329,7 +332,6 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 static const struct kvm_io_device_ops ioapic_mmio_ops = {
 	.read     = ioapic_mmio_read,
 	.write    = ioapic_mmio_write,
-	.in_range = ioapic_in_range,
 };
 
 int kvm_ioapic_init(struct kvm *kvm)
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 06e38b23fa61..12fd3caffd2b 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,23 +17,24 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <asm/errno.h>
 
 struct kvm_io_device;
 
 /**
  * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
  **/
 struct kvm_io_device_ops {
-	void (*read)(struct kvm_io_device *this,
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
 		     gpa_t addr,
 		     int len,
-		     void *val);
-	void (*write)(struct kvm_io_device *this,
-		      gpa_t addr,
-		      int len,
-		      const void *val);
-	int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len,
-			int is_write);
+		     const void *val);
 	void (*destructor)(struct kvm_io_device *this);
 };
 
@@ -48,26 +49,16 @@ static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 	dev->ops = ops;
 }
 
-static inline void kvm_iodevice_read(struct kvm_io_device *dev,
-				     gpa_t addr,
-				     int len,
-				     void *val)
+static inline int kvm_iodevice_read(struct kvm_io_device *dev,
+				    gpa_t addr, int l, void *v)
 {
-	dev->ops->read(dev, addr, len, val);
+	return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
 }
 
-static inline void kvm_iodevice_write(struct kvm_io_device *dev,
-				      gpa_t addr,
-				      int len,
-				      const void *val)
+static inline int kvm_iodevice_write(struct kvm_io_device *dev,
+				     gpa_t addr, int l, const void *v)
 {
-	dev->ops->write(dev, addr, len, val);
-}
-
-static inline int kvm_iodevice_in_range(struct kvm_io_device *dev,
-					gpa_t addr, int len, int is_write)
-{
-	return dev->ops->in_range(dev, addr, len, is_write);
+	return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
 }
 
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0edc366ecf89..594606526620 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2512,19 +2512,25 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 	}
 }
 
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
-					  gpa_t addr, int len, int is_write)
+/* kvm_io_bus_write - called under kvm->slots_lock */
+int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
+		     int len, const void *val)
 {
 	int i;
+	for (i = 0; i < bus->dev_count; i++)
+		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+			return 0;
+	return -EOPNOTSUPP;
+}
 
-	for (i = 0; i < bus->dev_count; i++) {
-		struct kvm_io_device *pos = bus->devs[i];
-
-		if (kvm_iodevice_in_range(pos, addr, len, is_write))
-			return pos;
-	}
-
-	return NULL;
+/* kvm_io_bus_read - called under kvm->slots_lock */
+int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
+{
+	int i;
+	for (i = 0; i < bus->dev_count; i++)
+		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+			return 0;
+	return -EOPNOTSUPP;
 }
 
 void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
-- 
cgit v1.2.3-71-gd317


From e9f4275732add046fed4a548b8dbb98dbe500d2f Mon Sep 17 00:00:00 2001
From: Beth Kon <eak@us.ibm.com>
Date: Tue, 7 Jul 2009 11:50:38 -0400
Subject: KVM: PIT support for HPET legacy mode

When kvm is in hpet_legacy_mode, the hpet is providing the timer
interrupt and the pit should not be. So in legacy mode, the pit timer
is destroyed, but the *state* of the pit is maintained. So if kvm or
the guest tries to modify the state of the pit, this modification is
accepted, *except* that the timer isn't actually started. When we exit
hpet_legacy_mode, the current state of the pit (which is up to date
since we've been accepting modifications) is used to restart the pit
timer.

The saved_mode code in kvm_pit_load_count temporarily changes mode to
0xff in order to destroy the timer, but then restores the actual
value, again maintaining "current" state of the pit for possible later
reenablement.

[avi: add some reserved storage in the ioctl; make SET_PIT2 IOW]
[marcelo: fix memory corruption due to reserved storage]

Signed-off-by: Beth Kon <eak@us.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm.h |  9 +++++++
 arch/x86/kvm/i8254.c       | 22 +++++++++++++----
 arch/x86/kvm/i8254.h       |  3 ++-
 arch/x86/kvm/x86.c         | 59 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/kvm.h        |  6 +++++
 5 files changed, 93 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 708b9c32a5da..4a5fe914dc59 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -18,6 +18,7 @@
 #define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_MSIX
 #define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -237,6 +238,14 @@ struct kvm_pit_state {
 	struct kvm_pit_channel_state channels[3];
 };
 
+#define KVM_PIT_FLAGS_HPET_LEGACY  0x00000001
+
+struct kvm_pit_state2 {
+	struct kvm_pit_channel_state channels[3];
+	__u32 flags;
+	__u32 reserved[9];
+};
+
 struct kvm_reinject_control {
 	__u8 pit_reinject;
 	__u8 reserved[31];
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 8c3ac30ef9bd..9b62c57ba6e4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -332,20 +332,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		create_pit_timer(ps, val, 0);
+		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+			create_pit_timer(ps, val, 0);
+		}
 		break;
 	case 2:
 	case 3:
-		create_pit_timer(ps, val, 1);
+		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
+			create_pit_timer(ps, val, 1);
+		}
 		break;
 	default:
 		destroy_pit_timer(&ps->pit_timer);
 	}
 }
 
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
 {
-	pit_load_count(kvm, channel, val);
+	u8 saved_mode;
+	if (hpet_legacy_start) {
+		/* save existing mode for later reenablement */
+		saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
+		kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
+		pit_load_count(kvm, channel, val);
+		kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+	} else {
+		pit_load_count(kvm, channel, val);
+	}
 }
 
 static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
@@ -554,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	struct kvm_kpit_channel_state *c;
 
 	mutex_lock(&pit->pit_state.lock);
+	pit->pit_state.flags = 0;
 	for (i = 0; i < 3; i++) {
 		c = &pit->pit_state.channels[i];
 		c->mode = 0xff;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index b2670180f225..d4c1c7ffdc09 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -21,6 +21,7 @@ struct kvm_kpit_channel_state {
 
 struct kvm_kpit_state {
 	struct kvm_kpit_channel_state channels[3];
+	u32 flags;
 	struct kvm_timer pit_timer;
 	bool is_periodic;
 	u32    speaker_data_on;
@@ -49,7 +50,7 @@ struct kvm_pit {
 #define KVM_PIT_CHANNEL_MASK	    0x3
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
 void kvm_free_pit(struct kvm *kvm);
 void kvm_pit_reset(struct kvm_pit *pit);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dfb0e37b3c65..2214384ff610 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1213,6 +1213,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_IRQFD:
 	case KVM_CAP_PIT2:
+	case KVM_CAP_PIT_STATE2:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2078,7 +2079,36 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
-	kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+	kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+	int r = 0;
+
+	mutex_lock(&kvm->arch.vpit->pit_state.lock);
+	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+		sizeof(ps->channels));
+	ps->flags = kvm->arch.vpit->pit_state.flags;
+	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+	int r = 0, start = 0;
+	u32 prev_legacy, cur_legacy;
+	mutex_lock(&kvm->arch.vpit->pit_state.lock);
+	prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+	if (!prev_legacy && cur_legacy)
+		start = 1;
+	memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
+	       sizeof(kvm->arch.vpit->pit_state.channels));
+	kvm->arch.vpit->pit_state.flags = ps->flags;
+	kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 	return r;
 }
@@ -2140,6 +2170,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	 */
 	union {
 		struct kvm_pit_state ps;
+		struct kvm_pit_state2 ps2;
 		struct kvm_memory_alias alias;
 		struct kvm_pit_config pit_config;
 	} u;
@@ -2322,6 +2353,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_GET_PIT2: {
+		r = -ENXIO;
+		if (!kvm->arch.vpit)
+			goto out;
+		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_PIT2: {
+		r = -EFAULT;
+		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+			goto out;
+		r = -ENXIO;
+		if (!kvm->arch.vpit)
+			goto out;
+		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	case KVM_REINJECT_CONTROL: {
 		struct kvm_reinject_control control;
 		r =  -EFAULT;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 76c640834ea6..a74a1fcc28e9 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -409,6 +409,9 @@ struct kvm_guest_debug {
 #define KVM_CAP_PIT2 33
 #endif
 #define KVM_CAP_SET_BOOT_CPU_ID 34
+#ifdef __KVM_HAVE_PIT_STATE2
+#define KVM_CAP_PIT_STATE2 35
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -586,6 +589,9 @@ struct kvm_debug_guest {
 #define KVM_IA64_VCPU_GET_STACK   _IOR(KVMIO,  0x9a, void *)
 #define KVM_IA64_VCPU_SET_STACK   _IOW(KVMIO,  0x9b, void *)
 
+#define KVM_GET_PIT2   _IOR(KVMIO,   0x9f, struct kvm_pit_state2)
+#define KVM_SET_PIT2   _IOW(KVMIO,   0xa0, struct kvm_pit_state2)
+
 #define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
 #define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
 #define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
-- 
cgit v1.2.3-71-gd317


From 090b7aff27120cdae76a346a70db394844fea598 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Tue, 7 Jul 2009 17:08:44 -0400
Subject: KVM: make io_bus interface more robust

Today kvm_io_bus_regsiter_dev() returns void and will internally BUG_ON
if it fails.  We want to create dynamic MMIO/PIO entries driven from
userspace later in the series, so we need to enhance the code to be more
robust with the following changes:

   1) Add a return value to the registration function
   2) Fix up all the callsites to check the return code, handle any
      failures, and percolate the error up to the caller.
   3) Add an unregister function that collapses holes in the array

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c      | 20 ++++++++++++++++++--
 arch/x86/kvm/i8259.c      |  9 ++++++++-
 include/linux/kvm_host.h  | 10 +++++++---
 virt/kvm/coalesced_mmio.c |  8 ++++++--
 virt/kvm/ioapic.c         |  8 ++++++--
 virt/kvm/kvm_main.c       | 39 ++++++++++++++++++++++++++++++++++-----
 6 files changed, 79 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 9b62c57ba6e4..137e54817102 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -605,6 +605,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
 	struct kvm_pit *pit;
 	struct kvm_kpit_state *pit_state;
+	int ret;
 
 	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
 	if (!pit)
@@ -639,14 +640,29 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 
 	kvm_iodevice_init(&pit->dev, &pit_dev_ops);
-	__kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+	ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+	if (ret < 0)
+		goto fail;
 
 	if (flags & KVM_PIT_SPEAKER_DUMMY) {
 		kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
-		__kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+		ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
+						&pit->speaker_dev);
+		if (ret < 0)
+			goto fail_unregister;
 	}
 
 	return pit;
+
+fail_unregister:
+	__kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
+
+fail:
+	if (pit->irq_source_id >= 0)
+		kvm_free_irq_source_id(kvm, pit->irq_source_id);
+
+	kfree(pit);
+	return NULL;
 }
 
 void kvm_free_pit(struct kvm *kvm)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e4bcbddecb36..daf4606b0293 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -539,6 +539,8 @@ static const struct kvm_io_device_ops picdev_ops = {
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 {
 	struct kvm_pic *s;
+	int ret;
+
 	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
 	if (!s)
 		return NULL;
@@ -555,6 +557,11 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	 * Initialize PIO device
 	 */
 	kvm_iodevice_init(&s->dev, &picdev_ops);
-	kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+	ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+	if (ret < 0) {
+		kfree(s);
+		return NULL;
+	}
+
 	return s;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 077e8bb875a9..983b0bdeb3ff 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -64,10 +64,14 @@ int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len,
 		     const void *val);
 int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len,
 		    void *val);
-void __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+			       struct kvm_io_device *dev);
+int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
+			    struct kvm_io_device *dev);
+void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
+				 struct kvm_io_device *dev);
+void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus,
 			       struct kvm_io_device *dev);
-void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
-			     struct kvm_io_device *dev);
 
 struct kvm_vcpu {
 	struct kvm *kvm;
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 0352f81ecc0b..04d69cd7049b 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -92,6 +92,7 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
 int kvm_coalesced_mmio_init(struct kvm *kvm)
 {
 	struct kvm_coalesced_mmio_dev *dev;
+	int ret;
 
 	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
 	if (!dev)
@@ -100,9 +101,12 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
 	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
 	dev->kvm = kvm;
 	kvm->coalesced_mmio_dev = dev;
-	kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev);
 
-	return 0;
+	ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev);
+	if (ret < 0)
+		kfree(dev);
+
+	return ret;
 }
 
 int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index b91fbb215447..fa05f67423ab 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -342,6 +342,7 @@ static const struct kvm_io_device_ops ioapic_mmio_ops = {
 int kvm_ioapic_init(struct kvm *kvm)
 {
 	struct kvm_ioapic *ioapic;
+	int ret;
 
 	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
 	if (!ioapic)
@@ -350,7 +351,10 @@ int kvm_ioapic_init(struct kvm *kvm)
 	kvm_ioapic_reset(ioapic);
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
 	ioapic->kvm = kvm;
-	kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev);
-	return 0;
+	ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev);
+	if (ret < 0)
+		kfree(ioapic);
+
+	return ret;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fc1b58a72757..9c2fd025b8ae 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2533,21 +2533,50 @@ int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
 	return -EOPNOTSUPP;
 }
 
-void kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
+int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
 			     struct kvm_io_device *dev)
 {
+	int ret;
+
 	down_write(&kvm->slots_lock);
-	__kvm_io_bus_register_dev(bus, dev);
+	ret = __kvm_io_bus_register_dev(bus, dev);
 	up_write(&kvm->slots_lock);
+
+	return ret;
 }
 
 /* An unlocked version. Caller must have write lock on slots_lock. */
-void __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
-			     struct kvm_io_device *dev)
+int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+			      struct kvm_io_device *dev)
 {
-	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
+	if (bus->dev_count > NR_IOBUS_DEVS-1)
+		return -ENOSPC;
 
 	bus->devs[bus->dev_count++] = dev;
+
+	return 0;
+}
+
+void kvm_io_bus_unregister_dev(struct kvm *kvm,
+			       struct kvm_io_bus *bus,
+			       struct kvm_io_device *dev)
+{
+	down_write(&kvm->slots_lock);
+	__kvm_io_bus_unregister_dev(bus, dev);
+	up_write(&kvm->slots_lock);
+}
+
+/* An unlocked version. Caller must have write lock on slots_lock. */
+void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
+				 struct kvm_io_device *dev)
+{
+	int i;
+
+	for (i = 0; i < bus->dev_count; i++)
+		if (bus->devs[i] == dev) {
+			bus->devs[i] = bus->devs[--bus->dev_count];
+			break;
+		}
 }
 
 static struct notifier_block kvm_cpu_notifier = {
-- 
cgit v1.2.3-71-gd317


From d34e6b175e61821026893ec5298cc8e7558df43a Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Tue, 7 Jul 2009 17:08:49 -0400
Subject: KVM: add ioeventfd support

ioeventfd is a mechanism to register PIO/MMIO regions to trigger an eventfd
signal when written to by a guest.  Host userspace can register any
arbitrary IO address with a corresponding eventfd and then pass the eventfd
to a specific end-point of interest for handling.

Normal IO requires a blocking round-trip since the operation may cause
side-effects in the emulated model or may return data to the caller.
Therefore, an IO in KVM traps from the guest to the host, causes a VMX/SVM
"heavy-weight" exit back to userspace, and is ultimately serviced by qemu's
device model synchronously before returning control back to the vcpu.

However, there is a subclass of IO which acts purely as a trigger for
other IO (such as to kick off an out-of-band DMA request, etc).  For these
patterns, the synchronous call is particularly expensive since we really
only want to simply get our notification transmitted asychronously and
return as quickly as possible.  All the sychronous infrastructure to ensure
proper data-dependencies are met in the normal IO case are just unecessary
overhead for signalling.  This adds additional computational load on the
system, as well as latency to the signalling path.

Therefore, we provide a mechanism for registration of an in-kernel trigger
point that allows the VCPU to only require a very brief, lightweight
exit just long enough to signal an eventfd.  This also means that any
clients compatible with the eventfd interface (which includes userspace
and kernelspace equally well) can now register to be notified. The end
result should be a more flexible and higher performance notification API
for the backend KVM hypervisor and perhipheral components.

To test this theory, we built a test-harness called "doorbell".  This
module has a function called "doorbell_ring()" which simply increments a
counter for each time the doorbell is signaled.  It supports signalling
from either an eventfd, or an ioctl().

We then wired up two paths to the doorbell: One via QEMU via a registered
io region and through the doorbell ioctl().  The other is direct via
ioeventfd.

You can download this test harness here:

ftp://ftp.novell.com/dev/ghaskins/doorbell.tar.bz2

The measured results are as follows:

qemu-mmio:       110000 iops, 9.09us rtt
ioeventfd-mmio: 200100 iops, 5.00us rtt
ioeventfd-pio:  367300 iops, 2.72us rtt

I didn't measure qemu-pio, because I have to figure out how to register a
PIO region with qemu's device model, and I got lazy.  However, for now we
can extrapolate based on the data from the NULLIO runs of +2.56us for MMIO,
and -350ns for HC, we get:

qemu-pio:      153139 iops, 6.53us rtt
ioeventfd-hc: 412585 iops, 2.37us rtt

these are just for fun, for now, until I can gather more data.

Here is a graph for your convenience:

http://developer.novell.com/wiki/images/7/76/Iofd-chart.png

The conclusion to draw is that we save about 4us by skipping the userspace
hop.

--------------------

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c       |   1 +
 include/linux/kvm.h      |  24 +++++
 include/linux/kvm_host.h |  10 +-
 virt/kvm/eventfd.c       | 251 ++++++++++++++++++++++++++++++++++++++++++++++-
 virt/kvm/kvm_main.c      |  11 ++-
 5 files changed, 293 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2214384ff610..42160b031fcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1212,6 +1212,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_IRQFD:
+	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_PIT2:
 	case KVM_CAP_PIT_STATE2:
 		r = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index a74a1fcc28e9..230a91aa61c9 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -307,6 +307,28 @@ struct kvm_guest_debug {
 	struct kvm_guest_debug_arch arch;
 };
 
+enum {
+	kvm_ioeventfd_flag_nr_datamatch,
+	kvm_ioeventfd_flag_nr_pio,
+	kvm_ioeventfd_flag_nr_deassign,
+	kvm_ioeventfd_flag_nr_max,
+};
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+
+#define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
+
+struct kvm_ioeventfd {
+	__u64 datamatch;
+	__u64 addr;        /* legal pio/mmio address */
+	__u32 len;         /* 1, 2, 4, or 8 bytes    */
+	__s32 fd;
+	__u32 flags;
+	__u8  pad[36];
+};
+
 #define KVM_TRC_SHIFT           16
 /*
  * kvm trace categories
@@ -412,6 +434,7 @@ struct kvm_guest_debug {
 #ifdef __KVM_HAVE_PIT_STATE2
 #define KVM_CAP_PIT_STATE2 35
 #endif
+#define KVM_CAP_IOEVENTFD 36
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -520,6 +543,7 @@ struct kvm_irqfd {
 #define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
 #define KVM_CREATE_PIT2		   _IOW(KVMIO, 0x77, struct kvm_pit_config)
 #define KVM_SET_BOOT_CPU_ID        _IO(KVMIO, 0x78)
+#define KVM_IOEVENTFD             _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 983b0bdeb3ff..6ec9fc56a49e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -155,6 +155,7 @@ struct kvm {
 		spinlock_t        lock;
 		struct list_head  items;
 	} irqfds;
+	struct list_head ioeventfds;
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
@@ -528,19 +529,24 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 #ifdef CONFIG_HAVE_KVM_EVENTFD
 
-void kvm_irqfd_init(struct kvm *kvm);
+void kvm_eventfd_init(struct kvm *kvm);
 int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
 void kvm_irqfd_release(struct kvm *kvm);
+int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
 #else
 
-static inline void kvm_irqfd_init(struct kvm *kvm) {}
+static inline void kvm_eventfd_init(struct kvm *kvm) {}
 static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 {
 	return -EINVAL;
 }
 
 static inline void kvm_irqfd_release(struct kvm *kvm) {}
+static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+	return -ENOSYS;
+}
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 4092b8dcd510..99017e8a92ac 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/kvm.h>
 #include <linux/workqueue.h>
 #include <linux/syscalls.h>
 #include <linux/wait.h>
@@ -28,6 +29,9 @@
 #include <linux/file.h>
 #include <linux/list.h>
 #include <linux/eventfd.h>
+#include <linux/kernel.h>
+
+#include "iodev.h"
 
 /*
  * --------------------------------------------------------------------
@@ -234,10 +238,11 @@ fail:
 }
 
 void
-kvm_irqfd_init(struct kvm *kvm)
+kvm_eventfd_init(struct kvm *kvm)
 {
 	spin_lock_init(&kvm->irqfds.lock);
 	INIT_LIST_HEAD(&kvm->irqfds.items);
+	INIT_LIST_HEAD(&kvm->ioeventfds);
 }
 
 /*
@@ -327,3 +332,247 @@ static void __exit irqfd_module_exit(void)
 
 module_init(irqfd_module_init);
 module_exit(irqfd_module_exit);
+
+/*
+ * --------------------------------------------------------------------
+ * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
+ *
+ * userspace can register a PIO/MMIO address with an eventfd for receiving
+ * notification when the memory has been touched.
+ * --------------------------------------------------------------------
+ */
+
+struct _ioeventfd {
+	struct list_head     list;
+	u64                  addr;
+	int                  length;
+	struct eventfd_ctx  *eventfd;
+	u64                  datamatch;
+	struct kvm_io_device dev;
+	bool                 wildcard;
+};
+
+static inline struct _ioeventfd *
+to_ioeventfd(struct kvm_io_device *dev)
+{
+	return container_of(dev, struct _ioeventfd, dev);
+}
+
+static void
+ioeventfd_release(struct _ioeventfd *p)
+{
+	eventfd_ctx_put(p->eventfd);
+	list_del(&p->list);
+	kfree(p);
+}
+
+static bool
+ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
+{
+	u64 _val;
+
+	if (!(addr == p->addr && len == p->length))
+		/* address-range must be precise for a hit */
+		return false;
+
+	if (p->wildcard)
+		/* all else equal, wildcard is always a hit */
+		return true;
+
+	/* otherwise, we have to actually compare the data */
+
+	BUG_ON(!IS_ALIGNED((unsigned long)val, len));
+
+	switch (len) {
+	case 1:
+		_val = *(u8 *)val;
+		break;
+	case 2:
+		_val = *(u16 *)val;
+		break;
+	case 4:
+		_val = *(u32 *)val;
+		break;
+	case 8:
+		_val = *(u64 *)val;
+		break;
+	default:
+		return false;
+	}
+
+	return _val == p->datamatch ? true : false;
+}
+
+/* MMIO/PIO writes trigger an event if the addr/val match */
+static int
+ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
+		const void *val)
+{
+	struct _ioeventfd *p = to_ioeventfd(this);
+
+	if (!ioeventfd_in_range(p, addr, len, val))
+		return -EOPNOTSUPP;
+
+	eventfd_signal(p->eventfd, 1);
+	return 0;
+}
+
+/*
+ * This function is called as KVM is completely shutting down.  We do not
+ * need to worry about locking just nuke anything we have as quickly as possible
+ */
+static void
+ioeventfd_destructor(struct kvm_io_device *this)
+{
+	struct _ioeventfd *p = to_ioeventfd(this);
+
+	ioeventfd_release(p);
+}
+
+static const struct kvm_io_device_ops ioeventfd_ops = {
+	.write      = ioeventfd_write,
+	.destructor = ioeventfd_destructor,
+};
+
+/* assumes kvm->slots_lock held */
+static bool
+ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
+{
+	struct _ioeventfd *_p;
+
+	list_for_each_entry(_p, &kvm->ioeventfds, list)
+		if (_p->addr == p->addr && _p->length == p->length &&
+		    (_p->wildcard || p->wildcard ||
+		     _p->datamatch == p->datamatch))
+			return true;
+
+	return false;
+}
+
+static int
+kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
+	struct kvm_io_bus        *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
+	struct _ioeventfd        *p;
+	struct eventfd_ctx       *eventfd;
+	int                       ret;
+
+	/* must be natural-word sized */
+	switch (args->len) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* check for range overflow */
+	if (args->addr + args->len < args->addr)
+		return -EINVAL;
+
+	/* check for extra flags that we don't understand */
+	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
+		return -EINVAL;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&p->list);
+	p->addr    = args->addr;
+	p->length  = args->len;
+	p->eventfd = eventfd;
+
+	/* The datamatch feature is optional, otherwise this is a wildcard */
+	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
+		p->datamatch = args->datamatch;
+	else
+		p->wildcard = true;
+
+	down_write(&kvm->slots_lock);
+
+	/* Verify that there isnt a match already */
+	if (ioeventfd_check_collision(kvm, p)) {
+		ret = -EEXIST;
+		goto unlock_fail;
+	}
+
+	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
+
+	ret = __kvm_io_bus_register_dev(bus, &p->dev);
+	if (ret < 0)
+		goto unlock_fail;
+
+	list_add_tail(&p->list, &kvm->ioeventfds);
+
+	up_write(&kvm->slots_lock);
+
+	return 0;
+
+unlock_fail:
+	up_write(&kvm->slots_lock);
+
+fail:
+	kfree(p);
+	eventfd_ctx_put(eventfd);
+
+	return ret;
+}
+
+static int
+kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
+	struct kvm_io_bus        *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
+	struct _ioeventfd        *p, *tmp;
+	struct eventfd_ctx       *eventfd;
+	int                       ret = -ENOENT;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	down_write(&kvm->slots_lock);
+
+	list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
+		bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+
+		if (p->eventfd != eventfd  ||
+		    p->addr != args->addr  ||
+		    p->length != args->len ||
+		    p->wildcard != wildcard)
+			continue;
+
+		if (!p->wildcard && p->datamatch != args->datamatch)
+			continue;
+
+		__kvm_io_bus_unregister_dev(bus, &p->dev);
+		ioeventfd_release(p);
+		ret = 0;
+		break;
+	}
+
+	up_write(&kvm->slots_lock);
+
+	eventfd_ctx_put(eventfd);
+
+	return ret;
+}
+
+int
+kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
+		return kvm_deassign_ioeventfd(kvm, args);
+
+	return kvm_assign_ioeventfd(kvm, args);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9c2fd025b8ae..d7b9bbba26da 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -979,7 +979,7 @@ static struct kvm *kvm_create_vm(void)
 	spin_lock_init(&kvm->mmu_lock);
 	spin_lock_init(&kvm->requests_lock);
 	kvm_io_bus_init(&kvm->pio_bus);
-	kvm_irqfd_init(kvm);
+	kvm_eventfd_init(kvm);
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
@@ -2271,6 +2271,15 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
 		break;
 	}
+	case KVM_IOEVENTFD: {
+		struct kvm_ioeventfd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_ioeventfd(kvm, &data);
+		break;
+	}
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
 	case KVM_SET_BOOT_CPU_ID:
 		r = 0;
-- 
cgit v1.2.3-71-gd317


From 0b71785dc05f1f66e6268022b9953c0d6a9985c6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 9 Jul 2009 15:33:53 +0300
Subject: KVM: Move kvm_cpu_get_interrupt() declaration to x86 code

It is implemented only by x86.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 include/linux/kvm_host.h        | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 30b625d8e5f0..3f4f00a23536 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -797,5 +797,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6ec9fc56a49e..2c6a5f008746 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -341,7 +341,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
 
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3-71-gd317


From a1b37100d9e29c1f8dc3e2f5490a205c80180e01 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 9 Jul 2009 15:33:52 +0300
Subject: KVM: Reduce runnability interface with arch support code

Remove kvm_cpu_has_interrupt() and kvm_arch_interrupt_allowed() from
interface between general code and arch code. kvm_arch_vcpu_runnable()
checks for interrupts instead.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c        | 16 ++--------------
 arch/powerpc/kvm/powerpc.c      | 13 +------------
 arch/s390/kvm/interrupt.c       |  8 +-------
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c              |  6 ++++--
 include/linux/kvm_host.h        |  2 --
 virt/kvm/kvm_main.c             |  4 +---
 7 files changed, 11 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d7aa6bb8f477..0ad09f05efa9 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1935,19 +1935,6 @@ int kvm_highest_pending_irq(struct kvm_vcpu *vcpu)
     return find_highest_bits((int *)&vpd->irr[0]);
 }
 
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
-{
-	if (kvm_highest_pending_irq(vcpu) != -1)
-		return 1;
-	return 0;
-}
-
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-	/* do real check here */
-	return 1;
-}
-
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.timer_fired;
@@ -1960,7 +1947,8 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE;
+	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
+		(kvm_highest_pending_irq(vcpu) != -1);
 }
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 0341391eff12..2a4551f78f60 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -39,20 +39,9 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 	return gfn;
 }
 
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
-{
-	return !!(v->arch.pending_exceptions);
-}
-
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-	/* do real check here */
-	return 1;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.msr & MSR_WE);
+	return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
 }
 
 
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 4d613415c435..2c2f98353415 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -283,7 +283,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
 	struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
@@ -320,12 +320,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	return rc;
 }
 
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-	/* do real check here */
-	return 1;
-}
-
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	return 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3f4f00a23536..08732d7b6d98 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -797,6 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 11cfd897aac6..b87d65d89a05 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4890,8 +4890,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-	       || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-	       || vcpu->arch.nmi_pending;
+		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+		|| vcpu->arch.nmi_pending ||
+		(kvm_arch_interrupt_allowed(vcpu) &&
+		 kvm_cpu_has_interrupt(vcpu));
 }
 
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c6a5f008746..4af56036a6bf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -332,7 +332,6 @@ int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
 void kvm_arch_check_processor_compat(void *rtn);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
 
@@ -341,7 +340,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
 
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d7b9bbba26da..532af9b41ee3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1666,9 +1666,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	for (;;) {
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
-		if ((kvm_arch_interrupt_allowed(vcpu) &&
-					kvm_cpu_has_interrupt(vcpu)) ||
-				kvm_arch_vcpu_runnable(vcpu)) {
+		if (kvm_arch_vcpu_runnable(vcpu)) {
 			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			break;
 		}
-- 
cgit v1.2.3-71-gd317


From b927a3cec081a605142f5b7e90b730611bee28b1 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Tue, 21 Jul 2009 10:42:48 +0800
Subject: KVM: VMX: Introduce KVM_SET_IDENTITY_MAP_ADDR ioctl

Now KVM allow guest to modify guest's physical address of EPT's identity mapping page.

(change from v1, discard unnecessary check, change ioctl to accept parameter
address rather than value)

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx.c              | 15 ++++++++++-----
 arch/x86/kvm/x86.c              | 19 +++++++++++++++++++
 include/linux/kvm.h             |  2 ++
 4 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 08732d7b6d98..e210b218df44 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -411,6 +411,7 @@ struct kvm_arch{
 
 	struct page *ept_identity_pagetable;
 	bool ept_identity_pagetable_done;
+	gpa_t ept_identity_map_addr;
 
 	unsigned long irq_sources_bitmap;
 	unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6256b98f078..686e1abb6816 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1719,7 +1719,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		eptp = construct_eptp(cr3);
 		vmcs_write64(EPT_POINTER, eptp);
 		guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
-			VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+			vcpu->kvm->arch.ept_identity_map_addr;
 	}
 
 	vmx_flush_tlb(vcpu);
@@ -2122,7 +2122,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
 	if (likely(kvm->arch.ept_identity_pagetable_done))
 		return 1;
 	ret = 0;
-	identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
+	identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
 	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
 	if (r < 0)
 		goto out;
@@ -2191,14 +2191,15 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 		goto out;
 	kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
 	kvm_userspace_mem.flags = 0;
-	kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+	kvm_userspace_mem.guest_phys_addr =
+		kvm->arch.ept_identity_map_addr;
 	kvm_userspace_mem.memory_size = PAGE_SIZE;
 	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
 	if (r)
 		goto out;
 
 	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
-			VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
+			kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
 out:
 	up_write(&kvm->slots_lock);
 	return r;
@@ -3814,9 +3815,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		if (alloc_apic_access_page(kvm) != 0)
 			goto free_vmcs;
 
-	if (enable_ept)
+	if (enable_ept) {
+		if (!kvm->arch.ept_identity_map_addr)
+			kvm->arch.ept_identity_map_addr =
+				VMX_EPT_IDENTITY_PAGETABLE_ADDR;
 		if (alloc_identity_pagetable(kvm) != 0)
 			goto free_vmcs;
+	}
 
 	return &vmx->vcpu;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c7ec0c921c01..f4cb1baaa04b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1206,6 +1206,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_PIT2:
 	case KVM_CAP_PIT_STATE2:
+	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -1906,6 +1907,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 	return ret;
 }
 
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+					      u64 ident_addr)
+{
+	kvm->arch.ept_identity_map_addr = ident_addr;
+	return 0;
+}
+
 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 					  u32 kvm_nr_mmu_pages)
 {
@@ -2173,6 +2181,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (r < 0)
 			goto out;
 		break;
+	case KVM_SET_IDENTITY_MAP_ADDR: {
+		u64 ident_addr;
+
+		r = -EFAULT;
+		if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+			goto out;
+		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+		if (r < 0)
+			goto out;
+		break;
+	}
 	case KVM_SET_MEMORY_REGION: {
 		struct kvm_memory_region kvm_mem;
 		struct kvm_userspace_memory_region kvm_userspace_mem;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 230a91aa61c9..f8f8900fc5ec 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -435,6 +435,7 @@ struct kvm_ioeventfd {
 #define KVM_CAP_PIT_STATE2 35
 #endif
 #define KVM_CAP_IOEVENTFD 36
+#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -512,6 +513,7 @@ struct kvm_irqfd {
 #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
+#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
 /* Device model IOC */
 #define KVM_CREATE_IRQCHIP	  _IO(KVMIO,  0x60)
 #define KVM_IRQ_LINE		  _IOW(KVMIO, 0x61, struct kvm_irq_level)
-- 
cgit v1.2.3-71-gd317


From 07708c4af1346ab1521b26a202f438366b7bcffd Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 3 Aug 2009 18:43:28 +0200
Subject: KVM: x86: Disallow hypercalls for guest callers in rings > 0

So far unprivileged guest callers running in ring 3 can issue, e.g., MMU
hypercalls. Normally, such callers cannot provide any hand-crafted MMU
command structure as it has to be passed by its physical address, but
they can still crash the guest kernel by passing random addresses.

To close the hole, this patch considers hypercalls valid only if issued
from guest ring 0. This may still be relaxed on a per-hypercall base in
the future once required.

Cc: stable@kernel.org
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c       | 6 ++++++
 include/linux/kvm_para.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa525d511d92..92b5eddaa877 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3213,6 +3213,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		a3 &= 0xFFFFFFFF;
 	}
 
+	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+		ret = -KVM_EPERM;
+		goto out;
+	}
+
 	switch (nr) {
 	case KVM_HC_VAPIC_POLL_IRQ:
 		ret = 0;
@@ -3224,6 +3229,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		ret = -KVM_ENOSYS;
 		break;
 	}
+out:
 	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
 	++vcpu->stat.hypercalls;
 	return r;
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3ddce03766ca..d73109243fda 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -13,6 +13,7 @@
 #define KVM_ENOSYS		1000
 #define KVM_EFAULT		EFAULT
 #define KVM_E2BIG		E2BIG
+#define KVM_EPERM		EPERM
 
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
-- 
cgit v1.2.3-71-gd317


From da18acffc3f13c4f187c20bc5dcfcb110b374b48 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 10 Aug 2009 15:59:25 +0300
Subject: KVM: export kvm_para.h

kvm_para.h contains userspace interface and so
should be exported.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/asm-generic/Kbuild.asm | 5 +++++
 include/linux/Kbuild           | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm
index 290910e4ede4..96d7c9804dc1 100644
--- a/include/asm-generic/Kbuild.asm
+++ b/include/asm-generic/Kbuild.asm
@@ -3,6 +3,11 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \
 header-y  += kvm.h
 endif
 
+ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \
+		  $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),)
+header-y  += kvm_para.h
+endif
+
 ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \
       		  $(srctree)/include/asm-$(SRCARCH)/a.out.h),)
 unifdef-y += a.out.h
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 334a3593cdfd..cff4a101f266 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -268,6 +268,10 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \
       		  $(srctree)/include/asm-$(SRCARCH)/kvm.h),)
 unifdef-y += kvm.h
 endif
+ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \
+		  $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),)
+unifdef-y += kvm_para.h
+endif
 unifdef-y += llc.h
 unifdef-y += loop.h
 unifdef-y += lp.h
-- 
cgit v1.2.3-71-gd317


From 1a5aeeecd550ee4344cfba1791f1134739b16dc6 Mon Sep 17 00:00:00 2001
From: Maciej Sosnowski <maciej.sosnowski@intel.com>
Date: Thu, 10 Sep 2009 15:05:58 +0200
Subject: dca: registering requesters in multiple dca domains

This patch enables DCA support on multiple-IOH/multiple-IIO architectures.
It modifies dca module by replacing single dca_providers list
with dca_domains list, each domain containing separate list of providers.
This approach lets dca driver manage multiple domains, i.e. sets of providers
and requesters mapped back to the same PCI root complex device.
The driver takes care to register each requester to a provider
from the same domain.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Maciej Sosnowski <maciej.sosnowski@intel.com>
---
 drivers/dca/dca-core.c | 122 +++++++++++++++++++++++++++++++++++++++++++------
 drivers/dma/ioat/pci.c |   2 +-
 include/linux/dca.h    |  11 ++++-
 3 files changed, 120 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c
index 25b743abfb59..7e318de0904b 100644
--- a/drivers/dca/dca-core.c
+++ b/drivers/dca/dca-core.c
@@ -28,7 +28,7 @@
 #include <linux/device.h>
 #include <linux/dca.h>
 
-#define DCA_VERSION "1.8"
+#define DCA_VERSION "1.12.1"
 
 MODULE_VERSION(DCA_VERSION);
 MODULE_LICENSE("GPL");
@@ -36,20 +36,92 @@ MODULE_AUTHOR("Intel Corporation");
 
 static DEFINE_SPINLOCK(dca_lock);
 
-static LIST_HEAD(dca_providers);
+static LIST_HEAD(dca_domains);
 
-static struct dca_provider *dca_find_provider_by_dev(struct device *dev)
+static struct pci_bus *dca_pci_rc_from_dev(struct device *dev)
 {
-	struct dca_provider *dca, *ret = NULL;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_bus *bus = pdev->bus;
 
-	list_for_each_entry(dca, &dca_providers, node) {
-		if ((!dev) || (dca->ops->dev_managed(dca, dev))) {
-			ret = dca;
-			break;
-		}
+	while (bus->parent)
+		bus = bus->parent;
+
+	return bus;
+}
+
+static struct dca_domain *dca_allocate_domain(struct pci_bus *rc)
+{
+	struct dca_domain *domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_NOWAIT);
+	if (!domain)
+		return NULL;
+
+	INIT_LIST_HEAD(&domain->dca_providers);
+	domain->pci_rc = rc;
+
+	return domain;
+}
+
+static void dca_free_domain(struct dca_domain *domain)
+{
+	list_del(&domain->node);
+	kfree(domain);
+}
+
+static struct dca_domain *dca_find_domain(struct pci_bus *rc)
+{
+	struct dca_domain *domain;
+
+	list_for_each_entry(domain, &dca_domains, node)
+		if (domain->pci_rc == rc)
+			return domain;
+
+	return NULL;
+}
+
+static struct dca_domain *dca_get_domain(struct device *dev)
+{
+	struct pci_bus *rc;
+	struct dca_domain *domain;
+
+	rc = dca_pci_rc_from_dev(dev);
+	domain = dca_find_domain(rc);
+
+	if (!domain) {
+		domain = dca_allocate_domain(rc);
+		if (domain)
+			list_add(&domain->node, &dca_domains);
+	}
+
+	return domain;
+}
+
+static struct dca_provider *dca_find_provider_by_dev(struct device *dev)
+{
+	struct dca_provider *dca;
+	struct pci_bus *rc;
+	struct dca_domain *domain;
+
+	if (dev) {
+		rc = dca_pci_rc_from_dev(dev);
+		domain = dca_find_domain(rc);
+		if (!domain)
+			return NULL;
+	} else {
+		if (!list_empty(&dca_domains))
+			domain = list_first_entry(&dca_domains,
+						  struct dca_domain,
+						  node);
+		else
+			return NULL;
 	}
 
-	return ret;
+	list_for_each_entry(dca, &domain->dca_providers, node)
+		if ((!dev) || (dca->ops->dev_managed(dca, dev)))
+			return dca;
+
+	return NULL;
 }
 
 /**
@@ -61,6 +133,8 @@ int dca_add_requester(struct device *dev)
 	struct dca_provider *dca;
 	int err, slot = -ENODEV;
 	unsigned long flags;
+	struct pci_bus *pci_rc;
+	struct dca_domain *domain;
 
 	if (!dev)
 		return -EFAULT;
@@ -74,7 +148,14 @@ int dca_add_requester(struct device *dev)
 		return -EEXIST;
 	}
 
-	list_for_each_entry(dca, &dca_providers, node) {
+	pci_rc = dca_pci_rc_from_dev(dev);
+	domain = dca_find_domain(pci_rc);
+	if (!domain) {
+		spin_unlock_irqrestore(&dca_lock, flags);
+		return -ENODEV;
+	}
+
+	list_for_each_entry(dca, &domain->dca_providers, node) {
 		slot = dca->ops->add_requester(dca, dev);
 		if (slot >= 0)
 			break;
@@ -222,13 +303,19 @@ int register_dca_provider(struct dca_provider *dca, struct device *dev)
 {
 	int err;
 	unsigned long flags;
+	struct dca_domain *domain;
 
 	err = dca_sysfs_add_provider(dca, dev);
 	if (err)
 		return err;
 
 	spin_lock_irqsave(&dca_lock, flags);
-	list_add(&dca->node, &dca_providers);
+	domain = dca_get_domain(dev);
+	if (!domain) {
+		spin_unlock_irqrestore(&dca_lock, flags);
+		return -ENODEV;
+	}
+	list_add(&dca->node, &domain->dca_providers);
 	spin_unlock_irqrestore(&dca_lock, flags);
 
 	blocking_notifier_call_chain(&dca_provider_chain,
@@ -241,15 +328,24 @@ EXPORT_SYMBOL_GPL(register_dca_provider);
  * unregister_dca_provider - remove a dca provider
  * @dca - struct created by alloc_dca_provider()
  */
-void unregister_dca_provider(struct dca_provider *dca)
+void unregister_dca_provider(struct dca_provider *dca, struct device *dev)
 {
 	unsigned long flags;
+	struct pci_bus *pci_rc;
+	struct dca_domain *domain;
 
 	blocking_notifier_call_chain(&dca_provider_chain,
 				     DCA_PROVIDER_REMOVE, NULL);
 
 	spin_lock_irqsave(&dca_lock, flags);
+
 	list_del(&dca->node);
+
+	pci_rc = dca_pci_rc_from_dev(dev);
+	domain = dca_find_domain(pci_rc);
+	if (list_empty(&domain->dca_providers))
+		dca_free_domain(domain);
+
 	spin_unlock_irqrestore(&dca_lock, flags);
 
 	dca_sysfs_remove_provider(dca);
diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c
index c788fa266470..d545fae30f37 100644
--- a/drivers/dma/ioat/pci.c
+++ b/drivers/dma/ioat/pci.c
@@ -175,7 +175,7 @@ static void __devexit ioat_remove(struct pci_dev *pdev)
 
 	dev_err(&pdev->dev, "Removing dma and dca services\n");
 	if (device->dca) {
-		unregister_dca_provider(device->dca);
+		unregister_dca_provider(device->dca, &pdev->dev);
 		free_dca_provider(device->dca);
 		device->dca = NULL;
 	}
diff --git a/include/linux/dca.h b/include/linux/dca.h
index 9c20c7e87d0a..d27a7a05718d 100644
--- a/include/linux/dca.h
+++ b/include/linux/dca.h
@@ -20,6 +20,9 @@
  */
 #ifndef DCA_H
 #define DCA_H
+
+#include <linux/pci.h>
+
 /* DCA Provider API */
 
 /* DCA Notifier Interface */
@@ -36,6 +39,12 @@ struct dca_provider {
 	int			 id;
 };
 
+struct dca_domain {
+	struct list_head	node;
+	struct list_head	dca_providers;
+	struct pci_bus		*pci_rc;
+};
+
 struct dca_ops {
 	int	(*add_requester)    (struct dca_provider *, struct device *);
 	int	(*remove_requester) (struct dca_provider *, struct device *);
@@ -47,7 +56,7 @@ struct dca_ops {
 struct dca_provider *alloc_dca_provider(struct dca_ops *ops, int priv_size);
 void free_dca_provider(struct dca_provider *dca);
 int register_dca_provider(struct dca_provider *dca, struct device *dev);
-void unregister_dca_provider(struct dca_provider *dca);
+void unregister_dca_provider(struct dca_provider *dca, struct device *dev);
 
 static inline void *dca_priv(struct dca_provider *dca)
 {
-- 
cgit v1.2.3-71-gd317


From a82afdfcb8c0df09776b6458af6b68fc58b2e87b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Jul 2009 17:48:16 +0900
Subject: block: use the same failfast bits for bio and request

bio and request use the same set of failfast bits.  This patch makes
the following changes to simplify things.

* enumify BIO_RW* bits and reorder bits such that BIOS_RW_FAILFAST_*
  bits coincide with __REQ_FAILFAST_* bits.

* The above pushes BIO_RW_AHEAD out of sync with __REQ_FAILFAST_DEV
  but the matching is useless anyway.  init_request_from_bio() is
  responsible for setting FAILFAST bits on FS requests and non-FS
  requests never use BIO_RW_AHEAD.  Drop the code and comment from
  blk_rq_bio_prep().

* Define REQ_FAILFAST_MASK which is OR of all FAILFAST bits and
  simplify FAILFAST flags handling in init_request_from_bio().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 19 +++++++------------
 include/linux/bio.h    | 43 +++++++++++++++++++++++--------------------
 include/linux/blkdev.h |  4 ++++
 3 files changed, 34 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index e3299a77a0d8..4daae1ee2b23 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1111,17 +1111,13 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	req->cmd_type = REQ_TYPE_FS;
 
 	/*
-	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+	 * Inherit FAILFAST from bio (for read-ahead, and explicit
+	 * FAILFAST).  FAILFAST flags are identical for req and bio.
 	 */
 	if (bio_rw_ahead(bio))
-		req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
-				   REQ_FAILFAST_DRIVER);
-	if (bio_failfast_dev(bio))
-		req->cmd_flags |= REQ_FAILFAST_DEV;
-	if (bio_failfast_transport(bio))
-		req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	if (bio_failfast_driver(bio))
-		req->cmd_flags |= REQ_FAILFAST_DRIVER;
+		req->cmd_flags |= REQ_FAILFAST_MASK;
+	else
+		req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
 
 	if (unlikely(bio_discard(bio))) {
 		req->cmd_flags |= REQ_DISCARD;
@@ -2239,9 +2235,8 @@ EXPORT_SYMBOL(__blk_end_request_cur);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
-	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
-	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
-	rq->cmd_flags |= (bio->bi_rw & 3);
+	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
+	rq->cmd_flags |= bio->bi_rw & REQ_RW;
 
 	if (bio_has_data(bio)) {
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2892b710771c..a299ed38fcd7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -142,37 +142,40 @@ struct bio {
  *
  * bit 0 -- data direction
  *	If not set, bio is a read from device. If set, it's a write to device.
- * bit 1 -- rw-ahead when set
- * bit 2 -- barrier
+ * bit 1 -- fail fast device errors
+ * bit 2 -- fail fast transport errors
+ * bit 3 -- fail fast driver errors
+ * bit 4 -- rw-ahead when set
+ * bit 5 -- barrier
  *	Insert a serialization point in the IO queue, forcing previously
  *	submitted IO to be completed before this one is issued.
- * bit 3 -- synchronous I/O hint.
- * bit 4 -- Unplug the device immediately after submitting this bio.
- * bit 5 -- metadata request
+ * bit 6 -- synchronous I/O hint.
+ * bit 7 -- Unplug the device immediately after submitting this bio.
+ * bit 8 -- metadata request
  *	Used for tracing to differentiate metadata and data IO. May also
  *	get some preferential treatment in the IO scheduler
- * bit 6 -- discard sectors
+ * bit 9 -- discard sectors
  *	Informs the lower level device that this range of sectors is no longer
  *	used by the file system and may thus be freed by the device. Used
  *	for flash based storage.
- * bit 7 -- fail fast device errors
- * bit 8 -- fail fast transport errors
- * bit 9 -- fail fast driver errors
  *	Don't want driver retries for any fast fail whatever the reason.
  * bit 10 -- Tell the IO scheduler not to wait for more requests after this
 	one has been submitted, even if it is a SYNC request.
  */
-#define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
-#define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
-#define BIO_RW_BARRIER	2
-#define BIO_RW_SYNCIO	3
-#define BIO_RW_UNPLUG	4
-#define BIO_RW_META	5
-#define BIO_RW_DISCARD	6
-#define BIO_RW_FAILFAST_DEV		7
-#define BIO_RW_FAILFAST_TRANSPORT	8
-#define BIO_RW_FAILFAST_DRIVER		9
-#define BIO_RW_NOIDLE	10
+enum bio_rw_flags {
+	BIO_RW,
+	BIO_RW_FAILFAST_DEV,
+	BIO_RW_FAILFAST_TRANSPORT,
+	BIO_RW_FAILFAST_DRIVER,
+	/* above flags must match REQ_* */
+	BIO_RW_AHEAD,
+	BIO_RW_BARRIER,
+	BIO_RW_SYNCIO,
+	BIO_RW_UNPLUG,
+	BIO_RW_META,
+	BIO_RW_DISCARD,
+	BIO_RW_NOIDLE,
+};
 
 #define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 69103e053c92..c3015736d814 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -93,6 +93,7 @@ enum rq_flag_bits {
 	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
 	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
 	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
+	/* above flags must match BIO_RW_* */
 	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
@@ -143,6 +144,9 @@ enum rq_flag_bits {
 #define REQ_NOIDLE	(1 << __REQ_NOIDLE)
 #define REQ_IO_STAT	(1 << __REQ_IO_STAT)
 
+#define REQ_FAILFAST_MASK	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \
+				 REQ_FAILFAST_DRIVER)
+
 #define BLK_MAX_CDB	16
 
 /*
-- 
cgit v1.2.3-71-gd317


From 80a761fd33cf812f771e212139157bf8f58d4b3f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Jul 2009 17:48:17 +0900
Subject: block: implement mixed merge of different failfast requests

Failfast has characteristics from other attributes.  When issuing,
executing and successuflly completing requests, failfast doesn't make
any difference.  It only affects how a request is handled on failure.
Allowing requests with different failfast settings to be merged cause
normal IOs to fail prematurely while not allowing has performance
penalties as failfast is used for read aheads which are likely to be
located near in-flight or to-be-issued normal IOs.

This patch introduces the concept of 'mixed merge'.  A request is a
mixed merge if it is merge of segments which require different
handling on failure.  Currently the only mixable attributes are
failfast ones (or lack thereof).

When a bio with different failfast settings is added to an existing
request or requests of different failfast settings are merged, the
merged request is marked mixed.  Each bio carries failfast settings
and the request always tracks failfast state of the first bio.  When
the request fails, blk_rq_err_bytes() can be used to determine how
many bytes can be safely failed without crossing into an area which
requires further retrials.

This allows request merging regardless of failfast settings while
keeping the failure handling correct.

This patch only implements mixed merge but doesn't enable it.  The
next one will update SCSI to make use of mixed merge.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Niel Lambrechts <niel.lambrechts@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-merge.c      | 43 ++++++++++++++++++++++
 block/blk.h            |  1 +
 include/linux/blkdev.h | 23 +++++++++---
 4 files changed, 161 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 4daae1ee2b23..c822239bcc9d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1157,6 +1157,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	const int unplug = bio_unplug(bio);
+	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
 	if (bio_barrier(bio) && bio_has_data(bio) &&
@@ -1186,6 +1187,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		trace_block_bio_backmerge(q, bio);
 
+		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+			blk_rq_set_mixed_merge(req);
+
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
 		req->__data_len += bytes;
@@ -1205,6 +1209,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		trace_block_bio_frontmerge(q, bio);
 
+		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
+			blk_rq_set_mixed_merge(req);
+			req->cmd_flags &= ~REQ_FAILFAST_MASK;
+			req->cmd_flags |= ff;
+		}
+
 		bio->bi_next = req->bio;
 		req->bio = bio;
 
@@ -1649,6 +1659,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
+/**
+ * blk_rq_err_bytes - determine number of bytes till the next failure boundary
+ * @rq: request to examine
+ *
+ * Description:
+ *     A request could be merge of IOs which require different failure
+ *     handling.  This function determines the number of bytes which
+ *     can be failed from the beginning of the request without
+ *     crossing into area which need to be retried further.
+ *
+ * Return:
+ *     The number of bytes to fail.
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+unsigned int blk_rq_err_bytes(const struct request *rq)
+{
+	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	unsigned int bytes = 0;
+	struct bio *bio;
+
+	if (!(rq->cmd_flags & REQ_MIXED_MERGE))
+		return blk_rq_bytes(rq);
+
+	/*
+	 * Currently the only 'mixing' which can happen is between
+	 * different fastfail types.  We can safely fail portions
+	 * which have all the failfast bits that the first one has -
+	 * the ones which are at least as eager to fail as the first
+	 * one.
+	 */
+	for (bio = rq->bio; bio; bio = bio->bi_next) {
+		if ((bio->bi_rw & ff) != ff)
+			break;
+		bytes += bio->bi_size;
+	}
+
+	/* this could lead to infinite loop */
+	BUG_ON(blk_rq_bytes(rq) && !bytes);
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
+
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (blk_do_io_stat(req)) {
@@ -1995,6 +2049,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	if (blk_fs_request(req) || blk_discard_rq(req))
 		req->__sector += total_bytes >> 9;
 
+	/* mixed attributes always follow the first bio */
+	if (req->cmd_flags & REQ_MIXED_MERGE) {
+		req->cmd_flags &= ~REQ_FAILFAST_MASK;
+		req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
+	}
+
 	/*
 	 * If total number of sectors is less than the first segment
 	 * size, something has gone terribly wrong.
@@ -2173,6 +2233,25 @@ bool blk_end_request_cur(struct request *rq, int error)
 }
 EXPORT_SYMBOL(blk_end_request_cur);
 
+/**
+ * blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ *     Complete @rq till the next failure boundary.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool blk_end_request_err(struct request *rq, int error)
+{
+	WARN_ON(error >= 0);
+	return blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(blk_end_request_err);
+
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
@@ -2232,6 +2311,26 @@ bool __blk_end_request_cur(struct request *rq, int error)
 }
 EXPORT_SYMBOL(__blk_end_request_cur);
 
+/**
+ * __blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ *     Complete @rq till the next failure boundary.  Must be called
+ *     with queue lock held.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool __blk_end_request_err(struct request *rq, int error)
+{
+	WARN_ON(error >= 0);
+	return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_err);
+
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e1999679a4d5..7c9ca01baa45 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	return 1;
 }
 
+/**
+ * blk_rq_set_mixed_merge - mark a request as mixed merge
+ * @rq: request to mark as mixed merge
+ *
+ * Description:
+ *     @rq is about to be mixed merged.  Make sure the attributes
+ *     which can be mixed are set in each bio and mark @rq as mixed
+ *     merged.
+ */
+void blk_rq_set_mixed_merge(struct request *rq)
+{
+	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	struct bio *bio;
+
+	if (rq->cmd_flags & REQ_MIXED_MERGE)
+		return;
+
+	/*
+	 * @rq will no longer represent mixable attributes for all the
+	 * contained bios.  It will just track those of the first one.
+	 * Distributes the attributs to each bio.
+	 */
+	for (bio = rq->bio; bio; bio = bio->bi_next) {
+		WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
+			     (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
+		bio->bi_rw |= ff;
+	}
+	rq->cmd_flags |= REQ_MIXED_MERGE;
+}
+
 static void blk_account_io_merge(struct request *req)
 {
 	if (blk_do_io_stat(req)) {
@@ -365,6 +395,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	if (!ll_merge_requests_fn(q, req, next))
 		return 0;
 
+	/*
+	 * If failfast settings disagree or any of the two is already
+	 * a mixed merge, mark both as mixed before proceeding.  This
+	 * makes sure that all involved bios have mixable attributes
+	 * set properly.
+	 */
+	if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
+	    (req->cmd_flags & REQ_FAILFAST_MASK) !=
+	    (next->cmd_flags & REQ_FAILFAST_MASK)) {
+		blk_rq_set_mixed_merge(req);
+		blk_rq_set_mixed_merge(next);
+	}
+
 	/*
 	 * At this point we have either done a back merge
 	 * or front merge. We need the smaller start_time of
diff --git a/block/blk.h b/block/blk.h
index 3fae6add5430..5ee3d7e72feb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 int attempt_back_merge(struct request_queue *q, struct request *rq);
 int attempt_front_merge(struct request_queue *q, struct request *rq);
 void blk_recalc_rq_segments(struct request *rq);
+void blk_rq_set_mixed_merge(struct request *rq);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c3015736d814..650b6a9cb679 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ enum rq_flag_bits {
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
 	__REQ_NOIDLE,		/* Don't anticipate more IO after this one */
 	__REQ_IO_STAT,		/* account I/O stat */
+	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -143,6 +144,7 @@ enum rq_flag_bits {
 #define REQ_INTEGRITY	(1 << __REQ_INTEGRITY)
 #define REQ_NOIDLE	(1 << __REQ_NOIDLE)
 #define REQ_IO_STAT	(1 << __REQ_IO_STAT)
+#define REQ_MIXED_MERGE	(1 << __REQ_MIXED_MERGE)
 
 #define REQ_FAILFAST_MASK	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \
 				 REQ_FAILFAST_DRIVER)
@@ -832,11 +834,13 @@ static inline void blk_run_address_space(struct address_space *mapping)
 }
 
 /*
- * blk_rq_pos()		: the current sector
- * blk_rq_bytes()	: bytes left in the entire request
- * blk_rq_cur_bytes()	: bytes left in the current segment
- * blk_rq_sectors()	: sectors left in the entire request
- * blk_rq_cur_sectors()	: sectors left in the current segment
+ * blk_rq_pos()			: the current sector
+ * blk_rq_bytes()		: bytes left in the entire request
+ * blk_rq_cur_bytes()		: bytes left in the current segment
+ * blk_rq_err_bytes()		: bytes left till the next error boundary
+ * blk_rq_sectors()		: sectors left in the entire request
+ * blk_rq_cur_sectors()		: sectors left in the current segment
+ * blk_rq_err_sectors()		: sectors left till the next error boundary
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
@@ -853,6 +857,8 @@ static inline int blk_rq_cur_bytes(const struct request *rq)
 	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
 }
 
+extern unsigned int blk_rq_err_bytes(const struct request *rq);
+
 static inline unsigned int blk_rq_sectors(const struct request *rq)
 {
 	return blk_rq_bytes(rq) >> 9;
@@ -863,6 +869,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
+static inline unsigned int blk_rq_err_sectors(const struct request *rq)
+{
+	return blk_rq_err_bytes(rq) >> 9;
+}
+
 /*
  * Request issue related functions.
  */
@@ -889,10 +900,12 @@ extern bool blk_end_request(struct request *rq, int error,
 			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
 extern bool blk_end_request_cur(struct request *rq, int error);
+extern bool blk_end_request_err(struct request *rq, int error);
 extern bool __blk_end_request(struct request *rq, int error,
 			      unsigned int nr_bytes);
 extern void __blk_end_request_all(struct request *rq, int error);
 extern bool __blk_end_request_cur(struct request *rq, int error);
+extern bool __blk_end_request_err(struct request *rq, int error);
 
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
-- 
cgit v1.2.3-71-gd317


From e7e503aedb1f4d165081cb8d47a58c38f80f0cb4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 28 Jul 2009 10:15:30 +0200
Subject: block: make bio_rw_flagged() return a bool

Makes for a saner interface, instead of returning the bit position.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index a299ed38fcd7..4f8fd0221cd2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -177,7 +177,10 @@ enum bio_rw_flags {
 	BIO_RW_NOIDLE,
 };
 
-#define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
+static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
+{
+	return (bio->bi_rw & (1 << flag)) != 0;
+}
 
 /*
  * Old defines, these should eventually be replaced by direct usage of
-- 
cgit v1.2.3-71-gd317


From 1f98a13f623e0ef666690a18c1250335fc6d7ef1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 11 Sep 2009 14:32:04 +0200
Subject: bio: first step in sanitizing the bio->bi_rw flag testing

Get rid of any functions that test for these bits and make callers
use bio_rw_flagged() directly. Then it is at least directly apparent
what variable and flag they check.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c            | 25 +++++++++++++------------
 block/cfq-iosched.c         |  2 +-
 block/elevator.c            |  3 ++-
 drivers/block/loop.c        |  2 +-
 drivers/md/dm-raid1.c       |  2 +-
 drivers/md/dm-stripe.c      |  2 +-
 drivers/md/dm.c             | 12 ++++++------
 drivers/md/linear.c         |  2 +-
 drivers/md/multipath.c      |  4 ++--
 drivers/md/raid0.c          |  2 +-
 drivers/md/raid1.c          | 14 ++++++++------
 drivers/md/raid10.c         |  6 +++---
 drivers/md/raid5.c          |  2 +-
 drivers/staging/dst/dcore.c |  5 +++--
 fs/btrfs/volumes.c          |  4 ++--
 include/linux/bio.h         | 25 +++++++------------------
 include/linux/blkdev.h      |  2 +-
 17 files changed, 54 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index c822239bcc9d..52559715cb90 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1114,24 +1114,24 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	 * Inherit FAILFAST from bio (for read-ahead, and explicit
 	 * FAILFAST).  FAILFAST flags are identical for req and bio.
 	 */
-	if (bio_rw_ahead(bio))
+	if (bio_rw_flagged(bio, BIO_RW_AHEAD))
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 	else
 		req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
 
-	if (unlikely(bio_discard(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
 		req->cmd_flags |= REQ_DISCARD;
-		if (bio_barrier(bio))
+		if (bio_rw_flagged(bio, BIO_RW_BARRIER))
 			req->cmd_flags |= REQ_SOFTBARRIER;
 		req->q->prepare_discard_fn(req->q, req);
-	} else if (unlikely(bio_barrier(bio)))
+	} else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
 		req->cmd_flags |= REQ_HARDBARRIER;
 
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		req->cmd_flags |= REQ_RW_SYNC;
-	if (bio_rw_meta(bio))
+	if (bio_rw_flagged(bio, BIO_RW_META))
 		req->cmd_flags |= REQ_RW_META;
-	if (bio_noidle(bio))
+	if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
@@ -1155,12 +1155,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	int el_ret;
 	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
-	const int sync = bio_sync(bio);
-	const int unplug = bio_unplug(bio);
+	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
-	if (bio_barrier(bio) && bio_has_data(bio) &&
+	if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) &&
 	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
@@ -1174,7 +1174,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(bio_barrier(bio)) || elv_queue_empty(q))
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1470,7 +1470,8 @@ static inline void __generic_make_request(struct bio *bio)
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
 
-		if (bio_discard(bio) && !q->prepare_discard_fn) {
+		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+		    !q->prepare_discard_fn) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ca0d7e71324b..9e6d0af6c990 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -257,7 +257,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
  */
 static inline int cfq_bio_sync(struct bio *bio)
 {
-	if (bio_data_dir(bio) == READ || bio_sync(bio))
+	if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		return 1;
 
 	return 0;
diff --git a/block/elevator.c b/block/elevator.c
index ca861927ba41..1975b619c86d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	/*
 	 * Don't merge file system requests and discard requests
 	 */
-	if (bio_discard(bio) != bio_discard(rq->bio))
+	if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
+	    bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
 		return 0;
 
 	/*
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5757188cd1fb..bbb79441d895 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -475,7 +475,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
 
 	if (bio_rw(bio) == WRITE) {
-		int barrier = bio_barrier(bio);
+		bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
 		struct file *file = lo->lo_backing_file;
 
 		if (barrier) {
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 33f179e66bf5..cc9dc79b0784 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1129,7 +1129,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	if (error == -EOPNOTSUPP)
 		goto out;
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		goto out;
 
 	if (unlikely(error)) {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3e563d251733..fde658ccbcec 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -285,7 +285,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 	if (!error)
 		return 0; /* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		return error;
 
 	if (error == -EOPNOTSUPP)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b4845b14740d..ec012f030240 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -586,7 +586,7 @@ static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md)) {
-				if (!bio_barrier(io->bio))
+				if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
 					bio_list_add_head(&md->deferred,
 							  io->bio);
 			} else
@@ -598,7 +598,7 @@ static void dec_pending(struct dm_io *io, int error)
 		io_error = io->error;
 		bio = io->bio;
 
-		if (bio_barrier(bio)) {
+		if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
 			/*
 			 * There can be just one barrier request so we use
 			 * a per-device variable for error reporting.
@@ -1209,7 +1209,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_table(md);
 	if (unlikely(!ci.map)) {
-		if (!bio_barrier(bio))
+		if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
 			bio_io_error(bio);
 		else
 			if (!md->barrier_error)
@@ -1321,7 +1321,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	 * we have to queue this io for later.
 	 */
 	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio_barrier(bio))) {
+	    unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		up_read(&md->io_lock);
 
 		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1344,7 +1344,7 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct mapped_device *md = q->queuedata;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
@@ -2164,7 +2164,7 @@ static void dm_wq_work(struct work_struct *work)
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else {
-			if (bio_barrier(c))
+			if (bio_rw_flagged(c, BIO_RW_BARRIER))
 				process_barrier(md, c);
 			else
 				__split_and_process_bio(md, c);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5fe39c2a3d2b..ea4842905444 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,7 +288,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	sector_t start_sector;
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 7140909f6662..89e76819f61f 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -90,7 +90,7 @@ static void multipath_end_request(struct bio *bio, int error)
 
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, 0);
-	else if (!bio_rw_ahead(bio)) {
+	else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
 		/*
 		 * oops, IO error:
 		 */
@@ -144,7 +144,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 898e2bdfee47..f845ed98fec9 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -448,7 +448,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8726fd7ebce5..ff7ed3335995 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -782,8 +782,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
-	const int do_sync = bio_sync(bio);
-	int cpu, do_barriers;
+	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	int cpu;
+	bool do_barriers;
 	mdk_rdev_t *blocked_rdev;
 
 	/*
@@ -797,7 +798,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
+	if (unlikely(!mddev->barriers_work &&
+		     bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		if (rw == WRITE)
 			md_write_end(mddev);
 		bio_endio(bio, -EOPNOTSUPP);
@@ -925,7 +927,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio_barrier(bio);
+	do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
 	if (do_barriers)
 		set_bit(R1BIO_Barrier, &r1_bio->state);
 
@@ -1600,7 +1602,7 @@ static void raid1d(mddev_t *mddev)
 			 * We already have a nr_pending reference on these rdevs.
 			 */
 			int i;
-			const int do_sync = bio_sync(r1_bio->master_bio);
+			const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
 			clear_bit(R1BIO_Barrier, &r1_bio->state);
 			for (i=0; i < conf->raid_disks; i++)
@@ -1654,7 +1656,7 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				const int do_sync = bio_sync(r1_bio->master_bio);
+				const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 				r1_bio->bios[r1_bio->read_disk] =
 					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3d9020cf6f6e..d0a2152e064f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -796,12 +796,12 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
-	const int do_sync = bio_sync(bio);
+	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
 	struct bio_list bl;
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
@@ -1610,7 +1610,7 @@ static void raid10d(mddev_t *mddev)
 				raid_end_bio_io(r10_bio);
 				bio_put(bio);
 			} else {
-				const int do_sync = bio_sync(r10_bio->master_bio);
+				const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO);
 				bio_put(bio);
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b8a2c5dc67ba..826eb3467357 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3606,7 +3606,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	const int rw = bio_data_dir(bi);
 	int cpu, remaining;
 
-	if (unlikely(bio_barrier(bi))) {
+	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
 		bio_endio(bi, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c
index fad25b753042..b1c258ca2102 100644
--- a/drivers/staging/dst/dcore.c
+++ b/drivers/staging/dst/dcore.c
@@ -112,8 +112,9 @@ static int dst_request(struct request_queue *q, struct bio *bio)
 		 * I worked with.
 		 *
 		 * Empty barriers are not allowed anyway, see 51fd77bd9f512
-		 * for example, although later it was changed to bio_discard()
-		 * only, which does not work in this case.
+		 * for example, although later it was changed to
+		 * bio_rw_flagged(bio, BIO_RW_DISCARD) only, which does not
+		 * work in this case.
 		 */
 		//err = -EOPNOTSUPP;
 		err = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
 		num_run++;
 		batch_run++;
 
-		if (bio_sync(cur))
+		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
 			num_sync_run++;
 
 		if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4f8fd0221cd2..5be93f18d842 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -177,28 +177,17 @@ enum bio_rw_flags {
 	BIO_RW_NOIDLE,
 };
 
+/*
+ * First four bits must match between bio->bi_rw and rq->cmd_flags, make
+ * that explicit here.
+ */
+#define BIO_RW_RQ_MASK		0xf
+
 static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
 {
 	return (bio->bi_rw & (1 << flag)) != 0;
 }
 
-/*
- * Old defines, these should eventually be replaced by direct usage of
- * bio_rw_flagged()
- */
-#define bio_barrier(bio)	bio_rw_flagged(bio, BIO_RW_BARRIER)
-#define bio_sync(bio)		bio_rw_flagged(bio, BIO_RW_SYNCIO)
-#define bio_unplug(bio)		bio_rw_flagged(bio, BIO_RW_UNPLUG)
-#define bio_failfast_dev(bio)	bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV)
-#define bio_failfast_transport(bio)	\
-		bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT)
-#define bio_failfast_driver(bio) 	\
-		bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER)
-#define bio_rw_ahead(bio)	bio_rw_flagged(bio, BIO_RW_AHEAD)
-#define bio_rw_meta(bio)	bio_rw_flagged(bio, BIO_RW_META)
-#define bio_discard(bio)	bio_rw_flagged(bio, BIO_RW_DISCARD)
-#define bio_noidle(bio)		bio_rw_flagged(bio, BIO_RW_NOIDLE)
-
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
  */
@@ -222,7 +211,7 @@ static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
 #define bio_offset(bio)		bio_iovec((bio))->bv_offset
 #define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
-#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
+#define bio_empty_barrier(bio)	(bio_rw_flagged(bio, BIO_RW_BARRIER) && !bio_has_data(bio) && !bio_rw_flagged(bio, BIO_RW_DISCARD))
 
 static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 650b6a9cb679..88edb62def82 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -86,7 +86,7 @@ enum {
 };
 
 /*
- * request type modified bits. first two bits match BIO_RW* bits, important
+ * request type modified bits. first four bits match BIO_RW* bits, important
  */
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
-- 
cgit v1.2.3-71-gd317


From fb1e75389bd06fd5987e9cda1b4e0305c782f854 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 30 Jul 2009 08:18:24 +0200
Subject: block: improve queue_should_plug() by looking at IO depths

Instead of just checking whether this device uses block layer
tagging, we can improve the detection by looking at the maximum
queue depth it has reached. If that crosses 4, then deem it a
queuing device.

This is important on high IOPS devices, since plugging hurts
the performance there (it can be as much as 10-15% of the sys
time).

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 11 +++++++++--
 include/linux/blkdev.h |  2 ++
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 52559715cb90..93051d151635 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1146,7 +1146,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
  */
 static inline bool queue_should_plug(struct request_queue *q)
 {
-	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
+	return !(blk_queue_nonrot(q) && blk_queue_queuing(q));
 }
 
 static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1857,8 +1857,15 @@ void blk_dequeue_request(struct request *rq)
 	 * and to it is freed is accounted as io that is in progress at
 	 * the driver side.
 	 */
-	if (blk_account_rq(rq))
+	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]++;
+		/*
+		 * Mark this device as supporting hardware queuing, if
+		 * we have more IOs in flight than 4.
+		 */
+		if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
+			set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
+	}
 }
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 88edb62def82..98b45633a27e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -459,6 +459,7 @@ struct request_queue
 #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
+#define QUEUE_FLAG_CQ	       16	/* hardware does queuing */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
@@ -581,6 +582,7 @@ enum {
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
+#define blk_queue_queuing(q)	test_bit(QUEUE_FLAG_CQ, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
-- 
cgit v1.2.3-71-gd317


From 5e605b64a183a6c0e84cdb99a6f8acb1f8200437 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 5 Aug 2009 09:07:21 +0200
Subject: block: add blk-iopoll, a NAPI like approach for block devices

This borrows some code from NAPI and implements a polled completion
mode for block devices. The idea is the same as NAPI - instead of
doing the command completion when the irq occurs, schedule a dedicated
softirq in the hopes that we will complete more IO when the iopoll
handler is invoked. Devices have a budget of commands assigned, and will
stay in polled mode as long as they continue to consume their budget
from the iopoll softirq handler. If they do not, the device is set back
to interrupt completion mode.

This patch holds the core bits for blk-iopoll, device driver support
sold separately.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Makefile             |   2 +-
 block/blk-iopoll.c         | 220 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-iopoll.h |  41 +++++++++
 include/linux/interrupt.h  |   1 +
 kernel/sysctl.c            |  10 ++-
 5 files changed, 272 insertions(+), 2 deletions(-)
 create mode 100644 block/blk-iopoll.c
 create mode 100644 include/linux/blk-iopoll.h

(limited to 'include/linux')

diff --git a/block/Makefile b/block/Makefile
index 6c54ed0ff755..ba74ca6bfa14 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			ioctl.o genhd.o scsi_ioctl.o
+			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
new file mode 100644
index 000000000000..566db1e7c1c7
--- /dev/null
+++ b/block/blk-iopoll.c
@@ -0,0 +1,220 @@
+/*
+ * Functions related to interrupt-poll handling in the block layer. This
+ * is similar to NAPI for network devices.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/blk-iopoll.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+
+int blk_iopoll_enabled = 1;
+EXPORT_SYMBOL(blk_iopoll_enabled);
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
+
+/**
+ * blk_iopoll_sched - Schedule a run of the iopoll handler
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Add this blk_iopoll structure to the pending poll list and trigger the raise
+ *     of the blk iopoll softirq. The driver must already have gotten a succesful
+ *     return from blk_iopoll_sched_prep() before calling this.
+ **/
+void blk_iopoll_sched(struct blk_iopoll *iop)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
+	__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_sched);
+
+/**
+ * __blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     See blk_iopoll_complete(). This function must be called with interrupts disabled.
+ **/
+void __blk_iopoll_complete(struct blk_iopoll *iop)
+{
+	list_del(&iop->list);
+	smp_mb__before_clear_bit();
+	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(__blk_iopoll_complete);
+
+/**
+ * blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     If a driver consumes less than the assigned budget in its run of the iopoll
+ *     handler, it'll end the polled mode by calling this function. The iopoll handler
+ *     will not be invoked again before blk_iopoll_sched_prep() is called.
+ **/
+void blk_iopoll_complete(struct blk_iopoll *iopoll)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__blk_iopoll_complete(iopoll);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_complete);
+
+static void blk_iopoll_softirq(struct softirq_action *h)
+{
+	struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+	unsigned long start_time = jiffies;
+	int rearm = 0, budget = 64;
+
+	local_irq_disable();
+
+	while (!list_empty(list)) {
+		struct blk_iopoll *iop;
+		int work, weight;
+
+		/*
+		 * If softirq window is exhausted then punt.
+		 */
+		if (budget <= 0 || time_after(jiffies, start_time)) {
+			rearm = 1;
+			break;
+		}
+
+		local_irq_enable();
+
+		/* Even though interrupts have been re-enabled, this
+		 * access is safe because interrupts can only add new
+		 * entries to the tail of this list, and only ->poll()
+		 * calls can remove this head entry from the list.
+		 */
+		iop = list_entry(list->next, struct blk_iopoll, list);
+
+		weight = iop->weight;
+		work = 0;
+		if (test_bit(IOPOLL_F_SCHED, &iop->state))
+			work = iop->poll(iop, weight);
+
+		budget -= work;
+
+		local_irq_disable();
+
+		/* Drivers must not modify the NAPI state if they
+		 * consume the entire weight.  In such cases this code
+		 * still "owns" the NAPI instance and therefore can
+		 * move the instance around on the list at-will.
+		 */
+		if (work >= weight) {
+			if (blk_iopoll_disable_pending(iop))
+				__blk_iopoll_complete(iop);
+			else
+				list_move_tail(&iop->list, list);
+		}
+	}
+
+	if (rearm)
+		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+
+	local_irq_enable();
+}
+
+/**
+ * blk_iopoll_disable - Disable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Disable io polling and wait for any pending callbacks to have completed.
+ **/
+void blk_iopoll_disable(struct blk_iopoll *iop)
+{
+	set_bit(IOPOLL_F_DISABLE, &iop->state);
+	while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
+		msleep(1);
+	clear_bit(IOPOLL_F_DISABLE, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_disable);
+
+/**
+ * blk_iopoll_enable - Enable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Enable iopoll on this @iop. Note that the handler run will not be scheduled, it
+ *     will only mark it as active.
+ **/
+void blk_iopoll_enable(struct blk_iopoll *iop)
+{
+	BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
+        smp_mb__before_clear_bit();
+	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_enable);
+
+/**
+ * blk_iopoll_init - Initialize this @iop
+ * @iop:      The parent iopoll structure
+ * @weight:   The default weight (or command completion budget)
+ * @poll_fn:  The handler to invoke
+ *
+ * Description:
+ *     Initialize this blk_iopoll structure. Before being actively used, the driver
+ *     must call blk_iopoll_enable().
+ **/
+void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
+{
+	memset(iop, 0, sizeof(*iop));
+	INIT_LIST_HEAD(&iop->list);
+	iop->weight = weight;
+	iop->poll = poll_fn;
+	set_bit(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_init);
+
+static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
+					  unsigned long action, void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+				 &__get_cpu_var(blk_cpu_iopoll));
+		raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = {
+	.notifier_call	= blk_iopoll_cpu_notify,
+};
+
+static __init int blk_iopoll_setup(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
+
+	open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
+	register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
+	return 0;
+}
+subsys_initcall(blk_iopoll_setup);
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
new file mode 100644
index 000000000000..b2e1739a2e7b
--- /dev/null
+++ b/include/linux/blk-iopoll.h
@@ -0,0 +1,41 @@
+#ifndef BLK_IOPOLL_H
+#define BLK_IOPOLL_H
+
+struct blk_iopoll;
+typedef int (blk_iopoll_fn)(struct blk_iopoll *, int);
+
+struct blk_iopoll {
+	struct list_head list;
+	unsigned long state;
+	unsigned long data;
+	int weight;
+	int max;
+	blk_iopoll_fn *poll;
+};
+
+enum {
+	IOPOLL_F_SCHED		= 0,
+	IOPOLL_F_DISABLE	= 1,
+};
+
+static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
+{
+	return !test_bit(IOPOLL_F_DISABLE, &iop->state) &&
+		!test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
+}
+
+static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
+{
+	return test_bit(IOPOLL_F_DISABLE, &iop->state);
+}
+
+extern void blk_iopoll_sched(struct blk_iopoll *);
+extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *);
+extern void blk_iopoll_complete(struct blk_iopoll *);
+extern void __blk_iopoll_complete(struct blk_iopoll *);
+extern void blk_iopoll_enable(struct blk_iopoll *);
+extern void blk_iopoll_disable(struct blk_iopoll *);
+
+extern int blk_iopoll_enabled;
+
+#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 35e7df1e9f30..edd8d5c90394 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -344,6 +344,7 @@ enum
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
 	BLOCK_SOFTIRQ,
+	BLOCK_IOPOLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
 	HRTIMER_SOFTIRQ,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd0..0ed9fa6f322e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,6 +92,7 @@ extern int sysctl_nr_trim_pages;
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+extern int blk_iopoll_enabled;
 
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -990,7 +991,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "blk_iopoll",
+		.data		= &blk_iopoll_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.2.3-71-gd317


From d62f843b295393124970d29316344150c7de009b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 11 Aug 2009 08:31:14 +0200
Subject: block: make blk_iopoll_prep_sched() follow normal 0/1 return
 convention

Return 0 if we successfully marked this iopoll structure as ours for
scheduling, instead of 1.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blk-iopoll.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
index b2e1739a2e7b..308734d3d4a2 100644
--- a/include/linux/blk-iopoll.h
+++ b/include/linux/blk-iopoll.h
@@ -18,10 +18,17 @@ enum {
 	IOPOLL_F_DISABLE	= 1,
 };
 
+/*
+ * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating
+ * that we were the first to acquire this iop for scheduling. If this iop
+ * is currently disabled, return "failure".
+ */
 static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
 {
-	return !test_bit(IOPOLL_F_DISABLE, &iop->state) &&
-		!test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
+	if (!test_bit(IOPOLL_F_DISABLE, &iop->state))
+		return test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
+
+	return 1;
 }
 
 static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
-- 
cgit v1.2.3-71-gd317


From 01e97f6b897bf06ec83375d691f2f4d57f5b3a09 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 3 Sep 2009 20:06:47 +0200
Subject: block: enable rq CPU completion affinity by default

Test results here look good, and on big OLTP runs it has also shown
to significantly increase cycles attributed to the database and
cause a performance boost.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blkdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 98b45633a27e..8bf1a10e4d88 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -463,7 +463,8 @@ struct request_queue
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
-				 (1 << QUEUE_FLAG_STACKABLE))
+				 (1 << QUEUE_FLAG_STACKABLE)	|	\
+				 (1 << QUEUE_FLAG_SAME_COMP))
 
 static inline int queue_is_locked(struct request_queue *q)
 {
-- 
cgit v1.2.3-71-gd317


From 48659d31195bb76d688e99dabd816c5472fb1656 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Sep 2009 11:36:23 -0400
Subject: tracing: move tgid out of generic entry and into userstack

The userstack trace required the recording of the tgid entry.
Unfortunately, it was added to the generic entry where it wasted
4 bytes of every entry and was only used by one entry.

This patch moves it out of the generic field and moves it into the
only user (userstack_entry).

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 1 -
 kernel/trace/trace.c         | 2 +-
 kernel/trace/trace.h         | 1 +
 kernel/trace/trace_events.c  | 5 +----
 kernel/trace/trace_output.c  | 2 +-
 5 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 23f7179bf74e..06c795b536c2 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -34,7 +34,6 @@ struct trace_entry {
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
-	int			tgid;
 };
 
 #define FTRACE_MAX_EVENT						\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c75deeefe30..1a37da2e8534 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -886,7 +886,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
-	entry->tgid			= (tsk) ? tsk->tgid : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1068,6 +1067,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 		return;
 	entry	= ring_buffer_event_data(event);
 
+	entry->tgid		= current->tgid;
 	memset(&entry->caller, 0, sizeof(entry->caller));
 
 	trace.nr_entries	= 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index acaa68060ebc..b69697b4b846 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -101,6 +101,7 @@ struct stack_entry {
 
 struct userstack_entry {
 	struct trace_entry	ent;
+	unsigned int		tgid;
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
 };
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 78b1ed230177..28d92027a93c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -86,7 +86,6 @@ int trace_define_common_fields(struct ftrace_event_call *call)
 	__common_field(unsigned char, flags);
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
-	__common_field(int, tgid);
 
 	return ret;
 }
@@ -572,13 +571,11 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
 				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
-				FIELD(int, pid),
-				FIELD(int, tgid));
+				FIELD(int, pid));
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..be34a6aa7e4d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 		 * since individual threads might have already quit!
 		 */
 		rcu_read_lock();
-		task = find_task_by_vpid(entry->ent.tgid);
+		task = find_task_by_vpid(entry->tgid);
 		if (task)
 			mm = get_task_mm(task);
 		rcu_read_unlock();
-- 
cgit v1.2.3-71-gd317


From 074835f0143b83845af5044af2739c52c9f53808 Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Wed, 9 Sep 2009 12:05:39 -0400
Subject: intel-iommu: Fix kernel hang if interrupt remapping disabled in BIOS

BIOS clear DMAR table INTR_REMAP flag to disable interrupt remapping. Current
kernel only check interrupt remapping(IR) flag in DRHD's extended capability
register to decide interrupt remapping support or not. But IR flag will not
change when BIOS disable/enable interrupt remapping.

When user disable interrupt remapping in BIOS or BIOS often defaultly disable
interrupt remapping feature when BIOS is not mature.Though BIOS disable
interrupt remapping but intr_remapping_supported function will always report
to OS support interrupt remapping if VT-d2 chipset populated. On this
cases, kernel will continue enable interrupt remapping and result kernel panic.
This bug exist on almost all platforms with interrupt remapping support.

This patch add DMAR table INTR_REMAP flag check before enable interrupt
remapping.

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/dmar.c           | 10 ++++++++++
 drivers/pci/intr_remapping.c |  3 +++
 include/linux/intel-iommu.h  |  2 ++
 3 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index fba4f6891680..270ed222a075 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -1316,3 +1316,13 @@ int dmar_reenable_qi(struct intel_iommu *iommu)
 
 	return 0;
 }
+
+/*
+ * Check interrupt remapping support in DMAR table description.
+ */
+int dmar_ir_support(void)
+{
+	struct acpi_table_dmar *dmar;
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	return dmar->flags & 0x1;
+}
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index ebfa47b79c5b..ac065144c01c 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -611,6 +611,9 @@ int __init intr_remapping_supported(void)
 	if (disable_intremap)
 		return 0;
 
+	if (!dmar_ir_support())
+		return 0;
+
 	for_each_drhd_unit(drhd) {
 		struct intel_iommu *iommu = drhd->iommu;
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 482dc91fd53a..4f0a72a9740c 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -360,4 +360,6 @@ extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
 
 extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
+extern int dmar_ir_support(void);
+
 #endif
-- 
cgit v1.2.3-71-gd317


From 637e7e864103a7a68c1ce43ada27dfc25c0d113f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Sep 2009 13:55:35 -0400
Subject: tracing: add lock depth to entries

This patch adds the lock depth of the big kernel lock to the generic
entry header. This way we can see the depth of the lock and help
in removing the BKL.

Example:

 #                  _------=> CPU#
 #                 / _-----=> irqs-off
 #                | / _----=> need-resched
 #                || / _---=> hardirq/softirq
 #                ||| / _--=> preempt-depth
 #                |||| /_--=> lock-depth
 #                |||||/     delay
 #  cmd     pid   |||||| time  |   caller
 #     \   /      ||||||   \   |   /
   <idle>-0       2.N..3 5902255250us+: lock_acquire: read rcu_read_lock
   <idle>-0       2.N..3 5902255253us+: lock_release: rcu_read_lock
   <idle>-0       2dN..3 5902255257us+: lock_acquire: xtime_lock
   <idle>-0       2dN..4 5902255259us : lock_acquire: clocksource_lock
   <idle>-0       2dN..4 5902255261us+: lock_release: clocksource_lock

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h         |  1 +
 kernel/trace/trace.c                 |  9 +++++----
 kernel/trace/trace_events.c          |  5 ++++-
 kernel/trace/trace_functions_graph.c | 16 ++++++++++++----
 kernel/trace/trace_output.c          | 10 +++++++++-
 5 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 06c795b536c2..0608b0ff2635 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -34,6 +34,7 @@ struct trace_entry {
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
+	int			lock_depth;
 };
 
 #define FTRACE_MAX_EVENT						\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a37da2e8534..3b918283cf94 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -886,6 +886,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->lock_depth		= (tsk) ? tsk->lock_depth : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1530,10 +1531,10 @@ static void print_lat_help_header(struct seq_file *m)
 	seq_puts(m, "#                | / _----=> need-resched    \n");
 	seq_puts(m, "#                || / _---=> hardirq/softirq \n");
 	seq_puts(m, "#                ||| / _--=> preempt-depth   \n");
-	seq_puts(m, "#                |||| /                      \n");
-	seq_puts(m, "#                |||||     delay             \n");
-	seq_puts(m, "#  cmd     pid   ||||| time  |   caller      \n");
-	seq_puts(m, "#     \\   /      |||||   \\   |   /           \n");
+	seq_puts(m, "#                |||| /_--=> lock-depth       \n");
+	seq_puts(m, "#                |||||/     delay             \n");
+	seq_puts(m, "#  cmd     pid   |||||| time  |   caller      \n");
+	seq_puts(m, "#     \\   /      ||||||   \\   |   /           \n");
 }
 
 static void print_func_help_header(struct seq_file *m)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 28d92027a93c..975f324a07e7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -86,6 +86,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
 	__common_field(unsigned char, flags);
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
+	__common_field(int, lock_depth);
 
 	return ret;
 }
@@ -571,11 +572,13 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
 				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
-				FIELD(int, pid));
+				FIELD(int, pid),
+				FIELD(int, lock_depth));
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ee791a9650c5..48af49374384 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -368,6 +368,7 @@ static enum print_line_t
 print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
 	int hardirq, softirq;
+	int ret;
 
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -382,6 +383,13 @@ print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 				hardirq ? 'h' : softirq ? 's' : '.'))
 		return 0;
 
+	if (entry->lock_depth < 0)
+		ret = trace_seq_putc(s, '.');
+	else
+		ret = trace_seq_printf(s, "%d", entry->lock_depth);
+	if (!ret)
+		return 0;
+
 	if (entry->preempt_count)
 		return trace_seq_printf(s, "%x", entry->preempt_count);
 	return trace_seq_puts(s, ".");
@@ -1001,8 +1009,8 @@ static void print_lat_header(struct seq_file *s)
 	seq_printf(s, "#%.*s / _----=> need-resched    \n", size, spaces);
 	seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
 	seq_printf(s, "#%.*s|| / _--=> preempt-depth   \n", size, spaces);
-	seq_printf(s, "#%.*s||| /                      \n", size, spaces);
-	seq_printf(s, "#%.*s||||                       \n", size, spaces);
+	seq_printf(s, "#%.*s||| / _-=> lock-depth      \n", size, spaces);
+	seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
 }
 
 static void print_graph_headers(struct seq_file *s)
@@ -1021,7 +1029,7 @@ static void print_graph_headers(struct seq_file *s)
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
 		seq_printf(s, "  TASK/PID       ");
 	if (lat)
-		seq_printf(s, "||||");
+		seq_printf(s, "|||||");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "  DURATION   ");
 	seq_printf(s, "               FUNCTION CALLS\n");
@@ -1035,7 +1043,7 @@ static void print_graph_headers(struct seq_file *s)
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
 		seq_printf(s, "   |    |        ");
 	if (lat)
-		seq_printf(s, "||||");
+		seq_printf(s, "|||||");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "   |   |      ");
 	seq_printf(s, "               |   |   |   |\n");
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index be34a6aa7e4d..29a370a45582 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -465,6 +465,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
 	char comm[TASK_COMM_LEN];
+	int ret;
 
 	trace_find_cmdline(entry->pid, comm);
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
@@ -481,9 +482,16 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 				hardirq ? 'h' : softirq ? 's' : '.'))
 		return 0;
 
+	if (entry->lock_depth < 0)
+		ret = trace_seq_putc(s, '.');
+	else
+		ret = trace_seq_printf(s, "%d", entry->lock_depth);
+	if (!ret)
+		return 0;
+
 	if (entry->preempt_count)
 		return trace_seq_printf(s, "%x", entry->preempt_count);
-	return trace_seq_puts(s, ".");
+	return trace_seq_putc(s, '.');
 }
 
 static unsigned long preempt_mark_thresh = 100;
-- 
cgit v1.2.3-71-gd317


From 4cfc7e6019caa3e97d2a81c48c8d575d7b38d751 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <iyer@netapp.com>
Date: Thu, 10 Sep 2009 17:32:28 +0300
Subject: nfsd41: sunrpc: Added rpc server-side backchannel handling

When the call direction is a reply, copy the xid and call direction into the
req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header returns
rpc_garbage.

Signed-off-by: Rahul Iyer <iyer@netapp.com>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[get rid of CONFIG_NFSD_V4_1]
[sunrpc: refactoring of svc_tcp_recvfrom]
[nfsd41: sunrpc: create common send routine for the fore and the back channels]
[nfsd41: sunrpc: Use free_page() to free server backchannel pages]
[nfsd41: sunrpc: Document server backchannel locking]
[nfsd41: sunrpc: remove bc_connect_worker()]
[nfsd41: sunrpc: Define xprt_server_backchannel()[
[nfsd41: sunrpc: remove bc_close and bc_init_auto_disconnect dummy functions]
[nfsd41: sunrpc: eliminate unneeded switch statement in xs_setup_tcp()]
[nfsd41: sunrpc: Don't auto close the server backchannel connection]
[nfsd41: sunrpc: Remove unused functions]
Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: change bc_sock to bc_xprt]
[nfsd41: sunrpc: move struct rpc_buffer def into a common header file]
[nfsd41: sunrpc: use rpc_sleep in bc_send_request so not to block on mutex]
[removed cosmetic changes]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[sunrpc: add new xprt class for nfsv4.1 backchannel]
[sunrpc: v2.1 change handling of auto_close and init_auto_disconnect operations for the nfsv4.1 backchannel]
Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
[reverted more cosmetic leftovers]
[got rid of xprt_server_backchannel]
[separated "nfsd41: sunrpc: add new xprt class for nfsv4.1 backchannel"]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Cc: Trond Myklebust <trond.myklebust@netapp.com>
[sunrpc: change idle timeout value for the backchannel]
Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Acked-by: Trond Myklebust <trond.myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/sunrpc/svc_xprt.h |   1 +
 include/linux/sunrpc/svcsock.h  |   1 +
 include/linux/sunrpc/xprt.h     |   1 +
 net/sunrpc/sunrpc.h             |   4 +
 net/sunrpc/svc_xprt.c           |   2 +
 net/sunrpc/svcsock.c            | 172 ++++++++++++++++++++++++++++++++--------
 net/sunrpc/xprt.c               |  15 +++-
 net/sunrpc/xprtsock.c           | 146 ++++++++++++++++++++++++++++++++++
 8 files changed, 303 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 2223ae0b5ed5..5f4e18b3ce73 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -65,6 +65,7 @@ struct svc_xprt {
 	size_t			xpt_locallen;	/* length of address */
 	struct sockaddr_storage	xpt_remote;	/* remote peer's address */
 	size_t			xpt_remotelen;	/* length of address */
+	struct rpc_wait_queue	xpt_bc_pending;	/* backchannel wait queue */
 };
 
 int	svc_reg_xprt_class(struct svc_xprt_class *);
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 04dba23c59f2..1b353a76c304 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -28,6 +28,7 @@ struct svc_sock {
 	/* private TCP part */
 	u32			sk_reclen;	/* length of record */
 	u32			sk_tcplen;	/* current read length */
+	struct rpc_xprt		*sk_bc_xprt;	/* NFSv4.1 backchannel xprt */
 };
 
 /*
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index c090df442572..228d694dbb90 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -179,6 +179,7 @@ struct rpc_xprt {
 	spinlock_t		reserve_lock;	/* lock slot table */
 	u32			xid;		/* Next XID value to use */
 	struct rpc_task *	snd_task;	/* Task blocked in send */
+	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
 #if defined(CONFIG_NFS_V4_1)
 	struct svc_serv		*bc_serv;       /* The RPC service which will */
 						/* process the callback */
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 13171e63f51b..90c292e2738b 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -43,5 +43,9 @@ static inline int rpc_reply_expected(struct rpc_task *task)
 		(task->tk_msg.rpc_proc->p_decode != NULL);
 }
 
+int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
+		    struct page *headpage, unsigned long headoffset,
+		    struct page *tailpage, unsigned long tailoffset);
+
 #endif /* _NET_SUNRPC_SUNRPC_H */
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 912dea558ccc..df124f78ee48 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -160,6 +160,7 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
 	mutex_init(&xprt->xpt_mutex);
 	spin_lock_init(&xprt->xpt_lock);
 	set_bit(XPT_BUSY, &xprt->xpt_flags);
+	rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
 }
 EXPORT_SYMBOL_GPL(svc_xprt_init);
 
@@ -810,6 +811,7 @@ int svc_send(struct svc_rqst *rqstp)
 	else
 		len = xprt->xpt_ops->xpo_sendto(rqstp);
 	mutex_unlock(&xprt->xpt_mutex);
+	rpc_wake_up(&xprt->xpt_bc_pending);
 	svc_xprt_release(rqstp);
 
 	if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 76a380d37de4..ccc5e83cae5d 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -49,6 +49,7 @@
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/xprt.h>
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
@@ -153,49 +154,27 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
 }
 
 /*
- * Generic sendto routine
+ * send routine intended to be shared by the fore- and back-channel
  */
-static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
+		    struct page *headpage, unsigned long headoffset,
+		    struct page *tailpage, unsigned long tailoffset)
 {
-	struct svc_sock	*svsk =
-		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
-	struct socket	*sock = svsk->sk_sock;
-	int		slen;
-	union {
-		struct cmsghdr	hdr;
-		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
-	} buffer;
-	struct cmsghdr *cmh = &buffer.hdr;
-	int		len = 0;
 	int		result;
 	int		size;
 	struct page	**ppage = xdr->pages;
 	size_t		base = xdr->page_base;
 	unsigned int	pglen = xdr->page_len;
 	unsigned int	flags = MSG_MORE;
-	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+	int		slen;
+	int		len = 0;
 
 	slen = xdr->len;
 
-	if (rqstp->rq_prot == IPPROTO_UDP) {
-		struct msghdr msg = {
-			.msg_name	= &rqstp->rq_addr,
-			.msg_namelen	= rqstp->rq_addrlen,
-			.msg_control	= cmh,
-			.msg_controllen	= sizeof(buffer),
-			.msg_flags	= MSG_MORE,
-		};
-
-		svc_set_cmsg_data(rqstp, cmh);
-
-		if (sock_sendmsg(sock, &msg, 0) < 0)
-			goto out;
-	}
-
 	/* send head */
 	if (slen == xdr->head[0].iov_len)
 		flags = 0;
-	len = kernel_sendpage(sock, rqstp->rq_respages[0], 0,
+	len = kernel_sendpage(sock, headpage, headoffset,
 				  xdr->head[0].iov_len, flags);
 	if (len != xdr->head[0].iov_len)
 		goto out;
@@ -219,16 +198,58 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
 		base = 0;
 		ppage++;
 	}
+
 	/* send tail */
 	if (xdr->tail[0].iov_len) {
-		result = kernel_sendpage(sock, rqstp->rq_respages[0],
-					     ((unsigned long)xdr->tail[0].iov_base)
-						& (PAGE_SIZE-1),
-					     xdr->tail[0].iov_len, 0);
-
+		result = kernel_sendpage(sock, tailpage, tailoffset,
+				   xdr->tail[0].iov_len, 0);
 		if (result > 0)
 			len += result;
 	}
+
+out:
+	return len;
+}
+
+
+/*
+ * Generic sendto routine
+ */
+static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+{
+	struct svc_sock	*svsk =
+		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+	struct socket	*sock = svsk->sk_sock;
+	union {
+		struct cmsghdr	hdr;
+		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
+	} buffer;
+	struct cmsghdr *cmh = &buffer.hdr;
+	int		len = 0;
+	unsigned long tailoff;
+	unsigned long headoff;
+	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+	if (rqstp->rq_prot == IPPROTO_UDP) {
+		struct msghdr msg = {
+			.msg_name	= &rqstp->rq_addr,
+			.msg_namelen	= rqstp->rq_addrlen,
+			.msg_control	= cmh,
+			.msg_controllen	= sizeof(buffer),
+			.msg_flags	= MSG_MORE,
+		};
+
+		svc_set_cmsg_data(rqstp, cmh);
+
+		if (sock_sendmsg(sock, &msg, 0) < 0)
+			goto out;
+	}
+
+	tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
+	headoff = 0;
+	len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
+			       rqstp->rq_respages[0], tailoff);
+
 out:
 	dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
 		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
@@ -951,6 +972,57 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	return -EAGAIN;
 }
 
+static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp,
+			       struct rpc_rqst **reqpp, struct kvec *vec)
+{
+	struct rpc_rqst *req = NULL;
+	u32 *p;
+	u32 xid;
+	u32 calldir;
+	int len;
+
+	len = svc_recvfrom(rqstp, vec, 1, 8);
+	if (len < 0)
+		goto error;
+
+	p = (u32 *)rqstp->rq_arg.head[0].iov_base;
+	xid = *p++;
+	calldir = *p;
+
+	if (calldir == 0) {
+		/* REQUEST is the most common case */
+		vec[0] = rqstp->rq_arg.head[0];
+	} else {
+		/* REPLY */
+		if (svsk->sk_bc_xprt)
+			req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);
+
+		if (!req) {
+			printk(KERN_NOTICE
+				"%s: Got unrecognized reply: "
+				"calldir 0x%x sk_bc_xprt %p xid %08x\n",
+				__func__, ntohl(calldir),
+				svsk->sk_bc_xprt, xid);
+			vec[0] = rqstp->rq_arg.head[0];
+			goto out;
+		}
+
+		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+		       sizeof(struct xdr_buf));
+		/* copy the xid and call direction */
+		memcpy(req->rq_private_buf.head[0].iov_base,
+		       rqstp->rq_arg.head[0].iov_base, 8);
+		vec[0] = req->rq_private_buf.head[0];
+	}
+ out:
+	vec[0].iov_base += 8;
+	vec[0].iov_len -= 8;
+	len = svsk->sk_reclen - 8;
+ error:
+	*reqpp = req;
+	return len;
+}
+
 /*
  * Receive data from a TCP socket.
  */
@@ -962,6 +1034,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	int		len;
 	struct kvec *vec;
 	int pnum, vlen;
+	struct rpc_rqst *req = NULL;
 
 	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
 		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
@@ -975,9 +1048,27 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	vec = rqstp->rq_vec;
 	vec[0] = rqstp->rq_arg.head[0];
 	vlen = PAGE_SIZE;
+
+	/*
+	 * We have enough data for the whole tcp record. Let's try and read the
+	 * first 8 bytes to get the xid and the call direction. We can use this
+	 * to figure out if this is a call or a reply to a callback. If
+	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
+	 * In that case, don't bother with the calldir and just read the data.
+	 * It will be rejected in svc_process.
+	 */
+	if (len >= 8) {
+		len = svc_process_calldir(svsk, rqstp, &req, vec);
+		if (len < 0)
+			goto err_again;
+		vlen -= 8;
+	}
+
 	pnum = 1;
 	while (vlen < len) {
-		vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]);
+		vec[pnum].iov_base = (req) ?
+			page_address(req->rq_private_buf.pages[pnum - 1]) :
+			page_address(rqstp->rq_pages[pnum]);
 		vec[pnum].iov_len = PAGE_SIZE;
 		pnum++;
 		vlen += PAGE_SIZE;
@@ -989,6 +1080,16 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (len < 0)
 		goto err_again;
 
+	/*
+	 * Account for the 8 bytes we read earlier
+	 */
+	len += 8;
+
+	if (req) {
+		xprt_complete_rqst(req->rq_task, len);
+		len = 0;
+		goto out;
+	}
 	dprintk("svc: TCP complete record (%d bytes)\n", len);
 	rqstp->rq_arg.len = len;
 	rqstp->rq_arg.page_base = 0;
@@ -1002,6 +1103,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_xprt_ctxt   = NULL;
 	rqstp->rq_prot	      = IPPROTO_TCP;
 
+out:
 	/* Reset TCP read info */
 	svsk->sk_reclen = 0;
 	svsk->sk_tcplen = 0;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f412a852bc73..fd46d42afa89 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -832,6 +832,11 @@ static void xprt_timer(struct rpc_task *task)
 	spin_unlock_bh(&xprt->transport_lock);
 }
 
+static inline int xprt_has_timer(struct rpc_xprt *xprt)
+{
+	return xprt->idle_timeout != 0;
+}
+
 /**
  * xprt_prepare_transmit - reserve the transport before sending a request
  * @task: RPC task about to send a request
@@ -1013,7 +1018,7 @@ void xprt_release(struct rpc_task *task)
 	if (!list_empty(&req->rq_list))
 		list_del(&req->rq_list);
 	xprt->last_used = jiffies;
-	if (list_empty(&xprt->recv))
+	if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
 		mod_timer(&xprt->timer,
 				xprt->last_used + xprt->idle_timeout);
 	spin_unlock_bh(&xprt->transport_lock);
@@ -1082,8 +1087,11 @@ found:
 #endif /* CONFIG_NFS_V4_1 */
 
 	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
-	setup_timer(&xprt->timer, xprt_init_autodisconnect,
-			(unsigned long)xprt);
+	if (xprt_has_timer(xprt))
+		setup_timer(&xprt->timer, xprt_init_autodisconnect,
+			    (unsigned long)xprt);
+	else
+		init_timer(&xprt->timer);
 	xprt->last_used = jiffies;
 	xprt->cwnd = RPC_INITCWND;
 	xprt->bind_index = 0;
@@ -1102,7 +1110,6 @@ found:
 
 	dprintk("RPC:       created transport %p with %u slots\n", xprt,
 			xprt->max_reqs);
-
 	return xprt;
 }
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 62438f3a914d..d9a2b815714e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -32,6 +32,7 @@
 #include <linux/tcp.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/xprtsock.h>
 #include <linux/file.h>
 #ifdef CONFIG_NFS_V4_1
@@ -43,6 +44,7 @@
 #include <net/udp.h>
 #include <net/tcp.h>
 
+#include "sunrpc.h"
 /*
  * xprtsock tunables
  */
@@ -2098,6 +2100,134 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 			xprt->stat.bklog_u);
 }
 
+/*
+ * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason
+ * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
+ * to use the server side send routines.
+ */
+void *bc_malloc(struct rpc_task *task, size_t size)
+{
+	struct page *page;
+	struct rpc_buffer *buf;
+
+	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
+	page = alloc_page(GFP_KERNEL);
+
+	if (!page)
+		return NULL;
+
+	buf = page_address(page);
+	buf->len = PAGE_SIZE;
+
+	return buf->data;
+}
+
+/*
+ * Free the space allocated in the bc_alloc routine
+ */
+void bc_free(void *buffer)
+{
+	struct rpc_buffer *buf;
+
+	if (!buffer)
+		return;
+
+	buf = container_of(buffer, struct rpc_buffer, data);
+	free_page((unsigned long)buf);
+}
+
+/*
+ * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
+ * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
+ */
+static int bc_sendto(struct rpc_rqst *req)
+{
+	int len;
+	struct xdr_buf *xbufp = &req->rq_snd_buf;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct socket *sock = transport->sock;
+	unsigned long headoff;
+	unsigned long tailoff;
+
+	/*
+	 * Set up the rpc header and record marker stuff
+	 */
+	xs_encode_tcp_record_marker(xbufp);
+
+	tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
+	headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
+	len = svc_send_common(sock, xbufp,
+			      virt_to_page(xbufp->head[0].iov_base), headoff,
+			      xbufp->tail[0].iov_base, tailoff);
+
+	if (len != xbufp->len) {
+		printk(KERN_NOTICE "Error sending entire callback!\n");
+		len = -EAGAIN;
+	}
+
+	return len;
+}
+
+/*
+ * The send routine. Borrows from svc_send
+ */
+static int bc_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct svc_xprt	*xprt;
+	struct svc_sock         *svsk;
+	u32                     len;
+
+	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
+	/*
+	 * Get the server socket associated with this callback xprt
+	 */
+	xprt = req->rq_xprt->bc_xprt;
+	svsk = container_of(xprt, struct svc_sock, sk_xprt);
+
+	/*
+	 * Grab the mutex to serialize data as the connection is shared
+	 * with the fore channel
+	 */
+	if (!mutex_trylock(&xprt->xpt_mutex)) {
+		rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL);
+		if (!mutex_trylock(&xprt->xpt_mutex))
+			return -EAGAIN;
+		rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task);
+	}
+	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
+		len = -ENOTCONN;
+	else
+		len = bc_sendto(req);
+	mutex_unlock(&xprt->xpt_mutex);
+
+	if (len > 0)
+		len = 0;
+
+	return len;
+}
+
+/*
+ * The close routine. Since this is client initiated, we do nothing
+ */
+
+static void bc_close(struct rpc_xprt *xprt)
+{
+	return;
+}
+
+/*
+ * The xprt destroy routine. Again, because this connection is client
+ * initiated, we do nothing
+ */
+
+static void bc_destroy(struct rpc_xprt *xprt)
+{
+	return;
+}
+
 static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
@@ -2134,6 +2264,22 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 };
 
+/*
+ * The rpc_xprt_ops for the server backchannel
+ */
+
+static struct rpc_xprt_ops bc_tcp_ops = {
+	.reserve_xprt		= xprt_reserve_xprt,
+	.release_xprt		= xprt_release_xprt,
+	.buf_alloc		= bc_malloc,
+	.buf_free		= bc_free,
+	.send_request		= bc_send_request,
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
+	.close			= bc_close,
+	.destroy		= bc_destroy,
+	.print_stats		= xs_tcp_print_stats,
+};
+
 static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
 				      unsigned int slot_table_size)
 {
-- 
cgit v1.2.3-71-gd317


From f17a1f06d2fa93f4825be572622eb02c4894db4e Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Tue, 4 Aug 2009 01:01:02 +0100
Subject: ARM: 5636/1: Move vendor enum to AMBA include

This moves the primecell vendor enum definition inside vic.c
out to linux/amba/bus.h where it belongs and replace any
occurances of specific vendor ID:s with the respective enums
instead.

Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/common/vic.c    | 12 ++++--------
 drivers/mmc/host/mmci.c  |  6 +++---
 include/linux/amba/bus.h |  5 +++++
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c
index bc1f9ad61ff6..920ced0b73c5 100644
--- a/arch/arm/common/vic.c
+++ b/arch/arm/common/vic.c
@@ -22,6 +22,7 @@
 #include <linux/list.h>
 #include <linux/io.h>
 #include <linux/sysdev.h>
+#include <linux/amba/bus.h>
 
 #include <asm/mach/irq.h>
 #include <asm/hardware/vic.h>
@@ -272,11 +273,6 @@ static struct irq_chip vic_chip = {
 static void vik_init_st(void __iomem *base, unsigned int irq_start,
 			 u32 vic_sources);
 
-enum vic_vendor {
-	VENDOR_ARM = 0x41,
-	VENDOR_ST = 0x80,
-};
-
 /**
  * vic_init - initialise a vectored interrupt controller
  * @base: iomem base address
@@ -289,7 +285,7 @@ void __init vic_init(void __iomem *base, unsigned int irq_start,
 {
 	unsigned int i;
 	u32 cellid = 0;
-	enum vic_vendor vendor;
+	enum amba_vendor vendor;
 
 	/* Identify which VIC cell this one is, by reading the ID */
 	for (i = 0; i < 4; i++) {
@@ -301,13 +297,13 @@ void __init vic_init(void __iomem *base, unsigned int irq_start,
 	       base, cellid, vendor);
 
 	switch(vendor) {
-	case VENDOR_ST:
+	case AMBA_VENDOR_ST:
 		vik_init_st(base, irq_start, vic_sources);
 		return;
 	default:
 		printk(KERN_WARNING "VIC: unknown vendor, continuing anyways\n");
 		/* fall through */
-	case VENDOR_ARM:
+	case AMBA_VENDOR_ARM:
 		break;
 	}
 
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index e1aa8471ab1c..d78b1e460d7d 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -430,7 +430,7 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 				clk = 255;
 			host->cclk = host->mclk / (2 * (clk + 1));
 		}
-		if (host->hw_designer == 0x80)
+		if (host->hw_designer == AMBA_VENDOR_ST)
 			clk |= MCI_FCEN; /* Bug fix in ST IP block */
 		clk |= MCI_CLK_ENABLE;
 	}
@@ -443,7 +443,7 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 		break;
 	case MMC_POWER_UP:
 		/* The ST version does not have this, fall through to POWER_ON */
-		if (host->hw_designer != 0x80) {
+		if (host->hw_designer != AMBA_VENDOR_ST) {
 			pwr |= MCI_PWR_UP;
 			break;
 		}
@@ -453,7 +453,7 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 	}
 
 	if (ios->bus_mode == MMC_BUSMODE_OPENDRAIN) {
-		if (host->hw_designer != 0x80)
+		if (host->hw_designer != AMBA_VENDOR_ST)
 			pwr |= MCI_ROD;
 		else {
 			/*
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 9b93cafa82a0..ab94335b4bb9 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -36,6 +36,11 @@ struct amba_driver {
 	struct amba_id		*id_table;
 };
 
+enum amba_vendor {
+	AMBA_VENDOR_ARM = 0x41,
+	AMBA_VENDOR_ST = 0x80,
+};
+
 #define amba_get_drvdata(d)	dev_get_drvdata(&d->dev)
 #define amba_set_drvdata(d,p)	dev_set_drvdata(&d->dev, p)
 
-- 
cgit v1.2.3-71-gd317


From 18668ff9a3232d5f942a2f7abc1ad67d2760dcdf Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 6 Sep 2009 18:49:48 +0200
Subject: firewire: core: header file cleanup

fw_card_get, fw_card_put, fw_card_release are currently not exported for
use outside the firewire-core.  Move their definitions/ declarations
from the subsystem header file to the core header file.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core.h  | 14 ++++++++++++++
 include/linux/firewire.h | 14 --------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 6052816be353..7ff6e7585152 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -96,6 +96,20 @@ int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset);
 int fw_compute_block_crc(u32 *block);
 void fw_schedule_bm_work(struct fw_card *card, unsigned long delay);
 
+static inline struct fw_card *fw_card_get(struct fw_card *card)
+{
+	kref_get(&card->kref);
+
+	return card;
+}
+
+void fw_card_release(struct kref *kref);
+
+static inline void fw_card_put(struct fw_card *card)
+{
+	kref_put(&card->kref, fw_card_release);
+}
+
 
 /* -cdev */
 
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 192d1e43c43c..7e1d4dec83e7 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -134,20 +134,6 @@ struct fw_card {
 	u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
 };
 
-static inline struct fw_card *fw_card_get(struct fw_card *card)
-{
-	kref_get(&card->kref);
-
-	return card;
-}
-
-void fw_card_release(struct kref *kref);
-
-static inline void fw_card_put(struct fw_card *card)
-{
-	kref_put(&card->kref, fw_card_release);
-}
-
 struct fw_attribute_group {
 	struct attribute_group *groups[2];
 	struct attribute_group group;
-- 
cgit v1.2.3-71-gd317


From 65dd2f93febf6345ce9e39d2f6e35ce1122f4a4a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Tue, 30 Jun 2009 16:14:17 -0300
Subject: V4L/DVB (12149): videodev2.h: Reorganize fourcc table

With the changes this file suffered along the time, things got a little disorganized.
In particular, V4L2_PIX_FMT_YVYU were shown as a device-specific format, instead of
yet another variant of YUV.

There's no functional change on this patch. It just adds some comments and reorder
fourcc formats to their proper places.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 74f16876f38d..eddd5ff99c80 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -275,7 +275,9 @@ struct v4l2_pix_format {
 	__u32			priv;		/* private data, depends on pixelformat */
 };
 
-/*      Pixel format         FOURCC                        depth  Description  */
+/*      Pixel format         FOURCC                          depth  Description  */
+
+/* RGB formats */
 #define V4L2_PIX_FMT_RGB332  v4l2_fourcc('R', 'G', 'B', '1') /*  8  RGB-3-3-2     */
 #define V4L2_PIX_FMT_RGB444  v4l2_fourcc('R', '4', '4', '4') /* 16  xxxxrrrr ggggbbbb */
 #define V4L2_PIX_FMT_RGB555  v4l2_fourcc('R', 'G', 'B', 'O') /* 16  RGB-5-5-5     */
@@ -286,12 +288,20 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_RGB24   v4l2_fourcc('R', 'G', 'B', '3') /* 24  RGB-8-8-8     */
 #define V4L2_PIX_FMT_BGR32   v4l2_fourcc('B', 'G', 'R', '4') /* 32  BGR-8-8-8-8   */
 #define V4L2_PIX_FMT_RGB32   v4l2_fourcc('R', 'G', 'B', '4') /* 32  RGB-8-8-8-8   */
+
+/* Grey formats */
 #define V4L2_PIX_FMT_GREY    v4l2_fourcc('G', 'R', 'E', 'Y') /*  8  Greyscale     */
 #define V4L2_PIX_FMT_Y16     v4l2_fourcc('Y', '1', '6', ' ') /* 16  Greyscale     */
+
+/* Palette formats */
 #define V4L2_PIX_FMT_PAL8    v4l2_fourcc('P', 'A', 'L', '8') /*  8  8-bit palette */
+
+/* Luminance+Chrominance formats */
 #define V4L2_PIX_FMT_YVU410  v4l2_fourcc('Y', 'V', 'U', '9') /*  9  YVU 4:1:0     */
 #define V4L2_PIX_FMT_YVU420  v4l2_fourcc('Y', 'V', '1', '2') /* 12  YVU 4:2:0     */
 #define V4L2_PIX_FMT_YUYV    v4l2_fourcc('Y', 'U', 'Y', 'V') /* 16  YUV 4:2:2     */
+#define V4L2_PIX_FMT_YYUV    v4l2_fourcc('Y', 'Y', 'U', 'V') /* 16  YUV 4:2:2     */
+#define V4L2_PIX_FMT_YVYU    v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */
 #define V4L2_PIX_FMT_UYVY    v4l2_fourcc('U', 'Y', 'V', 'Y') /* 16  YUV 4:2:2     */
 #define V4L2_PIX_FMT_VYUY    v4l2_fourcc('V', 'Y', 'U', 'Y') /* 16  YUV 4:2:2     */
 #define V4L2_PIX_FMT_YUV422P v4l2_fourcc('4', '2', '2', 'P') /* 16  YVU422 planar */
@@ -301,6 +311,10 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_YUV555  v4l2_fourcc('Y', 'U', 'V', 'O') /* 16  YUV-5-5-5     */
 #define V4L2_PIX_FMT_YUV565  v4l2_fourcc('Y', 'U', 'V', 'P') /* 16  YUV-5-6-5     */
 #define V4L2_PIX_FMT_YUV32   v4l2_fourcc('Y', 'U', 'V', '4') /* 32  YUV-8-8-8-8   */
+#define V4L2_PIX_FMT_YUV410  v4l2_fourcc('Y', 'U', 'V', '9') /*  9  YUV 4:1:0     */
+#define V4L2_PIX_FMT_YUV420  v4l2_fourcc('Y', 'U', '1', '2') /* 12  YUV 4:2:0     */
+#define V4L2_PIX_FMT_HI240   v4l2_fourcc('H', 'I', '2', '4') /*  8  8-bit color   */
+#define V4L2_PIX_FMT_HM12    v4l2_fourcc('H', 'M', '1', '2') /*  8  YUV 4:2:0 16x16 macroblocks */
 
 /* two planes -- one Y, one Cr + Cb interleaved  */
 #define V4L2_PIX_FMT_NV12    v4l2_fourcc('N', 'V', '1', '2') /* 12  Y/CbCr 4:2:0  */
@@ -308,25 +322,17 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_NV16    v4l2_fourcc('N', 'V', '1', '6') /* 16  Y/CbCr 4:2:2  */
 #define V4L2_PIX_FMT_NV61    v4l2_fourcc('N', 'V', '6', '1') /* 16  Y/CrCb 4:2:2  */
 
-/*  The following formats are not defined in the V4L2 specification */
-#define V4L2_PIX_FMT_YUV410  v4l2_fourcc('Y', 'U', 'V', '9') /*  9  YUV 4:1:0     */
-#define V4L2_PIX_FMT_YUV420  v4l2_fourcc('Y', 'U', '1', '2') /* 12  YUV 4:2:0     */
-#define V4L2_PIX_FMT_YYUV    v4l2_fourcc('Y', 'Y', 'U', 'V') /* 16  YUV 4:2:2     */
-#define V4L2_PIX_FMT_HI240   v4l2_fourcc('H', 'I', '2', '4') /*  8  8-bit color   */
-#define V4L2_PIX_FMT_HM12    v4l2_fourcc('H', 'M', '1', '2') /*  8  YUV 4:2:0 16x16 macroblocks */
-
-/* see http://www.siliconimaging.com/RGB%20Bayer.htm */
+/* Bayer formats - see http://www.siliconimaging.com/RGB%20Bayer.htm */
 #define V4L2_PIX_FMT_SBGGR8  v4l2_fourcc('B', 'A', '8', '1') /*  8  BGBG.. GRGR.. */
 #define V4L2_PIX_FMT_SGBRG8  v4l2_fourcc('G', 'B', 'R', 'G') /*  8  GBGB.. RGRG.. */
 #define V4L2_PIX_FMT_SGRBG8  v4l2_fourcc('G', 'R', 'B', 'G') /*  8  GRGR.. BGBG.. */
-
-/*
- * 10bit raw bayer, expanded to 16 bits
- * xxxxrrrrrrrrrrxxxxgggggggggg xxxxggggggggggxxxxbbbbbbbbbb...
- */
-#define V4L2_PIX_FMT_SGRBG10 v4l2_fourcc('B', 'A', '1', '0')
-/* 10bit raw bayer DPCM compressed to 8 bits */
+#define V4L2_PIX_FMT_SGRBG10 v4l2_fourcc('B', 'A', '1', '0') /* 10bit raw bayer */
+	/* 10bit raw bayer DPCM compressed to 8 bits */
 #define V4L2_PIX_FMT_SGRBG10DPCM8 v4l2_fourcc('B', 'D', '1', '0')
+	/*
+	 * 10bit raw bayer, expanded to 16 bits
+	 * xxxxrrrrrrrrrrxxxxgggggggggg xxxxggggggggggxxxxbbbbbbbbbb...
+	 */
 #define V4L2_PIX_FMT_SBGGR16 v4l2_fourcc('B', 'Y', 'R', '2') /* 16  BGBG.. GRGR.. */
 
 /* compressed formats */
@@ -350,7 +356,6 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_MR97310A v4l2_fourcc('M', '3', '1', '0') /* compressed BGGR bayer */
 #define V4L2_PIX_FMT_SQ905C   v4l2_fourcc('9', '0', '5', 'C') /* compressed RGGB bayer */
 #define V4L2_PIX_FMT_PJPG     v4l2_fourcc('P', 'J', 'P', 'G') /* Pixart 73xx JPEG */
-#define V4L2_PIX_FMT_YVYU     v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */
 #define V4L2_PIX_FMT_OV511    v4l2_fourcc('O', '5', '1', '1') /* ov511 JPEG */
 #define V4L2_PIX_FMT_OV518    v4l2_fourcc('O', '5', '1', '8') /* ov518 JPEG */
 
-- 
cgit v1.2.3-71-gd317


From de05f63430e8cb8cde0a21260bc6b01765111e5c Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@skynet.be>
Date: Fri, 26 Jun 2009 12:15:38 -0300
Subject: V4L/DVB (12187): uvcvideo: Move UVC definitions to linux/usb/video.h

To make UVC constants accessible by a future UVC gadget driver, move them from
drivers/media/video/uvc/uvcvideo.h to include/linux/usb/video.h.

Signed-off-by: Laurent Pinchart <laurent.pinchart@skynet.be>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/uvc/uvcvideo.h | 136 +-----------------------------
 include/linux/usb/video.h          | 164 +++++++++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+), 135 deletions(-)
 create mode 100644 include/linux/usb/video.h

(limited to 'include/linux')

diff --git a/drivers/media/video/uvc/uvcvideo.h b/drivers/media/video/uvc/uvcvideo.h
index cc5023e09ca9..5cf68f590725 100644
--- a/drivers/media/video/uvc/uvcvideo.h
+++ b/drivers/media/video/uvc/uvcvideo.h
@@ -67,144 +67,12 @@ struct uvc_xu_control {
 #ifdef __KERNEL__
 
 #include <linux/poll.h>
+#include <linux/usb/video.h>
 
 /* --------------------------------------------------------------------------
  * UVC constants
  */
 
-#define UVC_SC_UNDEFINED				0x00
-#define UVC_SC_VIDEOCONTROL				0x01
-#define UVC_SC_VIDEOSTREAMING				0x02
-#define UVC_SC_VIDEO_INTERFACE_COLLECTION		0x03
-
-#define UVC_PC_PROTOCOL_UNDEFINED			0x00
-
-/* VideoControl class specific interface descriptor */
-#define UVC_VC_DESCRIPTOR_UNDEFINED			0x00
-#define UVC_VC_HEADER					0x01
-#define UVC_VC_INPUT_TERMINAL				0x02
-#define UVC_VC_OUTPUT_TERMINAL				0x03
-#define UVC_VC_SELECTOR_UNIT				0x04
-#define UVC_VC_PROCESSING_UNIT				0x05
-#define UVC_VC_EXTENSION_UNIT				0x06
-
-/* VideoStreaming class specific interface descriptor */
-#define UVC_VS_UNDEFINED				0x00
-#define UVC_VS_INPUT_HEADER				0x01
-#define UVC_VS_OUTPUT_HEADER				0x02
-#define UVC_VS_STILL_IMAGE_FRAME			0x03
-#define UVC_VS_FORMAT_UNCOMPRESSED			0x04
-#define UVC_VS_FRAME_UNCOMPRESSED			0x05
-#define UVC_VS_FORMAT_MJPEG				0x06
-#define UVC_VS_FRAME_MJPEG				0x07
-#define UVC_VS_FORMAT_MPEG2TS				0x0a
-#define UVC_VS_FORMAT_DV				0x0c
-#define UVC_VS_COLORFORMAT				0x0d
-#define UVC_VS_FORMAT_FRAME_BASED			0x10
-#define UVC_VS_FRAME_FRAME_BASED			0x11
-#define UVC_VS_FORMAT_STREAM_BASED			0x12
-
-/* Endpoint type */
-#define UVC_EP_UNDEFINED				0x00
-#define UVC_EP_GENERAL					0x01
-#define UVC_EP_ENDPOINT					0x02
-#define UVC_EP_INTERRUPT				0x03
-
-/* Request codes */
-#define UVC_RC_UNDEFINED				0x00
-#define UVC_SET_CUR					0x01
-#define UVC_GET_CUR					0x81
-#define UVC_GET_MIN					0x82
-#define UVC_GET_MAX					0x83
-#define UVC_GET_RES					0x84
-#define UVC_GET_LEN					0x85
-#define UVC_GET_INFO					0x86
-#define UVC_GET_DEF					0x87
-
-/* VideoControl interface controls */
-#define UVC_VC_CONTROL_UNDEFINED			0x00
-#define UVC_VC_VIDEO_POWER_MODE_CONTROL			0x01
-#define UVC_VC_REQUEST_ERROR_CODE_CONTROL		0x02
-
-/* Terminal controls */
-#define UVC_TE_CONTROL_UNDEFINED			0x00
-
-/* Selector Unit controls */
-#define UVC_SU_CONTROL_UNDEFINED			0x00
-#define UVC_SU_INPUT_SELECT_CONTROL			0x01
-
-/* Camera Terminal controls */
-#define UVC_CT_CONTROL_UNDEFINED			0x00
-#define UVC_CT_SCANNING_MODE_CONTROL			0x01
-#define UVC_CT_AE_MODE_CONTROL				0x02
-#define UVC_CT_AE_PRIORITY_CONTROL			0x03
-#define UVC_CT_EXPOSURE_TIME_ABSOLUTE_CONTROL		0x04
-#define UVC_CT_EXPOSURE_TIME_RELATIVE_CONTROL		0x05
-#define UVC_CT_FOCUS_ABSOLUTE_CONTROL			0x06
-#define UVC_CT_FOCUS_RELATIVE_CONTROL			0x07
-#define UVC_CT_FOCUS_AUTO_CONTROL			0x08
-#define UVC_CT_IRIS_ABSOLUTE_CONTROL			0x09
-#define UVC_CT_IRIS_RELATIVE_CONTROL			0x0a
-#define UVC_CT_ZOOM_ABSOLUTE_CONTROL			0x0b
-#define UVC_CT_ZOOM_RELATIVE_CONTROL			0x0c
-#define UVC_CT_PANTILT_ABSOLUTE_CONTROL			0x0d
-#define UVC_CT_PANTILT_RELATIVE_CONTROL			0x0e
-#define UVC_CT_ROLL_ABSOLUTE_CONTROL			0x0f
-#define UVC_CT_ROLL_RELATIVE_CONTROL			0x10
-#define UVC_CT_PRIVACY_CONTROL				0x11
-
-/* Processing Unit controls */
-#define UVC_PU_CONTROL_UNDEFINED			0x00
-#define UVC_PU_BACKLIGHT_COMPENSATION_CONTROL		0x01
-#define UVC_PU_BRIGHTNESS_CONTROL			0x02
-#define UVC_PU_CONTRAST_CONTROL				0x03
-#define UVC_PU_GAIN_CONTROL				0x04
-#define UVC_PU_POWER_LINE_FREQUENCY_CONTROL		0x05
-#define UVC_PU_HUE_CONTROL				0x06
-#define UVC_PU_SATURATION_CONTROL			0x07
-#define UVC_PU_SHARPNESS_CONTROL			0x08
-#define UVC_PU_GAMMA_CONTROL				0x09
-#define UVC_PU_WHITE_BALANCE_TEMPERATURE_CONTROL	0x0a
-#define UVC_PU_WHITE_BALANCE_TEMPERATURE_AUTO_CONTROL	0x0b
-#define UVC_PU_WHITE_BALANCE_COMPONENT_CONTROL		0x0c
-#define UVC_PU_WHITE_BALANCE_COMPONENT_AUTO_CONTROL	0x0d
-#define UVC_PU_DIGITAL_MULTIPLIER_CONTROL		0x0e
-#define UVC_PU_DIGITAL_MULTIPLIER_LIMIT_CONTROL		0x0f
-#define UVC_PU_HUE_AUTO_CONTROL				0x10
-#define UVC_PU_ANALOG_VIDEO_STANDARD_CONTROL		0x11
-#define UVC_PU_ANALOG_LOCK_STATUS_CONTROL		0x12
-
-/* VideoStreaming interface controls */
-#define UVC_VS_CONTROL_UNDEFINED			0x00
-#define UVC_VS_PROBE_CONTROL				0x01
-#define UVC_VS_COMMIT_CONTROL				0x02
-#define UVC_VS_STILL_PROBE_CONTROL			0x03
-#define UVC_VS_STILL_COMMIT_CONTROL			0x04
-#define UVC_VS_STILL_IMAGE_TRIGGER_CONTROL		0x05
-#define UVC_VS_STREAM_ERROR_CODE_CONTROL		0x06
-#define UVC_VS_GENERATE_KEY_FRAME_CONTROL		0x07
-#define UVC_VS_UPDATE_FRAME_SEGMENT_CONTROL		0x08
-#define UVC_VS_SYNC_DELAY_CONTROL			0x09
-
-#define UVC_TT_VENDOR_SPECIFIC				0x0100
-#define UVC_TT_STREAMING				0x0101
-
-/* Input Terminal types */
-#define UVC_ITT_VENDOR_SPECIFIC				0x0200
-#define UVC_ITT_CAMERA					0x0201
-#define UVC_ITT_MEDIA_TRANSPORT_INPUT			0x0202
-
-/* Output Terminal types */
-#define UVC_OTT_VENDOR_SPECIFIC				0x0300
-#define UVC_OTT_DISPLAY					0x0301
-#define UVC_OTT_MEDIA_TRANSPORT_OUTPUT			0x0302
-
-/* External Terminal types */
-#define UVC_EXTERNAL_VENDOR_SPECIFIC			0x0400
-#define UVC_COMPOSITE_CONNECTOR				0x0401
-#define UVC_SVIDEO_CONNECTOR				0x0402
-#define UVC_COMPONENT_CONNECTOR				0x0403
-
 #define UVC_TERM_INPUT			0x0000
 #define UVC_TERM_OUTPUT			0x8000
 
@@ -216,8 +84,6 @@ struct uvc_xu_control {
 #define UVC_ENTITY_IS_OTERM(entity) \
 	(((entity)->type & 0x8000) == UVC_TERM_OUTPUT)
 
-#define UVC_STATUS_TYPE_CONTROL		1
-#define UVC_STATUS_TYPE_STREAMING	2
 
 /* ------------------------------------------------------------------------
  * GUIDs
diff --git a/include/linux/usb/video.h b/include/linux/usb/video.h
new file mode 100644
index 000000000000..be436d9ee479
--- /dev/null
+++ b/include/linux/usb/video.h
@@ -0,0 +1,164 @@
+/*
+ * USB Video Class definitions.
+ *
+ * Copyright (C) 2009 Laurent Pinchart <laurent.pinchart@skynet.be>
+ *
+ * This file holds USB constants and structures defined by the USB Device
+ * Class Definition for Video Devices. Unless otherwise stated, comments
+ * below reference relevant sections of the USB Video Class 1.1 specification
+ * available at
+ *
+ * http://www.usb.org/developers/devclass_docs/USB_Video_Class_1_1.zip
+ */
+
+#ifndef __LINUX_USB_VIDEO_H
+#define __LINUX_USB_VIDEO_H
+
+#include <linux/types.h>
+
+/* --------------------------------------------------------------------------
+ * UVC constants
+ */
+
+/* A.2. Video Interface Subclass Codes */
+#define UVC_SC_UNDEFINED				0x00
+#define UVC_SC_VIDEOCONTROL				0x01
+#define UVC_SC_VIDEOSTREAMING				0x02
+#define UVC_SC_VIDEO_INTERFACE_COLLECTION		0x03
+
+/* A.3. Video Interface Protocol Codes */
+#define UVC_PC_PROTOCOL_UNDEFINED			0x00
+
+/* A.5. Video Class-Specific VC Interface Descriptor Subtypes */
+#define UVC_VC_DESCRIPTOR_UNDEFINED			0x00
+#define UVC_VC_HEADER					0x01
+#define UVC_VC_INPUT_TERMINAL				0x02
+#define UVC_VC_OUTPUT_TERMINAL				0x03
+#define UVC_VC_SELECTOR_UNIT				0x04
+#define UVC_VC_PROCESSING_UNIT				0x05
+#define UVC_VC_EXTENSION_UNIT				0x06
+
+/* A.6. Video Class-Specific VS Interface Descriptor Subtypes */
+#define UVC_VS_UNDEFINED				0x00
+#define UVC_VS_INPUT_HEADER				0x01
+#define UVC_VS_OUTPUT_HEADER				0x02
+#define UVC_VS_STILL_IMAGE_FRAME			0x03
+#define UVC_VS_FORMAT_UNCOMPRESSED			0x04
+#define UVC_VS_FRAME_UNCOMPRESSED			0x05
+#define UVC_VS_FORMAT_MJPEG				0x06
+#define UVC_VS_FRAME_MJPEG				0x07
+#define UVC_VS_FORMAT_MPEG2TS				0x0a
+#define UVC_VS_FORMAT_DV				0x0c
+#define UVC_VS_COLORFORMAT				0x0d
+#define UVC_VS_FORMAT_FRAME_BASED			0x10
+#define UVC_VS_FRAME_FRAME_BASED			0x11
+#define UVC_VS_FORMAT_STREAM_BASED			0x12
+
+/* A.7. Video Class-Specific Endpoint Descriptor Subtypes */
+#define UVC_EP_UNDEFINED				0x00
+#define UVC_EP_GENERAL					0x01
+#define UVC_EP_ENDPOINT					0x02
+#define UVC_EP_INTERRUPT				0x03
+
+/* A.8. Video Class-Specific Request Codes */
+#define UVC_RC_UNDEFINED				0x00
+#define UVC_SET_CUR					0x01
+#define UVC_GET_CUR					0x81
+#define UVC_GET_MIN					0x82
+#define UVC_GET_MAX					0x83
+#define UVC_GET_RES					0x84
+#define UVC_GET_LEN					0x85
+#define UVC_GET_INFO					0x86
+#define UVC_GET_DEF					0x87
+
+/* A.9.1. VideoControl Interface Control Selectors */
+#define UVC_VC_CONTROL_UNDEFINED			0x00
+#define UVC_VC_VIDEO_POWER_MODE_CONTROL			0x01
+#define UVC_VC_REQUEST_ERROR_CODE_CONTROL		0x02
+
+/* A.9.2. Terminal Control Selectors */
+#define UVC_TE_CONTROL_UNDEFINED			0x00
+
+/* A.9.3. Selector Unit Control Selectors */
+#define UVC_SU_CONTROL_UNDEFINED			0x00
+#define UVC_SU_INPUT_SELECT_CONTROL			0x01
+
+/* A.9.4. Camera Terminal Control Selectors */
+#define UVC_CT_CONTROL_UNDEFINED			0x00
+#define UVC_CT_SCANNING_MODE_CONTROL			0x01
+#define UVC_CT_AE_MODE_CONTROL				0x02
+#define UVC_CT_AE_PRIORITY_CONTROL			0x03
+#define UVC_CT_EXPOSURE_TIME_ABSOLUTE_CONTROL		0x04
+#define UVC_CT_EXPOSURE_TIME_RELATIVE_CONTROL		0x05
+#define UVC_CT_FOCUS_ABSOLUTE_CONTROL			0x06
+#define UVC_CT_FOCUS_RELATIVE_CONTROL			0x07
+#define UVC_CT_FOCUS_AUTO_CONTROL			0x08
+#define UVC_CT_IRIS_ABSOLUTE_CONTROL			0x09
+#define UVC_CT_IRIS_RELATIVE_CONTROL			0x0a
+#define UVC_CT_ZOOM_ABSOLUTE_CONTROL			0x0b
+#define UVC_CT_ZOOM_RELATIVE_CONTROL			0x0c
+#define UVC_CT_PANTILT_ABSOLUTE_CONTROL			0x0d
+#define UVC_CT_PANTILT_RELATIVE_CONTROL			0x0e
+#define UVC_CT_ROLL_ABSOLUTE_CONTROL			0x0f
+#define UVC_CT_ROLL_RELATIVE_CONTROL			0x10
+#define UVC_CT_PRIVACY_CONTROL				0x11
+
+/* A.9.5. Processing Unit Control Selectors */
+#define UVC_PU_CONTROL_UNDEFINED			0x00
+#define UVC_PU_BACKLIGHT_COMPENSATION_CONTROL		0x01
+#define UVC_PU_BRIGHTNESS_CONTROL			0x02
+#define UVC_PU_CONTRAST_CONTROL				0x03
+#define UVC_PU_GAIN_CONTROL				0x04
+#define UVC_PU_POWER_LINE_FREQUENCY_CONTROL		0x05
+#define UVC_PU_HUE_CONTROL				0x06
+#define UVC_PU_SATURATION_CONTROL			0x07
+#define UVC_PU_SHARPNESS_CONTROL			0x08
+#define UVC_PU_GAMMA_CONTROL				0x09
+#define UVC_PU_WHITE_BALANCE_TEMPERATURE_CONTROL	0x0a
+#define UVC_PU_WHITE_BALANCE_TEMPERATURE_AUTO_CONTROL	0x0b
+#define UVC_PU_WHITE_BALANCE_COMPONENT_CONTROL		0x0c
+#define UVC_PU_WHITE_BALANCE_COMPONENT_AUTO_CONTROL	0x0d
+#define UVC_PU_DIGITAL_MULTIPLIER_CONTROL		0x0e
+#define UVC_PU_DIGITAL_MULTIPLIER_LIMIT_CONTROL		0x0f
+#define UVC_PU_HUE_AUTO_CONTROL				0x10
+#define UVC_PU_ANALOG_VIDEO_STANDARD_CONTROL		0x11
+#define UVC_PU_ANALOG_LOCK_STATUS_CONTROL		0x12
+
+/* A.9.7. VideoStreaming Interface Control Selectors */
+#define UVC_VS_CONTROL_UNDEFINED			0x00
+#define UVC_VS_PROBE_CONTROL				0x01
+#define UVC_VS_COMMIT_CONTROL				0x02
+#define UVC_VS_STILL_PROBE_CONTROL			0x03
+#define UVC_VS_STILL_COMMIT_CONTROL			0x04
+#define UVC_VS_STILL_IMAGE_TRIGGER_CONTROL		0x05
+#define UVC_VS_STREAM_ERROR_CODE_CONTROL		0x06
+#define UVC_VS_GENERATE_KEY_FRAME_CONTROL		0x07
+#define UVC_VS_UPDATE_FRAME_SEGMENT_CONTROL		0x08
+#define UVC_VS_SYNC_DELAY_CONTROL			0x09
+
+/* B.1. USB Terminal Types */
+#define UVC_TT_VENDOR_SPECIFIC				0x0100
+#define UVC_TT_STREAMING				0x0101
+
+/* B.2. Input Terminal Types */
+#define UVC_ITT_VENDOR_SPECIFIC				0x0200
+#define UVC_ITT_CAMERA					0x0201
+#define UVC_ITT_MEDIA_TRANSPORT_INPUT			0x0202
+
+/* B.3. Output Terminal Types */
+#define UVC_OTT_VENDOR_SPECIFIC				0x0300
+#define UVC_OTT_DISPLAY					0x0301
+#define UVC_OTT_MEDIA_TRANSPORT_OUTPUT			0x0302
+
+/* B.4. External Terminal Types */
+#define UVC_EXTERNAL_VENDOR_SPECIFIC			0x0400
+#define UVC_COMPOSITE_CONNECTOR				0x0401
+#define UVC_SVIDEO_CONNECTOR				0x0402
+#define UVC_COMPONENT_CONNECTOR				0x0403
+
+/* 2.4.2.2. Status Packet Type */
+#define UVC_STATUS_TYPE_CONTROL				1
+#define UVC_STATUS_TYPE_STREAMING			2
+
+#endif /* __LINUX_USB_VIDEO_H */
+
-- 
cgit v1.2.3-71-gd317


From ef5b5b7e73038f2c7d0d1b020c6eac6435c2b552 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sat, 20 Jun 2009 05:37:27 -0300
Subject: V4L/DVB (12212): v4l2: add RDS API to videodev2.h

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index eddd5ff99c80..22fc99d58d34 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1186,6 +1186,7 @@ struct v4l2_modulator {
 #define V4L2_TUNER_CAP_LANG2		0x0020
 #define V4L2_TUNER_CAP_SAP		0x0020
 #define V4L2_TUNER_CAP_LANG1		0x0040
+#define V4L2_TUNER_CAP_RDS		0x0080
 
 /*  Flags for the 'rxsubchans' field */
 #define V4L2_TUNER_SUB_MONO		0x0001
@@ -1193,6 +1194,7 @@ struct v4l2_modulator {
 #define V4L2_TUNER_SUB_LANG2		0x0004
 #define V4L2_TUNER_SUB_SAP		0x0004
 #define V4L2_TUNER_SUB_LANG1		0x0008
+#define V4L2_TUNER_SUB_RDS		0x0010
 
 /*  Values for the 'audmode' field */
 #define V4L2_TUNER_MODE_MONO		0x0000
@@ -1217,6 +1219,27 @@ struct v4l2_hw_freq_seek {
 	__u32		      reserved[8];
 };
 
+/*
+ *	R D S
+ */
+
+struct v4l2_rds_data {
+	__u8 	lsb;
+	__u8 	msb;
+	__u8 	block;
+} __attribute__ ((packed));
+
+#define V4L2_RDS_BLOCK_MSK 	 0x7
+#define V4L2_RDS_BLOCK_A 	 0
+#define V4L2_RDS_BLOCK_B 	 1
+#define V4L2_RDS_BLOCK_C 	 2
+#define V4L2_RDS_BLOCK_D 	 3
+#define V4L2_RDS_BLOCK_C_ALT 	 4
+#define V4L2_RDS_BLOCK_INVALID 	 7
+
+#define V4L2_RDS_BLOCK_CORRECTED 0x40
+#define V4L2_RDS_BLOCK_ERROR 	 0x80
+
 /*
  *	A U D I O
  */
-- 
cgit v1.2.3-71-gd317


From 1cb662a3144992259edfd3cf9f54a6b25a913a0f Mon Sep 17 00:00:00 2001
From: Andreas Oberritter <obi@linuxtv.org>
Date: Tue, 14 Jul 2009 20:28:50 -0300
Subject: V4L/DVB (12275): Add two new ioctls: DMX_ADD_PID and DMX_REMOVE_PID

DMX_ADD_PID allows to add multiple PIDs to a transport stream filter
previously set up with DMX_SET_PES_FILTER and output=DMX_OUT_TSDEMUX_TAP.

DMX_REMOVE_PID is used to drop a PID from a filter.

These ioctls are to be used by readers of /dev/dvb/adapterX/demuxY. They
may be called at any time, i.e. before or after the first filter on the
shared file descriptor was started.

They make it possible to record multiple services without the need to de-
or re-multiplex TS packets.

To accomplish this, dmxdev_filter->feed.ts has been converted to a list
of struct dmxdev_feeds, each containing a PID value and a pointer to a
struct dmx_ts_feed.

Signed-off-by: Andreas Oberritter <obi@linuxtv.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/dvb/dvb-core/dmxdev.c | 231 +++++++++++++++++++++++++++---------
 drivers/media/dvb/dvb-core/dmxdev.h |   9 +-
 include/linux/dvb/dmx.h             |   2 +
 3 files changed, 184 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c
index 6d6121eb5d59..3750ff48cba1 100644
--- a/drivers/media/dvb/dvb-core/dmxdev.c
+++ b/drivers/media/dvb/dvb-core/dmxdev.c
@@ -430,6 +430,8 @@ static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len,
 /* stop feed but only mark the specified filter as stopped (state set) */
 static int dvb_dmxdev_feed_stop(struct dmxdev_filter *dmxdevfilter)
 {
+	struct dmxdev_feed *feed;
+
 	dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET);
 
 	switch (dmxdevfilter->type) {
@@ -438,7 +440,8 @@ static int dvb_dmxdev_feed_stop(struct dmxdev_filter *dmxdevfilter)
 		dmxdevfilter->feed.sec->stop_filtering(dmxdevfilter->feed.sec);
 		break;
 	case DMXDEV_TYPE_PES:
-		dmxdevfilter->feed.ts->stop_filtering(dmxdevfilter->feed.ts);
+		list_for_each_entry(feed, &dmxdevfilter->feed.ts, next)
+			feed->ts->stop_filtering(feed->ts);
 		break;
 	default:
 		return -EINVAL;
@@ -449,13 +452,23 @@ static int dvb_dmxdev_feed_stop(struct dmxdev_filter *dmxdevfilter)
 /* start feed associated with the specified filter */
 static int dvb_dmxdev_feed_start(struct dmxdev_filter *filter)
 {
+	struct dmxdev_feed *feed;
+	int ret;
+
 	dvb_dmxdev_filter_state_set(filter, DMXDEV_STATE_GO);
 
 	switch (filter->type) {
 	case DMXDEV_TYPE_SEC:
 		return filter->feed.sec->start_filtering(filter->feed.sec);
 	case DMXDEV_TYPE_PES:
-		return filter->feed.ts->start_filtering(filter->feed.ts);
+		list_for_each_entry(feed, &filter->feed.ts, next) {
+			ret = feed->ts->start_filtering(feed->ts);
+			if (ret < 0) {
+				dvb_dmxdev_feed_stop(filter);
+				return ret;
+			}
+		}
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -487,6 +500,9 @@ static int dvb_dmxdev_feed_restart(struct dmxdev_filter *filter)
 
 static int dvb_dmxdev_filter_stop(struct dmxdev_filter *dmxdevfilter)
 {
+	struct dmxdev_feed *feed;
+	struct dmx_demux *demux;
+
 	if (dmxdevfilter->state < DMXDEV_STATE_GO)
 		return 0;
 
@@ -503,13 +519,12 @@ static int dvb_dmxdev_filter_stop(struct dmxdev_filter *dmxdevfilter)
 		dmxdevfilter->feed.sec = NULL;
 		break;
 	case DMXDEV_TYPE_PES:
-		if (!dmxdevfilter->feed.ts)
-			break;
 		dvb_dmxdev_feed_stop(dmxdevfilter);
-		dmxdevfilter->dev->demux->
-		    release_ts_feed(dmxdevfilter->dev->demux,
-				    dmxdevfilter->feed.ts);
-		dmxdevfilter->feed.ts = NULL;
+		demux = dmxdevfilter->dev->demux;
+		list_for_each_entry(feed, &dmxdevfilter->feed.ts, next) {
+			demux->release_ts_feed(demux, feed->ts);
+			feed->ts = NULL;
+		}
 		break;
 	default:
 		if (dmxdevfilter->state == DMXDEV_STATE_ALLOCATED)
@@ -521,19 +536,88 @@ static int dvb_dmxdev_filter_stop(struct dmxdev_filter *dmxdevfilter)
 	return 0;
 }
 
+static void dvb_dmxdev_delete_pids(struct dmxdev_filter *dmxdevfilter)
+{
+	struct dmxdev_feed *feed, *tmp;
+
+	/* delete all PIDs */
+	list_for_each_entry_safe(feed, tmp, &dmxdevfilter->feed.ts, next) {
+		list_del(&feed->next);
+		kfree(feed);
+	}
+
+	BUG_ON(!list_empty(&dmxdevfilter->feed.ts));
+}
+
 static inline int dvb_dmxdev_filter_reset(struct dmxdev_filter *dmxdevfilter)
 {
 	if (dmxdevfilter->state < DMXDEV_STATE_SET)
 		return 0;
 
+	if (dmxdevfilter->type == DMXDEV_TYPE_PES)
+		dvb_dmxdev_delete_pids(dmxdevfilter);
+
 	dmxdevfilter->type = DMXDEV_TYPE_NONE;
 	dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_ALLOCATED);
 	return 0;
 }
 
+static int dvb_dmxdev_start_feed(struct dmxdev *dmxdev,
+				 struct dmxdev_filter *filter,
+				 struct dmxdev_feed *feed)
+{
+	struct timespec timeout = { 0 };
+	struct dmx_pes_filter_params *para = &filter->params.pes;
+	dmx_output_t otype;
+	int ret;
+	int ts_type;
+	enum dmx_ts_pes ts_pes;
+	struct dmx_ts_feed *tsfeed;
+
+	feed->ts = NULL;
+	otype = para->output;
+
+	ts_pes = (enum dmx_ts_pes)para->pes_type;
+
+	if (ts_pes < DMX_PES_OTHER)
+		ts_type = TS_DECODER;
+	else
+		ts_type = 0;
+
+	if (otype == DMX_OUT_TS_TAP)
+		ts_type |= TS_PACKET;
+	else if (otype == DMX_OUT_TSDEMUX_TAP)
+		ts_type |= TS_PACKET | TS_DEMUX;
+	else if (otype == DMX_OUT_TAP)
+		ts_type |= TS_PACKET | TS_DEMUX | TS_PAYLOAD_ONLY;
+
+	ret = dmxdev->demux->allocate_ts_feed(dmxdev->demux, &feed->ts,
+					      dvb_dmxdev_ts_callback);
+	if (ret < 0)
+		return ret;
+
+	tsfeed = feed->ts;
+	tsfeed->priv = filter;
+
+	ret = tsfeed->set(tsfeed, feed->pid, ts_type, ts_pes, 32768, timeout);
+	if (ret < 0) {
+		dmxdev->demux->release_ts_feed(dmxdev->demux, tsfeed);
+		return ret;
+	}
+
+	ret = tsfeed->start_filtering(tsfeed);
+	if (ret < 0) {
+		dmxdev->demux->release_ts_feed(dmxdev->demux, tsfeed);
+		return ret;
+	}
+
+	return 0;
+}
+
 static int dvb_dmxdev_filter_start(struct dmxdev_filter *filter)
 {
 	struct dmxdev *dmxdev = filter->dev;
+	struct dmxdev_feed *feed;
 	void *mem;
 	int ret, i;
 
@@ -631,56 +715,14 @@ static int dvb_dmxdev_filter_start(struct dmxdev_filter *filter)
 		break;
 	}
 	case DMXDEV_TYPE_PES:
-	{
-		struct timespec timeout = { 0 };
-		struct dmx_pes_filter_params *para = &filter->params.pes;
-		dmx_output_t otype;
-		int ts_type;
-		enum dmx_ts_pes ts_pes;
-		struct dmx_ts_feed **tsfeed = &filter->feed.ts;
-
-		filter->feed.ts = NULL;
-		otype = para->output;
-
-		ts_pes = (enum dmx_ts_pes)para->pes_type;
-
-		if (ts_pes < DMX_PES_OTHER)
-			ts_type = TS_DECODER;
-		else
-			ts_type = 0;
-
-		if (otype == DMX_OUT_TS_TAP)
-			ts_type |= TS_PACKET;
-		else if (otype == DMX_OUT_TSDEMUX_TAP)
-			ts_type |= TS_PACKET | TS_DEMUX;
-		else if (otype == DMX_OUT_TAP)
-			ts_type |= TS_PACKET | TS_DEMUX | TS_PAYLOAD_ONLY;
-
-		ret = dmxdev->demux->allocate_ts_feed(dmxdev->demux,
-						      tsfeed,
-						      dvb_dmxdev_ts_callback);
-		if (ret < 0)
-			return ret;
-
-		(*tsfeed)->priv = filter;
-
-		ret = (*tsfeed)->set(*tsfeed, para->pid, ts_type, ts_pes,
-				     32768, timeout);
-		if (ret < 0) {
-			dmxdev->demux->release_ts_feed(dmxdev->demux,
-						       *tsfeed);
-			return ret;
-		}
-
-		ret = filter->feed.ts->start_filtering(filter->feed.ts);
-		if (ret < 0) {
-			dmxdev->demux->release_ts_feed(dmxdev->demux,
-						       *tsfeed);
-			return ret;
+		list_for_each_entry(feed, &filter->feed.ts, next) {
+			ret = dvb_dmxdev_start_feed(dmxdev, filter, feed);
+			if (ret < 0) {
+				dvb_dmxdev_filter_stop(filter);
+				return ret;
+			}
 		}
-
 		break;
-	}
 	default:
 		return -EINVAL;
 	}
@@ -718,7 +760,7 @@ static int dvb_demux_open(struct inode *inode, struct file *file)
 	dvb_ringbuffer_init(&dmxdevfilter->buffer, NULL, 8192);
 	dmxdevfilter->type = DMXDEV_TYPE_NONE;
 	dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_ALLOCATED);
-	dmxdevfilter->feed.ts = NULL;
+	INIT_LIST_HEAD(&dmxdevfilter->feed.ts);
 	init_timer(&dmxdevfilter->timer);
 
 	dvbdev->users++;
@@ -760,6 +802,55 @@ static inline void invert_mode(dmx_filter_t *filter)
 		filter->mode[i] ^= 0xff;
 }
 
+static int dvb_dmxdev_add_pid(struct dmxdev *dmxdev,
+			      struct dmxdev_filter *filter, u16 pid)
+{
+	struct dmxdev_feed *feed;
+
+	if ((filter->type != DMXDEV_TYPE_PES) ||
+	    (filter->state < DMXDEV_STATE_SET))
+		return -EINVAL;
+
+	/* only TS packet filters may have multiple PIDs */
+	if ((filter->params.pes.output != DMX_OUT_TSDEMUX_TAP) &&
+	    (!list_empty(&filter->feed.ts)))
+		return -EINVAL;
+
+	feed = kzalloc(sizeof(struct dmxdev_feed), GFP_KERNEL);
+	if (feed == NULL)
+		return -ENOMEM;
+
+	feed->pid = pid;
+	list_add(&feed->next, &filter->feed.ts);
+
+	if (filter->state >= DMXDEV_STATE_GO)
+		return dvb_dmxdev_start_feed(dmxdev, filter, feed);
+
+	return 0;
+}
+
+static int dvb_dmxdev_remove_pid(struct dmxdev *dmxdev,
+				  struct dmxdev_filter *filter, u16 pid)
+{
+	struct dmxdev_feed *feed, *tmp;
+
+	if ((filter->type != DMXDEV_TYPE_PES) ||
+	    (filter->state < DMXDEV_STATE_SET))
+		return -EINVAL;
+
+	list_for_each_entry_safe(feed, tmp, &filter->feed.ts, next) {
+		if ((feed->pid == pid) && (feed->ts != NULL)) {
+			feed->ts->stop_filtering(feed->ts);
+			filter->dev->demux->release_ts_feed(filter->dev->demux,
+							    feed->ts);
+			list_del(&feed->next);
+			kfree(feed);
+		}
+	}
+
+	return 0;
+}
+
 static int dvb_dmxdev_filter_set(struct dmxdev *dmxdev,
 				 struct dmxdev_filter *dmxdevfilter,
 				 struct dmx_sct_filter_params *params)
@@ -784,7 +875,10 @@ static int dvb_dmxdev_pes_filter_set(struct dmxdev *dmxdev,
 				     struct dmxdev_filter *dmxdevfilter,
 				     struct dmx_pes_filter_params *params)
 {
+	int ret;
+
 	dvb_dmxdev_filter_stop(dmxdevfilter);
+	dvb_dmxdev_filter_reset(dmxdevfilter);
 
 	if (params->pes_type > DMX_PES_OTHER || params->pes_type < 0)
 		return -EINVAL;
@@ -795,6 +889,11 @@ static int dvb_dmxdev_pes_filter_set(struct dmxdev *dmxdev,
 
 	dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET);
 
+	ret = dvb_dmxdev_add_pid(dmxdev, dmxdevfilter,
+				 dmxdevfilter->params.pes.pid);
+	if (ret < 0)
+		return ret;
+
 	if (params->flags & DMX_IMMEDIATE_START)
 		return dvb_dmxdev_filter_start(dmxdevfilter);
 
@@ -958,6 +1057,24 @@ static int dvb_demux_do_ioctl(struct inode *inode, struct file *file,
 					     &((struct dmx_stc *)parg)->base);
 		break;
 
+	case DMX_ADD_PID:
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		ret = dvb_dmxdev_add_pid(dmxdev, dmxdevfilter, *(u16 *)parg);
+		mutex_unlock(&dmxdevfilter->mutex);
+		break;
+
+	case DMX_REMOVE_PID:
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		ret = dvb_dmxdev_remove_pid(dmxdev, dmxdevfilter, *(u16 *)parg);
+		mutex_unlock(&dmxdevfilter->mutex);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/media/dvb/dvb-core/dmxdev.h b/drivers/media/dvb/dvb-core/dmxdev.h
index 29746e70d325..c1379b56dfb4 100644
--- a/drivers/media/dvb/dvb-core/dmxdev.h
+++ b/drivers/media/dvb/dvb-core/dmxdev.h
@@ -53,13 +53,20 @@ enum dmxdev_state {
 	DMXDEV_STATE_TIMEDOUT
 };
 
+struct dmxdev_feed {
+	u16 pid;
+	struct dmx_ts_feed *ts;
+	struct list_head next;
+};
+
 struct dmxdev_filter {
 	union {
 		struct dmx_section_filter *sec;
 	} filter;
 
 	union {
-		struct dmx_ts_feed *ts;
+		/* list of TS and PES feeds (struct dmxdev_feed) */
+		struct list_head ts;
 		struct dmx_section_feed *sec;
 	} feed;
 
diff --git a/include/linux/dvb/dmx.h b/include/linux/dvb/dmx.h
index fef943738a24..f078f3ac82d4 100644
--- a/include/linux/dvb/dmx.h
+++ b/include/linux/dvb/dmx.h
@@ -151,5 +151,7 @@ struct dmx_stc {
 #define DMX_GET_CAPS             _IOR('o', 48, dmx_caps_t)
 #define DMX_SET_SOURCE           _IOW('o', 49, dmx_source_t)
 #define DMX_GET_STC              _IOWR('o', 50, struct dmx_stc)
+#define DMX_ADD_PID              _IOW('o', 51, __u16)
+#define DMX_REMOVE_PID           _IOW('o', 52, __u16)
 
 #endif /*_DVBDMX_H_*/
-- 
cgit v1.2.3-71-gd317


From 6ace40effd34331a604c5eeae90838cf8dd7eb8f Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sat, 20 Jun 2009 09:50:39 -0300
Subject: V4L/DVB (12316): v4l: add V4L2_CAP_RDS_OUTPUT and V4L2_CAP_MODULATOR
 caps

Add capabilities to describe an FM transmitter device.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 22fc99d58d34..293ba181f95a 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -252,10 +252,12 @@ struct v4l2_capability {
 #define V4L2_CAP_RDS_CAPTURE		0x00000100  /* RDS data capture */
 #define V4L2_CAP_VIDEO_OUTPUT_OVERLAY	0x00000200  /* Can do video output overlay */
 #define V4L2_CAP_HW_FREQ_SEEK		0x00000400  /* Can do hardware frequency seek  */
+#define V4L2_CAP_RDS_OUTPUT		0x00000800  /* Is an RDS encoder */
 
 #define V4L2_CAP_TUNER			0x00010000  /* has a tuner */
 #define V4L2_CAP_AUDIO			0x00020000  /* has audio support */
 #define V4L2_CAP_RADIO			0x00040000  /* is a radio device */
+#define V4L2_CAP_MODULATOR		0x00080000  /* has a modulator */
 
 #define V4L2_CAP_READWRITE              0x01000000  /* read/write systemcalls */
 #define V4L2_CAP_ASYNCIO                0x02000000  /* async I/O */
-- 
cgit v1.2.3-71-gd317


From 6b5a9492ca0c991bab1ac495624e17520e9edf18 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Tue, 11 Aug 2009 18:47:18 -0300
Subject: V4L/DVB (12543): v4l: introduce string control support.

The upcoming RDS encoder needs support for string controls. This patch
implements the core implementation.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/v4l2-common.c         |  2 +
 drivers/media/video/v4l2-compat-ioctl32.c | 65 ++++++++++++++++++++++---------
 drivers/media/video/v4l2-ioctl.c          | 10 ++---
 include/linux/videodev2.h                 |  6 ++-
 4 files changed, 57 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index b91d66a767d7..536150c34067 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -156,6 +156,8 @@ int v4l2_ctrl_check(struct v4l2_ext_control *ctrl, struct v4l2_queryctrl *qctrl,
 		return -EINVAL;
 	if (qctrl->flags & V4L2_CTRL_FLAG_GRABBED)
 		return -EBUSY;
+	if (qctrl->type == V4L2_CTRL_TYPE_STRING)
+		return 0;
 	if (qctrl->type == V4L2_CTRL_TYPE_BUTTON ||
 	    qctrl->type == V4L2_CTRL_TYPE_INTEGER64 ||
 	    qctrl->type == V4L2_CTRL_TYPE_CTRL_CLASS)
diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c
index 0056b115b42e..f788c41a3a5c 100644
--- a/drivers/media/video/v4l2-compat-ioctl32.c
+++ b/drivers/media/video/v4l2-compat-ioctl32.c
@@ -600,9 +600,35 @@ struct v4l2_ext_controls32 {
        compat_caddr_t controls; /* actually struct v4l2_ext_control32 * */
 };
 
+struct v4l2_ext_control32 {
+	__u32 id;
+	__u32 size;
+	__u32 reserved2[1];
+	union {
+		__s32 value;
+		__s64 value64;
+		compat_caddr_t string; /* actually char * */
+	};
+} __attribute__ ((packed));
+
+/* The following function really belong in v4l2-common, but that causes
+   a circular dependency between modules. We need to think about this, but
+   for now this will do. */
+
+/* Return non-zero if this control is a pointer type. Currently only
+ * type STRING is a pointer type.
+ *
+ * Note that there are currently no controls of this type, but at least the
+ * compat32 code is in place to properly handle such controls. Please
+ * remove this note once the first pointer controls are added. */
+static inline int ctrl_is_pointer(u32 id)
+{
+	return 0;
+}
+
 static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
 {
-	struct v4l2_ext_control __user *ucontrols;
+	struct v4l2_ext_control32 __user *ucontrols;
 	struct v4l2_ext_control __user *kcontrols;
 	int n;
 	compat_caddr_t p;
@@ -626,15 +652,17 @@ static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
 	kcontrols = compat_alloc_user_space(n * sizeof(struct v4l2_ext_control));
 	kp->controls = kcontrols;
 	while (--n >= 0) {
-		if (copy_in_user(&kcontrols->id, &ucontrols->id, sizeof(__u32)))
-			return -EFAULT;
-		if (copy_in_user(&kcontrols->reserved2, &ucontrols->reserved2, sizeof(ucontrols->reserved2)))
-			return -EFAULT;
-		/* Note: if the void * part of the union ever becomes relevant
-		   then we need to know the type of the control in order to do
-		   the right thing here. Luckily, that is not yet an issue. */
-		if (copy_in_user(&kcontrols->value, &ucontrols->value, sizeof(ucontrols->value)))
+		if (copy_in_user(kcontrols, ucontrols, sizeof(*kcontrols)))
 			return -EFAULT;
+		if (ctrl_is_pointer(kcontrols->id)) {
+			void __user *s;
+
+			if (get_user(p, &ucontrols->string))
+				return -EFAULT;
+			s = compat_ptr(p);
+			if (put_user(s, &kcontrols->string))
+				return -EFAULT;
+		}
 		ucontrols++;
 		kcontrols++;
 	}
@@ -643,7 +671,7 @@ static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
 
 static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
 {
-	struct v4l2_ext_control __user *ucontrols;
+	struct v4l2_ext_control32 __user *ucontrols;
 	struct v4l2_ext_control __user *kcontrols = kp->controls;
 	int n = kp->count;
 	compat_caddr_t p;
@@ -664,15 +692,14 @@ static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
 		return -EFAULT;
 
 	while (--n >= 0) {
-		if (copy_in_user(&ucontrols->id, &kcontrols->id, sizeof(__u32)))
-			return -EFAULT;
-		if (copy_in_user(&ucontrols->reserved2, &kcontrols->reserved2,
-					sizeof(ucontrols->reserved2)))
-			return -EFAULT;
-		/* Note: if the void * part of the union ever becomes relevant
-		   then we need to know the type of the control in order to do
-		   the right thing here. Luckily, that is not yet an issue. */
-		if (copy_in_user(&ucontrols->value, &kcontrols->value, sizeof(ucontrols->value)))
+		unsigned size = sizeof(*ucontrols);
+
+		/* Do not modify the pointer when copying a pointer control.
+		   The contents of the pointer was changed, not the pointer
+		   itself. */
+		if (ctrl_is_pointer(kcontrols->id))
+			size -= sizeof(ucontrols->value64);
+		if (copy_in_user(ucontrols, kcontrols, size))
 			return -EFAULT;
 		ucontrols++;
 		kcontrols++;
diff --git a/drivers/media/video/v4l2-ioctl.c b/drivers/media/video/v4l2-ioctl.c
index c32e67608ad4..30cc3347ae52 100644
--- a/drivers/media/video/v4l2-ioctl.c
+++ b/drivers/media/video/v4l2-ioctl.c
@@ -513,11 +513,12 @@ static inline void v4l_print_ext_ctrls(unsigned int cmd,
 	dbgarg(cmd, "");
 	printk(KERN_CONT "class=0x%x", c->ctrl_class);
 	for (i = 0; i < c->count; i++) {
-		if (show_vals)
+		if (show_vals && !c->controls[i].size)
 			printk(KERN_CONT " id/val=0x%x/0x%x",
 				c->controls[i].id, c->controls[i].value);
 		else
-			printk(KERN_CONT " id=0x%x", c->controls[i].id);
+			printk(KERN_CONT " id=0x%x,size=%u",
+				c->controls[i].id, c->controls[i].size);
 	}
 	printk(KERN_CONT "\n");
 };
@@ -528,10 +529,9 @@ static inline int check_ext_ctrls(struct v4l2_ext_controls *c, int allow_priv)
 
 	/* zero the reserved fields */
 	c->reserved[0] = c->reserved[1] = 0;
-	for (i = 0; i < c->count; i++) {
+	for (i = 0; i < c->count; i++)
 		c->controls[i].reserved2[0] = 0;
-		c->controls[i].reserved2[1] = 0;
-	}
+
 	/* V4L2_CID_PRIVATE_BASE cannot be used as control class
 	   when using extended controls.
 	   Only when passed in through VIDIOC_G_CTRL and VIDIOC_S_CTRL
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 293ba181f95a..9ab4cbe676fd 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -167,6 +167,7 @@ enum v4l2_ctrl_type {
 	V4L2_CTRL_TYPE_BUTTON	     = 4,
 	V4L2_CTRL_TYPE_INTEGER64     = 5,
 	V4L2_CTRL_TYPE_CTRL_CLASS    = 6,
+	V4L2_CTRL_TYPE_STRING        = 7,
 };
 
 enum v4l2_tuner_type {
@@ -795,11 +796,12 @@ struct v4l2_control {
 
 struct v4l2_ext_control {
 	__u32 id;
-	__u32 reserved2[2];
+	__u32 size;
+	__u32 reserved2[1];
 	union {
 		__s32 value;
 		__s64 value64;
-		void *reserved;
+		char *string;
 	};
 } __attribute__ ((packed));
 
-- 
cgit v1.2.3-71-gd317


From 6b4249413abb634d9b2ff44c685da744f02e49d1 Mon Sep 17 00:00:00 2001
From: Eduardo Valentin <eduardo.valentin@nokia.com>
Date: Sat, 8 Aug 2009 08:38:58 -0300
Subject: V4L/DVB (12548): v4l2: video device: Add V4L2_CTRL_CLASS_FM_TX
 controls

This patch adds a new class of extended controls. This class
is intended to support FM Radio Modulators properties such as:
rds, audio limiters, audio compression, pilot tone generation,
tuning power levels and preemphasis properties.

Signed-off-by: Eduardo Valentin <eduardo.valentin@nokia.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 9ab4cbe676fd..5758066cb57b 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -817,6 +817,7 @@ struct v4l2_ext_controls {
 #define V4L2_CTRL_CLASS_USER 0x00980000	/* Old-style 'user' controls */
 #define V4L2_CTRL_CLASS_MPEG 0x00990000	/* MPEG-compression controls */
 #define V4L2_CTRL_CLASS_CAMERA 0x009a0000	/* Camera class controls */
+#define V4L2_CTRL_CLASS_FM_TX 0x009b0000	/* FM Modulator control class */
 
 #define V4L2_CTRL_ID_MASK      	  (0x0fffffff)
 #define V4L2_CTRL_ID2CLASS(id)    ((id) & 0x0fff0000UL)
@@ -1156,6 +1157,39 @@ enum  v4l2_exposure_auto_type {
 
 #define V4L2_CID_PRIVACY			(V4L2_CID_CAMERA_CLASS_BASE+16)
 
+/* FM Modulator class control IDs */
+#define V4L2_CID_FM_TX_CLASS_BASE		(V4L2_CTRL_CLASS_FM_TX | 0x900)
+#define V4L2_CID_FM_TX_CLASS			(V4L2_CTRL_CLASS_FM_TX | 1)
+
+#define V4L2_CID_RDS_TX_DEVIATION		(V4L2_CID_FM_TX_CLASS_BASE + 1)
+#define V4L2_CID_RDS_TX_PI			(V4L2_CID_FM_TX_CLASS_BASE + 2)
+#define V4L2_CID_RDS_TX_PTY			(V4L2_CID_FM_TX_CLASS_BASE + 3)
+#define V4L2_CID_RDS_TX_PS_NAME			(V4L2_CID_FM_TX_CLASS_BASE + 5)
+#define V4L2_CID_RDS_TX_RADIO_TEXT		(V4L2_CID_FM_TX_CLASS_BASE + 6)
+
+#define V4L2_CID_AUDIO_LIMITER_ENABLED		(V4L2_CID_FM_TX_CLASS_BASE + 64)
+#define V4L2_CID_AUDIO_LIMITER_RELEASE_TIME	(V4L2_CID_FM_TX_CLASS_BASE + 65)
+#define V4L2_CID_AUDIO_LIMITER_DEVIATION	(V4L2_CID_FM_TX_CLASS_BASE + 66)
+
+#define V4L2_CID_AUDIO_COMPRESSION_ENABLED	(V4L2_CID_FM_TX_CLASS_BASE + 80)
+#define V4L2_CID_AUDIO_COMPRESSION_GAIN		(V4L2_CID_FM_TX_CLASS_BASE + 81)
+#define V4L2_CID_AUDIO_COMPRESSION_THRESHOLD	(V4L2_CID_FM_TX_CLASS_BASE + 82)
+#define V4L2_CID_AUDIO_COMPRESSION_ATTACK_TIME	(V4L2_CID_FM_TX_CLASS_BASE + 83)
+#define V4L2_CID_AUDIO_COMPRESSION_RELEASE_TIME	(V4L2_CID_FM_TX_CLASS_BASE + 84)
+
+#define V4L2_CID_PILOT_TONE_ENABLED		(V4L2_CID_FM_TX_CLASS_BASE + 96)
+#define V4L2_CID_PILOT_TONE_DEVIATION		(V4L2_CID_FM_TX_CLASS_BASE + 97)
+#define V4L2_CID_PILOT_TONE_FREQUENCY		(V4L2_CID_FM_TX_CLASS_BASE + 98)
+
+#define V4L2_CID_TUNE_PREEMPHASIS		(V4L2_CID_FM_TX_CLASS_BASE + 112)
+enum v4l2_preemphasis {
+	V4L2_PREEMPHASIS_DISABLED	= 0,
+	V4L2_PREEMPHASIS_50_uS		= 1,
+	V4L2_PREEMPHASIS_75_uS		= 2,
+};
+#define V4L2_CID_TUNE_POWER_LEVEL		(V4L2_CID_FM_TX_CLASS_BASE + 113)
+#define V4L2_CID_TUNE_ANTENNA_CAPACITOR		(V4L2_CID_FM_TX_CLASS_BASE + 114)
+
 /*
  *	T U N I N G
  */
-- 
cgit v1.2.3-71-gd317


From 1e3c1f7695a446742e24ea1f0c80715be98edbc4 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Mon, 31 Aug 2009 11:32:46 -0300
Subject: V4L/DVB (12625): Add new V4L2_FMT_FLAG_EMULATED flag to videodev2.h

V4L2_FMT_FLAG_EMULATED	0x0002	This format is not native to the device but
emulated through software (usually libv4l2), where possible try to use a
native format instead for better performance.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 5758066cb57b..3689d7d81fe9 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -375,6 +375,7 @@ struct v4l2_fmtdesc {
 };
 
 #define V4L2_FMT_FLAG_COMPRESSED 0x0001
+#define V4L2_FMT_FLAG_EMULATED   0x0002
 
 #if 1
 	/* Experimental Frame Size and frame rate enumeration */
-- 
cgit v1.2.3-71-gd317


From 16bb8eb1b73bf940d30ff88cae622bfcd3790f61 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 12 Sep 2009 19:04:54 -0400
Subject: tracing: allow filter predicates to handle ksym symbols

This patch increases the max string used by predicates to
handle KSYM_SYMBOL_LEN.

Also moves an include to look nicer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0608b0ff2635..bd099ba82ccc 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -1,8 +1,8 @@
 #ifndef _LINUX_FTRACE_EVENT_H
 #define _LINUX_FTRACE_EVENT_H
 
-#include <linux/trace_seq.h>
 #include <linux/ring_buffer.h>
+#include <linux/trace_seq.h>
 #include <linux/percpu.h>
 
 struct trace_array;
@@ -135,7 +135,7 @@ struct ftrace_event_call {
 };
 
 #define MAX_FILTER_PRED		32
-#define MAX_FILTER_STR_VAL	128
+#define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
 extern void destroy_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
-- 
cgit v1.2.3-71-gd317


From f300baba5a1536070d6d77bf0c8c4ca999bb4f0f Mon Sep 17 00:00:00 2001
From: Alexandros Batsakis <batsakis@netapp.com>
Date: Thu, 10 Sep 2009 17:33:30 +0300
Subject: nfsd41: sunrpc: add new xprt class for nfsv4.1 backchannel

[sunrpc: change idle timeout value for the backchannel]
Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/sunrpc/clnt.h     |  1 +
 include/linux/sunrpc/xprt.h     | 18 ++++++++
 include/linux/sunrpc/xprtrdma.h |  5 ---
 include/linux/sunrpc/xprtsock.h | 11 -----
 net/sunrpc/clnt.c               |  1 +
 net/sunrpc/xprtsock.c           | 96 ++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 114 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 3d025588e56e..8ed9642a5a76 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -114,6 +114,7 @@ struct rpc_create_args {
 	rpc_authflavor_t	authflavor;
 	unsigned long		flags;
 	char			*client_name;
+	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
 };
 
 /* Values for "flags" field */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 228d694dbb90..6f9457a75b8f 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -124,6 +124,23 @@ struct rpc_xprt_ops {
 	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
 };
 
+/*
+ * RPC transport identifiers
+ *
+ * To preserve compatibility with the historical use of raw IP protocol
+ * id's for transport selection, UDP and TCP identifiers are specified
+ * with the previous values. No such restriction exists for new transports,
+ * except that they may not collide with these values (17 and 6,
+ * respectively).
+ */
+#define XPRT_TRANSPORT_BC       (1 << 31)
+enum xprt_transports {
+	XPRT_TRANSPORT_UDP	= IPPROTO_UDP,
+	XPRT_TRANSPORT_TCP	= IPPROTO_TCP,
+	XPRT_TRANSPORT_BC_TCP	= IPPROTO_TCP | XPRT_TRANSPORT_BC,
+	XPRT_TRANSPORT_RDMA	= 256
+};
+
 struct rpc_xprt {
 	struct kref		kref;		/* Reference count */
 	struct rpc_xprt_ops *	ops;		/* transport methods */
@@ -232,6 +249,7 @@ struct xprt_create {
 	struct sockaddr *	srcaddr;	/* optional local address */
 	struct sockaddr *	dstaddr;	/* remote peer address */
 	size_t			addrlen;
+	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
 };
 
 struct xprt_class {
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 54a379c9e8eb..c2f04e1ae159 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -40,11 +40,6 @@
 #ifndef _LINUX_SUNRPC_XPRTRDMA_H
 #define _LINUX_SUNRPC_XPRTRDMA_H
 
-/*
- * RPC transport identifier for RDMA
- */
-#define XPRT_TRANSPORT_RDMA	256
-
 /*
  * rpcbind (v3+) RDMA netid.
  */
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index c2a46c45c8f7..3f14a02e9cc0 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -12,17 +12,6 @@
 int		init_socket_xprt(void);
 void		cleanup_socket_xprt(void);
 
-/*
- * RPC transport identifiers for UDP, TCP
- *
- * To preserve compatibility with the historical use of raw IP protocol
- * id's for transport selection, these are specified with the previous
- * values. No such restriction exists for new transports, except that
- * they may not collide with these values (17 and 6, respectively).
- */
-#define XPRT_TRANSPORT_UDP	IPPROTO_UDP
-#define XPRT_TRANSPORT_TCP	IPPROTO_TCP
-
 /*
  * RPC slot table sizes for UDP, TCP transports
  */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c1e467e1b07d..7389804e3bb7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -288,6 +288,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 		.srcaddr = args->saddress,
 		.dstaddr = args->address,
 		.addrlen = args->addrsize,
+		.bc_xprt = args->bc_xprt,
 	};
 	char servername[48];
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d9a2b815714e..bee415465754 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2468,11 +2468,93 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 	return ERR_PTR(-EINVAL);
 }
 
+/**
+ * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket
+ * @args: rpc transport creation arguments
+ *
+ */
+static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
+{
+	struct sockaddr *addr = args->dstaddr;
+	struct rpc_xprt *xprt;
+	struct sock_xprt *transport;
+	struct svc_sock *bc_sock;
+
+	if (!args->bc_xprt)
+		ERR_PTR(-EINVAL);
+
+	xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries);
+	if (IS_ERR(xprt))
+		return xprt;
+	transport = container_of(xprt, struct sock_xprt, xprt);
+
+	xprt->prot = IPPROTO_TCP;
+	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+	xprt->timeout = &xs_tcp_default_timeout;
+
+	/* backchannel */
+	xprt_set_bound(xprt);
+	xprt->bind_timeout = 0;
+	xprt->connect_timeout = 0;
+	xprt->reestablish_timeout = 0;
+	xprt->idle_timeout = 0;
+
+	/*
+	 * The backchannel uses the same socket connection as the
+	 * forechannel
+	 */
+	xprt->bc_xprt = args->bc_xprt;
+	bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt);
+	bc_sock->sk_bc_xprt = xprt;
+	transport->sock = bc_sock->sk_sock;
+	transport->inet = bc_sock->sk_sk;
+
+	xprt->ops = &bc_tcp_ops;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		xs_format_peer_addresses(xprt, "tcp",
+					 RPCBIND_NETID_TCP);
+		break;
+	case AF_INET6:
+		xs_format_peer_addresses(xprt, "tcp",
+				   RPCBIND_NETID_TCP6);
+		break;
+	default:
+		kfree(xprt);
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
+	if (xprt_bound(xprt))
+		dprintk("RPC:       set up xprt to %s (port %s) via %s\n",
+				xprt->address_strings[RPC_DISPLAY_ADDR],
+				xprt->address_strings[RPC_DISPLAY_PORT],
+				xprt->address_strings[RPC_DISPLAY_PROTO]);
+	else
+		dprintk("RPC:       set up xprt to %s (autobind) via %s\n",
+				xprt->address_strings[RPC_DISPLAY_ADDR],
+				xprt->address_strings[RPC_DISPLAY_PROTO]);
+
+	/*
+	 * Since we don't want connections for the backchannel, we set
+	 * the xprt status to connected
+	 */
+	xprt_set_connected(xprt);
+
+
+	if (try_module_get(THIS_MODULE))
+		return xprt;
+	kfree(xprt->slot);
+	kfree(xprt);
+	return ERR_PTR(-EINVAL);
+}
+
 static struct xprt_class	xs_udp_transport = {
 	.list		= LIST_HEAD_INIT(xs_udp_transport.list),
 	.name		= "udp",
 	.owner		= THIS_MODULE,
-	.ident		= IPPROTO_UDP,
+	.ident		= XPRT_TRANSPORT_UDP,
 	.setup		= xs_setup_udp,
 };
 
@@ -2480,10 +2562,18 @@ static struct xprt_class	xs_tcp_transport = {
 	.list		= LIST_HEAD_INIT(xs_tcp_transport.list),
 	.name		= "tcp",
 	.owner		= THIS_MODULE,
-	.ident		= IPPROTO_TCP,
+	.ident		= XPRT_TRANSPORT_TCP,
 	.setup		= xs_setup_tcp,
 };
 
+static struct xprt_class	xs_bc_tcp_transport = {
+	.list		= LIST_HEAD_INIT(xs_bc_tcp_transport.list),
+	.name		= "tcp NFSv4.1 backchannel",
+	.owner		= THIS_MODULE,
+	.ident		= XPRT_TRANSPORT_BC_TCP,
+	.setup		= xs_setup_bc_tcp,
+};
+
 /**
  * init_socket_xprt - set up xprtsock's sysctls, register with RPC client
  *
@@ -2497,6 +2587,7 @@ int init_socket_xprt(void)
 
 	xprt_register_transport(&xs_udp_transport);
 	xprt_register_transport(&xs_tcp_transport);
+	xprt_register_transport(&xs_bc_tcp_transport);
 
 	return 0;
 }
@@ -2516,6 +2607,7 @@ void cleanup_socket_xprt(void)
 
 	xprt_unregister_transport(&xs_udp_transport);
 	xprt_unregister_transport(&xs_tcp_transport);
+	xprt_unregister_transport(&xs_bc_tcp_transport);
 }
 
 static int param_set_uint_minmax(const char *val, struct kernel_param *kp,
-- 
cgit v1.2.3-71-gd317


From ed868a56988464cd31de0302426a5e94d3127f10 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Sat, 12 Sep 2009 22:54:10 -0400
Subject: Creds: creds->security can be NULL is selinux is disabled

__validate_process_creds should check if selinux is actually enabled before
running tests on the selinux portion of the credentials struct.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/cred.h       | 13 ++++++++-----
 include/linux/selinux.h    |  9 +++++++++
 security/selinux/exports.c |  6 ++++++
 3 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 24520a539c6f..fb371601a3b4 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/key.h>
+#include <linux/selinux.h>
 #include <asm/atomic.h>
 
 struct user_struct;
@@ -182,11 +183,13 @@ static inline bool creds_are_invalid(const struct cred *cred)
 	if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
 		return true;
 #ifdef CONFIG_SECURITY_SELINUX
-	if ((unsigned long) cred->security < PAGE_SIZE)
-		return true;
-	if ((*(u32*)cred->security & 0xffffff00) ==
-	    (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
-		return true;
+	if (selinux_is_enabled()) {
+		if ((unsigned long) cred->security < PAGE_SIZE)
+			return true;
+		if ((*(u32 *)cred->security & 0xffffff00) ==
+		    (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
+			return true;
+	}
 #endif
 	return false;
 }
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 20f965d4b041..223d06a6feb1 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -61,6 +61,11 @@ void selinux_secmark_refcount_inc(void);
  *     existing SECMARK targets has been removed/flushed.
  */
 void selinux_secmark_refcount_dec(void);
+
+/**
+ * selinux_is_enabled - is SELinux enabled?
+ */
+bool selinux_is_enabled(void);
 #else
 
 static inline int selinux_string_to_sid(const char *str, u32 *sid)
@@ -84,6 +89,10 @@ static inline void selinux_secmark_refcount_dec(void)
 	return;
 }
 
+static bool selinux_is_enabled(void)
+{
+	return false;
+}
 #endif	/* CONFIG_SECURITY_SELINUX */
 
 #endif /* _LINUX_SELINUX_H */
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index c73aeaa008e8..c0a454aee1e0 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -63,3 +63,9 @@ void selinux_secmark_refcount_dec(void)
 	atomic_dec(&selinux_secmark_refcount);
 }
 EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
+
+bool selinux_is_enabled(void)
+{
+	return selinux_enabled;
+}
+EXPORT_SYMBOL_GPL(selinux_is_enabled);
-- 
cgit v1.2.3-71-gd317


From a9327cac440be4d8333bba975cbbf76045096275 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Fri, 11 Sep 2009 09:18:54 +0200
Subject: Seperate read and write statistics of in_flight requests

Currently, there is a single in_flight counter measuring the number of
requests in the request_queue. But some monitoring tools would like to
know how many read requests and write requests are in progress. Split the
current in_flight counter into two seperate counters for read and write.

This information is exported as a sysfs attribute, as changing the
currently available stat files would break the existing tools.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c      |  6 +++---
 block/blk-merge.c     |  2 +-
 block/genhd.c         |  4 +++-
 drivers/md/dm.c       | 16 ++++++++++------
 fs/partitions/check.c | 12 +++++++++++-
 include/linux/genhd.h | 21 ++++++++++++++-------
 6 files changed, 42 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 982d634e67f9..182ebe3eb9e0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		part_stat_inc(cpu, part, merges[rw]);
 	else {
 		part_round_stats(cpu, part);
-		part_inc_in_flight(part);
+		part_inc_in_flight(part, rw);
 	}
 
 	part_stat_unlock();
@@ -1030,7 +1030,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
 
 	if (part->in_flight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part->in_flight * (now - part->stamp));
+				part_in_flight(part) * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
@@ -1737,7 +1737,7 @@ static void blk_account_io_done(struct request *req)
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rw);
 
 		part_stat_unlock();
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b0de8574fdc8..99cb5cf1f447 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rq_data_dir(req));
 
 		part_stat_unlock();
 	}
diff --git a/block/genhd.c b/block/genhd.c
index b89328eceee2..5b76bf55d05c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, merges[1]),
 			   (unsigned long long)part_stat_read(hd, sectors[1]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
-			   hd->in_flight,
+			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
 			);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ec012f030240..eee28fac210c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -130,7 +130,7 @@ struct mapped_device {
 	/*
 	 * A list of ios that arrived while we were suspended.
 	 */
-	atomic_t pending;
+	atomic_t pending[2];
 	wait_queue_head_t wait;
 	struct work_struct work;
 	struct bio_list deferred;
@@ -453,13 +453,14 @@ static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
 	int cpu;
+	int rw = bio_data_dir(io->bio);
 
 	io->start_time = jiffies;
 
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
+	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -479,8 +480,9 @@ static void end_io_acct(struct dm_io *io)
 	 * After this is decremented the bio must not be touched if it is
 	 * a barrier.
 	 */
-	dm_disk(md)->part0.in_flight = pending =
-		atomic_dec_return(&md->pending);
+	dm_disk(md)->part0.in_flight[rw] = pending =
+		atomic_dec_return(&md->pending[rw]);
+	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
 	if (!pending)
@@ -1785,7 +1787,8 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->disk)
 		goto bad_disk;
 
-	atomic_set(&md->pending, 0);
+	atomic_set(&md->pending[0], 0);
+	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
@@ -2088,7 +2091,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 				break;
 			}
 			spin_unlock_irqrestore(q->queue_lock, flags);
-		} else if (!atomic_read(&md->pending))
+		} else if (!atomic_read(&md->pending[0]) &&
+					!atomic_read(&md->pending[1]))
 			break;
 
 		if (interruptible == TASK_INTERRUPTIBLE &&
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..619ba99dfe39 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		p->in_flight,
+		part_in_flight(p),
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
+ssize_t part_inflight_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 45fc320a53c6..44263cb27121 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -98,7 +98,7 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	int in_flight;
+	int in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats *dkstats;
 #else
@@ -322,18 +322,23 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_sub(cpu, gendiskp, field, subnd)			\
 	part_stat_add(cpu, gendiskp, field, -subnd)
 
-static inline void part_inc_in_flight(struct hd_struct *part)
+static inline void part_inc_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight++;
+	part->in_flight[rw]++;
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight++;
+		part_to_disk(part)->part0.in_flight[rw]++;
 }
 
-static inline void part_dec_in_flight(struct hd_struct *part)
+static inline void part_dec_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight--;
+	part->in_flight[rw]--;
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight--;
+		part_to_disk(part)->part0.in_flight[rw]--;
+}
+
+static inline int part_in_flight(struct hd_struct *part)
+{
+	return part->in_flight[0] + part->in_flight[1];
 }
 
 /* block/blk-core.c */
@@ -546,6 +551,8 @@ extern ssize_t part_size_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
 extern ssize_t part_stat_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
+extern ssize_t part_inflight_show(struct device *dev,
+			      struct device_attribute *attr, char *buf);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 extern ssize_t part_fail_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
-- 
cgit v1.2.3-71-gd317


From 3c5820c743479285ce2678fd3c12b1fd39fe998f Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 11 Sep 2009 21:54:52 +0200
Subject: block: Optimal I/O limit wrapper

Implement blk_limits_io_opt() and make blk_queue_io_opt() a wrapper
around it. DM needs this to avoid poking at the queue_limits directly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 21 ++++++++++++++++++++-
 drivers/md/dm-stripe.c |  2 +-
 include/linux/blkdev.h |  1 +
 3 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 476d87065073..83413ff83739 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -427,6 +427,25 @@ void blk_queue_io_min(struct request_queue *q, unsigned int min)
 }
 EXPORT_SYMBOL(blk_queue_io_min);
 
+/**
+ * blk_limits_io_opt - set optimal request size for a device
+ * @limits: the queue limits
+ * @opt:  smallest I/O size in bytes
+ *
+ * Description:
+ *   Storage devices may report an optimal I/O size, which is the
+ *   device's preferred unit for sustained I/O.  This is rarely reported
+ *   for disk drives.  For RAID arrays it is usually the stripe width or
+ *   the internal track size.  A properly aligned multiple of
+ *   optimal_io_size is the preferred request size for workloads where
+ *   sustained throughput is desired.
+ */
+void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
+{
+	limits->io_opt = opt;
+}
+EXPORT_SYMBOL(blk_limits_io_opt);
+
 /**
  * blk_queue_io_opt - set optimal request size for the queue
  * @q:	the request queue for the device
@@ -442,7 +461,7 @@ EXPORT_SYMBOL(blk_queue_io_min);
  */
 void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
 {
-	q->limits.io_opt = opt;
+	blk_limits_io_opt(&q->limits, opt);
 }
 EXPORT_SYMBOL(blk_queue_io_opt);
 
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index fde658ccbcec..e0efc1adcaff 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -336,7 +336,7 @@ static void stripe_io_hints(struct dm_target *ti,
 	unsigned chunk_size = (sc->chunk_mask + 1) << 9;
 
 	blk_limits_io_min(limits, chunk_size);
-	limits->io_opt = chunk_size * sc->stripes;
+	blk_limits_io_opt(limits, chunk_size * sc->stripes);
 }
 
 static struct target_type stripe_target = {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8bf1a10e4d88..86c2bdff3b89 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -935,6 +935,7 @@ extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
+extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
 extern void blk_set_default_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
-- 
cgit v1.2.3-71-gd317


From 3d2257f157c2324acbc0fa0fa54e8626a987edd2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 12 Sep 2009 07:35:37 +0200
Subject: Make DISCARD_BARRIER and DISCARD_NOBARRIER writes instead of reads

The commands are conceptually writes, and in the case of IDE and SCSI
commands actually are writes.  They were only reads because we thought
that would interact better with the elevators.  Now the elevators know
about discard requests, that advantage no longer exists.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c1f993515f51..7f5a8ad25008 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -161,8 +161,8 @@ struct inodes_stat_t {
  * These aren't really reads or writes, they pass down information about
  * parts of device that are now unused by the file system.
  */
-#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
-#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
+#define DISCARD_NOBARRIER (WRITE | (1 << BIO_RW_DISCARD))
+#define DISCARD_BARRIER (DISCARD_NOBARRIER | (1 << BIO_RW_BARRIER))
 
 #define SEL_IN		1
 #define SEL_OUT		2
-- 
cgit v1.2.3-71-gd317


From 746cd1e7e4a555ddaee53b19a46e05c9c61eaf09 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Sep 2009 07:35:43 +0200
Subject: block: use blkdev_issue_discard in blk_ioctl_discard

blk_ioctl_discard duplicates large amounts of code from blkdev_issue_discard,
the only difference between the two is that blkdev_issue_discard needs to
send a barrier discard request and blk_ioctl_discard a non-barrier one,
and blk_ioctl_discard needs to wait on the request.  To facilitates this
add a flags argument to blkdev_issue_discard to control both aspects of the
behaviour.  This will be very useful later on for using the waiting
funcitonality for other callers.

Based on an earlier patch from Matthew Wilcox <matthew@wil.cx>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 31 +++++++++++++++++++------------
 block/ioctl.c          | 49 ++-----------------------------------------------
 fs/btrfs/extent-tree.c |  3 ++-
 fs/gfs2/rgrp.c         |  6 ++++--
 include/linux/blkdev.h |  9 ++++++---
 mm/swapfile.c          |  6 ++++--
 6 files changed, 37 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 30022b4e2f63..6593ab39cfe9 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
 
+	if (bio->bi_private)
+		complete(bio->bi_private);
+
 	bio_put(bio);
 }
 
@@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
  * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	DISCARD_FL_* flags to control behaviour
  *
  * Description:
- *    Issue a discard request for the sectors in question. Does not wait.
+ *    Issue a discard request for the sectors in question.
  */
-int blkdev_issue_discard(struct block_device *bdev,
-			 sector_t sector, sector_t nr_sects, gfp_t gfp_mask)
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, int flags)
 {
-	struct request_queue *q;
-	struct bio *bio;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = flags & DISCARD_FL_BARRIER ?
+		DISCARD_BARRIER : DISCARD_NOBARRIER;
 	int ret = 0;
 
-	if (bdev->bd_disk == NULL)
-		return -ENXIO;
-
-	q = bdev_get_queue(bdev);
 	if (!q)
 		return -ENXIO;
 
@@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev,
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		bio = bio_alloc(gfp_mask, 0);
+		struct bio *bio = bio_alloc(gfp_mask, 0);
 		if (!bio)
 			return -ENOMEM;
 
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
+		if (flags & DISCARD_FL_WAIT)
+			bio->bi_private = &wait;
 
 		bio->bi_sector = sector;
 
@@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev,
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
+
 		bio_get(bio);
-		submit_bio(DISCARD_BARRIER, bio);
+		submit_bio(type, bio);
+
+		if (flags & DISCARD_FL_WAIT)
+			wait_for_completion(&wait);
 
-		/* Check if it failed immediately */
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
 			ret = -EOPNOTSUPP;
 		else if (!bio_flagged(bio, BIO_UPTODATE))
diff --git a/block/ioctl.c b/block/ioctl.c
index 500e4c73cc52..d3e6b5827a34 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev)
 	return res;
 }
 
-static void blk_ioc_discard_endio(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-	complete(bio->bi_private);
-}
-
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len)
 {
-	struct request_queue *q = bdev_get_queue(bdev);
-	int ret = 0;
-
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
@@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
-
-	if (!q->prepare_discard_fn)
-		return -EOPNOTSUPP;
-
-	while (len && !ret) {
-		DECLARE_COMPLETION_ONSTACK(wait);
-		struct bio *bio;
-
-		bio = bio_alloc(GFP_KERNEL, 0);
-
-		bio->bi_end_io = blk_ioc_discard_endio;
-		bio->bi_bdev = bdev;
-		bio->bi_private = &wait;
-		bio->bi_sector = start;
-
-		if (len > queue_max_hw_sectors(q)) {
-			bio->bi_size = queue_max_hw_sectors(q) << 9;
-			len -= queue_max_hw_sectors(q);
-			start += queue_max_hw_sectors(q);
-		} else {
-			bio->bi_size = len << 9;
-			len = 0;
-		}
-		submit_bio(DISCARD_NOBARRIER, bio);
-
-		wait_for_completion(&wait);
-
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
-			ret = -EIO;
-		bio_put(bio);
-	}
-	return ret;
+	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
+				    DISCARD_FL_WAIT);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..535f85ba104f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+			     DISCARD_FL_BARRIER);
 }
 #endif
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..fbc43241f2ef 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 					goto start_new_extent;
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
-							    nr_sects, GFP_NOFS);
+							    nr_sects, GFP_NOFS,
+							    DISCARD_FL_BARRIER);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+					 DISCARD_FL_BARRIER);
 		if (rv)
 			goto fail;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 86c2bdff3b89..e23a86cae5ac 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -998,15 +998,18 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
-extern int blkdev_issue_discard(struct block_device *,
-				sector_t sector, sector_t nr_sects, gfp_t);
+#define DISCARD_FL_WAIT		0x01	/* wait for completion */
+#define DISCARD_FL_BARRIER	0x02	/* issue DISCARD_BARRIER request */
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+		sector_t nr_sects, gfp_t, int flags);
 
 static inline int sb_issue_discard(struct super_block *sb,
 				   sector_t block, sector_t nr_blocks)
 {
 	block <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL);
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
+				    DISCARD_FL_BARRIER);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d23c53..74f1102e8749 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
 		}
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-						nr_blocks, GFP_KERNEL);
+						nr_blocks, GFP_KERNEL,
+						DISCARD_FL_BARRIER);
 		if (err)
 			break;
 
@@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-							nr_blocks, GFP_NOIO))
+							nr_blocks, GFP_NOIO,
+							DISCARD_FL_BARRIER))
 				break;
 		}
 
-- 
cgit v1.2.3-71-gd317


From 7f53866932fd08add06ee2f93ead129949158490 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Thu, 10 Sep 2009 12:34:09 -0600
Subject: PCI Hotplug: convert acpi_pci_detect_ejectable() to take an
 acpi_handle

acpi_pci_detect_ejectable() goes through effort to convert its
struct pci_bus arg to an acpi_handle, but every time we use this
interface, we already have the handle available.

So let's just use the handle instead of converting back and forth.

Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Tested-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/acpi_pcihp.c   | 12 ++++++------
 drivers/pci/hotplug/acpiphp_glue.c | 33 +++++++++++----------------------
 drivers/pci/hotplug/pciehp_acpi.c  |  7 ++++---
 include/linux/pci_hotplug.h        |  2 +-
 4 files changed, 22 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index eb159587d0bf..ec3c039b7ebd 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -500,18 +500,18 @@ check_hotplug(acpi_handle handle, u32 lvl, void *context, void **rv)
 
 /**
  * acpi_pci_detect_ejectable - check if the PCI bus has ejectable slots
- * @pbus - PCI bus to scan
+ * @handle - handle of the PCI bus to scan
  *
  * Returns 1 if the PCI bus has ACPI based ejectable slots, 0 otherwise.
  */
-int acpi_pci_detect_ejectable(struct pci_bus *pbus)
+int acpi_pci_detect_ejectable(acpi_handle handle)
 {
-	acpi_handle handle;
 	int found = 0;
 
-	if (!(handle = acpi_pci_get_bridge_handle(pbus)))
-		return 0;
-	acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, (u32)1,
+	if (!handle)
+		return found;
+
+	acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, 1,
 			    check_hotplug, (void *)&found, NULL);
 	return found;
 }
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index e72e0adc0681..680c33635b6f 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -62,22 +62,6 @@ static void acpiphp_sanitize_bus(struct pci_bus *bus);
 static void acpiphp_set_hpp_values(acpi_handle handle, struct pci_bus *bus);
 static void handle_hotplug_event_func(acpi_handle handle, u32 type, void *context);
 
-static struct pci_bus *pci_bus_from_handle(acpi_handle handle)
-{
-	struct pci_bus *pbus;
-	struct acpi_pci_root *root;
-
-	root = acpi_pci_find_root(handle);
-	if (root)
-		pbus = root->bus;
-	else {
-		struct pci_dev *pdev = acpi_get_pci_dev(handle);
-		pbus = pdev->subordinate;
-		pci_dev_put(pdev);
-	}
-	return pbus;
-}
-
 /* callback routine to check for the existence of a pci dock device */
 static acpi_status
 is_pci_dock_device(acpi_handle handle, u32 lvl, void *context, void **rv)
@@ -279,11 +263,7 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv)
 /* see if it's worth looking at this bridge */
 static int detect_ejectable_slots(acpi_handle handle)
 {
-	int found;
-	struct pci_bus *pbus;
-
-	pbus = pci_bus_from_handle(handle);
-	found = acpi_pci_detect_ejectable(pbus);
+	int found = acpi_pci_detect_ejectable(handle);
 	if (!found) {
 		acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, (u32)1,
 				    is_pci_dock_device, (void *)&found, NULL);
@@ -1364,7 +1344,16 @@ static void acpiphp_sanitize_bus(struct pci_bus *bus)
 /* Program resources in newly inserted bridge */
 static int acpiphp_configure_bridge (acpi_handle handle)
 {
-	struct pci_bus *bus = pci_bus_from_handle(handle);
+	struct pci_bus *bus;
+
+	if (acpi_is_root_bridge(handle)) {
+		struct acpi_pci_root *root = acpi_pci_find_root(handle);
+		bus = root->bus;
+	} else {
+		struct pci_dev *pdev = acpi_get_pci_dev(handle);
+		bus = pdev->subordinate;
+		pci_dev_put(pdev);
+	}
 
 	pci_bus_size_bridges(bus);
 	pci_bus_assign_resources(bus);
diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c
index 96048010e7d9..7163e6a6cfae 100644
--- a/drivers/pci/hotplug/pciehp_acpi.c
+++ b/drivers/pci/hotplug/pciehp_acpi.c
@@ -47,7 +47,7 @@ int pciehp_acpi_slot_detection_check(struct pci_dev *dev)
 {
 	if (slot_detection_mode != PCIEHP_DETECT_ACPI)
 		return 0;
-	if (acpi_pci_detect_ejectable(dev->subordinate))
+	if (acpi_pci_detect_ejectable(DEVICE_ACPI_HANDLE(&dev->dev)))
 		return 0;
 	return -ENODEV;
 }
@@ -76,9 +76,9 @@ static int __init dummy_probe(struct pcie_device *dev)
 {
 	int pos;
 	u32 slot_cap;
+	acpi_handle handle;
 	struct slot *slot, *tmp;
 	struct pci_dev *pdev = dev->port;
-	struct pci_bus *pbus = pdev->subordinate;
 	/* Note: pciehp_detect_mode != PCIEHP_DETECT_ACPI here */
 	if (pciehp_get_hp_hw_control_from_firmware(pdev))
 		return -ENODEV;
@@ -94,7 +94,8 @@ static int __init dummy_probe(struct pcie_device *dev)
 			dup_slot_id++;
 	}
 	list_add_tail(&slot->slot_list, &dummy_slots);
-	if (!acpi_slot_detected && acpi_pci_detect_ejectable(pbus))
+	handle = DEVICE_ACPI_HANDLE(&pdev->dev);
+	if (!acpi_slot_detected && acpi_pci_detect_ejectable(handle))
 		acpi_slot_detected = 1;
 	return -ENODEV;         /* dummy driver always returns error */
 }
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 1b00cc3177fc..f0c31ae3f842 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -231,7 +231,7 @@ extern acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
 				struct hotplug_params *hpp);
 int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags);
 int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle);
-int acpi_pci_detect_ejectable(struct pci_bus *pbus);
+int acpi_pci_detect_ejectable(acpi_handle handle);
 #endif
 #endif
 
-- 
cgit v1.2.3-71-gd317


From 9d62ec6ca71d71c8a0d2cb1004f476d33f668955 Mon Sep 17 00:00:00 2001
From: Albin Tonnerre <albin.tonnerre@free-electrons.com>
Date: Wed, 5 Aug 2009 23:59:59 +0200
Subject: PM: Add convenience macro to make switching to dev_pm_ops less
 error-prone

In a number of cases, the .suspend, .freeze, .poweroff and .resume,
.thaw, .restore functions are identical. However, they all need to be
assigned to avoid regressionsm as the previous code called .suspend
resp. .resume in all those cases. SIMPLE_DEV_PM_OPS helps to deal
with this case.

[rjw: Changed the name of the macro and added the comment explaining its
 purpose.]

Signed-off-by: Albin Tonnerre <albin.tonnerre@free-electrons.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/pm.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 2b6e20df0e52..3b7e04b95bd2 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -213,6 +213,20 @@ struct dev_pm_ops {
 	int (*runtime_idle)(struct device *dev);
 };
 
+/*
+ * Use this if you want to use the same suspend and resume callbacks for suspend
+ * to RAM and hibernation.
+ */
+#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
+struct dev_pm_ops name = { \
+	.suspend = suspend_fn, \
+	.resume = resume_fn, \
+	.freeze = suspend_fn, \
+	.thaw = resume_fn, \
+	.poweroff = suspend_fn, \
+	.restore = resume_fn, \
+}
+
 /**
  * PM_EVENT_ messages
  *
-- 
cgit v1.2.3-71-gd317


From 6a29172ba90e49c046245610caff9848307bfd6a Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Mon, 14 Sep 2009 16:35:15 -0600
Subject: PCI hotplug: clean up acpi_get_hp_params_from_firmware() interface

This patch makes acpi_get_hp_params_from_firmware() take a
pci_dev rather than a pci_bus and makes it return a standard
int errno rather than acpi_status.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/acpi_pcihp.c   | 14 +++++++-------
 drivers/pci/hotplug/acpiphp_glue.c |  7 +++----
 drivers/pci/hotplug/pciehp.h       |  4 +---
 drivers/pci/hotplug/shpchp.h       |  4 +---
 include/linux/pci_hotplug.h        |  2 +-
 5 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index ec3c039b7ebd..14d2d8a01f68 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -324,18 +324,18 @@ static acpi_status acpi_run_oshp(acpi_handle handle)
 
 /* acpi_get_hp_params_from_firmware
  *
- * @bus - the pci_bus of the bus on which the device is newly added
+ * @dev - the pci_dev for which we want parameters
  * @hpp - allocated by the caller
  */
-acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
+int acpi_get_hp_params_from_firmware(struct pci_dev *dev,
 		struct hotplug_params *hpp)
 {
-	acpi_status status = AE_NOT_FOUND;
+	acpi_status status;
 	acpi_handle handle, phandle;
 	struct pci_bus *pbus;
 
 	handle = NULL;
-	for (pbus = bus; pbus; pbus = pbus->parent) {
+	for (pbus = dev->bus; pbus; pbus = pbus->parent) {
 		handle = acpi_pci_get_bridge_handle(pbus);
 		if (handle)
 			break;
@@ -350,10 +350,10 @@ acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
 	while (handle) {
 		status = acpi_run_hpx(handle, hpp);
 		if (ACPI_SUCCESS(status))
-			break;
+			return 0;
 		status = acpi_run_hpp(handle, hpp);
 		if (ACPI_SUCCESS(status))
-			break;
+			return 0;
 		if (acpi_is_root_bridge(handle))
 			break;
 		status = acpi_get_parent(handle, &phandle);
@@ -361,7 +361,7 @@ acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
 			break;
 		handle = phandle;
 	}
-	return status;
+	return -ENODEV;
 }
 EXPORT_SYMBOL_GPL(acpi_get_hp_params_from_firmware);
 
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index db81c08de8d5..14e6f1a17f28 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -275,11 +275,10 @@ static int detect_ejectable_slots(acpi_handle handle)
 /* decode ACPI 2.0 _HPP hot plug parameters */
 static void decode_hpp(struct pci_dev *dev, struct hotplug_params *hpp)
 {
-	acpi_status status;
+	int ret;
 
-	status = acpi_get_hp_params_from_firmware(dev->bus, hpp);
-	if (ACPI_FAILURE(status) ||
-	    !hpp->t0 || (hpp->t0->revision > 1)) {
+	ret = acpi_get_hp_params_from_firmware(dev, hpp);
+	if (ret || !hpp->t0 || (hpp->t0->revision > 1)) {
 		/* use default numbers */
 		printk(KERN_WARNING
 		       "%s: Could not get hotplug parameters. Use defaults\n",
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index e6cf096498be..86cdfd71ed27 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -241,9 +241,7 @@ static inline int pciehp_get_hp_hw_control_from_firmware(struct pci_dev *dev)
 static inline int pciehp_get_hp_params_from_firmware(struct pci_dev *dev,
 			struct hotplug_params *hpp)
 {
-	if (ACPI_FAILURE(acpi_get_hp_params_from_firmware(dev->bus, hpp)))
-		return -ENODEV;
-	return 0;
+	return acpi_get_hp_params_from_firmware(dev, hpp);
 }
 #else
 #define pciehp_firmware_init()				do {} while (0)
diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h
index 974e924ca96d..ad6a255cf0a5 100644
--- a/drivers/pci/hotplug/shpchp.h
+++ b/drivers/pci/hotplug/shpchp.h
@@ -191,9 +191,7 @@ static inline const char *slot_name(struct slot *slot)
 static inline int get_hp_params_from_firmware(struct pci_dev *dev,
 					      struct hotplug_params *hpp)
 {
-	if (ACPI_FAILURE(acpi_get_hp_params_from_firmware(dev->bus, hpp)))
-			return -ENODEV;
-	return 0;
+	return acpi_get_hp_params_from_firmware(dev, hpp);
 }
 
 static inline int get_hp_hw_control_from_firmware(struct pci_dev *dev)
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index f0c31ae3f842..0a7c2401d945 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -227,7 +227,7 @@ struct hotplug_params {
 #ifdef CONFIG_ACPI
 #include <acpi/acpi.h>
 #include <acpi/acpi_bus.h>
-extern acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
+int acpi_get_hp_params_from_firmware(struct pci_dev *dev,
 				struct hotplug_params *hpp);
 int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags);
 int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle);
-- 
cgit v1.2.3-71-gd317


From 8838400db5193c37588813c2eb1249b821781950 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Mon, 14 Sep 2009 16:35:20 -0600
Subject: PCI hotplug: add pci_configure_slot()

This patch adds a new pci_configure_slot() function that programs the
PCI bus characteristics for a newly-added device.  This is based on
code in pciehp_pci.c, but should be generic enough to be used by pciehp,
shpchp, and acpiphp.

The hotplug_params struct and the program_hpp_typeX() functions are based
on the ACPI definitions, but they aren't really ACPI-specific, and there's
no alternate implementation, so I don't see the need to abstract them yet.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Reviewed-by: Alex Chiang <achiang@hp.com>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/Makefile     |   2 +-
 drivers/pci/hotplug/pcihp_slot.c | 187 +++++++++++++++++++++++++++++++++++++++
 include/linux/pci_hotplug.h      |   9 ++
 3 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 drivers/pci/hotplug/pcihp_slot.c

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/Makefile b/drivers/pci/hotplug/Makefile
index 2aa117c8cd87..3625b094bf7e 100644
--- a/drivers/pci/hotplug/Makefile
+++ b/drivers/pci/hotplug/Makefile
@@ -22,7 +22,7 @@ obj-$(CONFIG_HOTPLUG_PCI_SGI)		+= sgi_hotplug.o
 # Link this last so it doesn't claim devices that have a real hotplug driver
 obj-$(CONFIG_HOTPLUG_PCI_FAKE)		+= fakephp.o
 
-pci_hotplug-objs	:=	pci_hotplug_core.o
+pci_hotplug-objs	:=	pci_hotplug_core.o pcihp_slot.o
 
 ifdef CONFIG_HOTPLUG_PCI_CPCI
 pci_hotplug-objs	+=	cpci_hotplug_core.o	\
diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c
new file mode 100644
index 000000000000..cc8ec3aa41a7
--- /dev/null
+++ b/drivers/pci/hotplug/pcihp_slot.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 1995,2001 Compaq Computer Corporation
+ * Copyright (C) 2001 Greg Kroah-Hartman (greg@kroah.com)
+ * Copyright (C) 2001 IBM Corp.
+ * Copyright (C) 2003-2004 Intel Corporation
+ * (c) Copyright 2009 Hewlett-Packard Development Company, L.P.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/pci.h>
+#include <linux/pci_hotplug.h>
+
+static struct hpp_type0 pci_default_type0 = {
+	.revision = 1,
+	.cache_line_size = 8,
+	.latency_timer = 0x40,
+	.enable_serr = 0,
+	.enable_perr = 0,
+};
+
+static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp)
+{
+	u16 pci_cmd, pci_bctl;
+
+	if (!hpp) {
+		/*
+		 * Perhaps we *should* use default settings for PCIe, but
+		 * pciehp didn't, so we won't either.
+		 */
+		if (dev->is_pcie)
+			return;
+		dev_info(&dev->dev, "using default PCI settings\n");
+		hpp = &pci_default_type0;
+	}
+
+	if (hpp->revision > 1) {
+		dev_warn(&dev->dev,
+			 "PCI settings rev %d not supported; using defaults\n",
+			 hpp->revision);
+		hpp = &pci_default_type0;
+	}
+
+	pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpp->cache_line_size);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpp->latency_timer);
+	pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
+	if (hpp->enable_serr)
+		pci_cmd |= PCI_COMMAND_SERR;
+	else
+		pci_cmd &= ~PCI_COMMAND_SERR;
+	if (hpp->enable_perr)
+		pci_cmd |= PCI_COMMAND_PARITY;
+	else
+		pci_cmd &= ~PCI_COMMAND_PARITY;
+	pci_write_config_word(dev, PCI_COMMAND, pci_cmd);
+
+	/* Program bridge control value */
+	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER,
+				      hpp->latency_timer);
+		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl);
+		if (hpp->enable_serr)
+			pci_bctl |= PCI_BRIDGE_CTL_SERR;
+		else
+			pci_bctl &= ~PCI_BRIDGE_CTL_SERR;
+		if (hpp->enable_perr)
+			pci_bctl |= PCI_BRIDGE_CTL_PARITY;
+		else
+			pci_bctl &= ~PCI_BRIDGE_CTL_PARITY;
+		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl);
+	}
+}
+
+static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp)
+{
+	if (hpp)
+		dev_warn(&dev->dev, "PCI-X settings not supported\n");
+}
+
+static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
+{
+	int pos;
+	u16 reg16;
+	u32 reg32;
+
+	if (!hpp)
+		return;
+
+	/* Find PCI Express capability */
+	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	if (!pos)
+		return;
+
+	if (hpp->revision > 1) {
+		dev_warn(&dev->dev, "PCIe settings rev %d not supported\n",
+			 hpp->revision);
+		return;
+	}
+
+	/* Initialize Device Control Register */
+	pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &reg16);
+	reg16 = (reg16 & hpp->pci_exp_devctl_and) | hpp->pci_exp_devctl_or;
+	pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, reg16);
+
+	/* Initialize Link Control Register */
+	if (dev->subordinate) {
+		pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &reg16);
+		reg16 = (reg16 & hpp->pci_exp_lnkctl_and)
+			| hpp->pci_exp_lnkctl_or;
+		pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, reg16);
+	}
+
+	/* Find Advanced Error Reporting Enhanced Capability */
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+	if (!pos)
+		return;
+
+	/* Initialize Uncorrectable Error Mask Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &reg32);
+	reg32 = (reg32 & hpp->unc_err_mask_and) | hpp->unc_err_mask_or;
+	pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32);
+
+	/* Initialize Uncorrectable Error Severity Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &reg32);
+	reg32 = (reg32 & hpp->unc_err_sever_and) | hpp->unc_err_sever_or;
+	pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32);
+
+	/* Initialize Correctable Error Mask Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &reg32);
+	reg32 = (reg32 & hpp->cor_err_mask_and) | hpp->cor_err_mask_or;
+	pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32);
+
+	/* Initialize Advanced Error Capabilities and Control Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
+	reg32 = (reg32 & hpp->adv_err_cap_and) | hpp->adv_err_cap_or;
+	pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
+
+	/*
+	 * FIXME: The following two registers are not supported yet.
+	 *
+	 *   o Secondary Uncorrectable Error Severity Register
+	 *   o Secondary Uncorrectable Error Mask Register
+	 */
+}
+
+void pci_configure_slot(struct pci_dev *dev)
+{
+	struct pci_dev *cdev;
+	struct hotplug_params hpp;
+	int ret;
+
+	if (!(dev->hdr_type == PCI_HEADER_TYPE_NORMAL ||
+			(dev->hdr_type == PCI_HEADER_TYPE_BRIDGE &&
+			(dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
+		return;
+
+	memset(&hpp, 0, sizeof(hpp));
+	ret = pci_get_hp_params(dev, &hpp);
+	if (ret)
+		dev_warn(&dev->dev, "no hotplug settings from platform\n");
+
+	program_hpp_type2(dev, hpp.t2);
+	program_hpp_type1(dev, hpp.t1);
+	program_hpp_type0(dev, hpp.t0);
+
+	if (dev->subordinate) {
+		list_for_each_entry(cdev, &dev->subordinate->devices,
+				    bus_list)
+			pci_configure_slot(cdev);
+	}
+}
+EXPORT_SYMBOL_GPL(pci_configure_slot);
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 0a7c2401d945..1cdef8317377 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -232,6 +232,15 @@ int acpi_get_hp_params_from_firmware(struct pci_dev *dev,
 int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags);
 int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle);
 int acpi_pci_detect_ejectable(acpi_handle handle);
+#define pci_get_hp_params(dev, hpp)  acpi_get_hp_params_from_firmware(dev, hpp)
+#else
+static inline int pci_get_hp_params(struct pci_dev *dev,
+				    struct hotplug_params *hpp)
+{
+	return -ENODEV;
+}
 #endif
+
+void pci_configure_slot(struct pci_dev *dev);
 #endif
 
-- 
cgit v1.2.3-71-gd317


From e81995bb1c0077a312cb621abc406a36f65a986a Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Mon, 14 Sep 2009 16:35:35 -0600
Subject: PCI hotplug: acpiphp: use generic pci_configure_slot()

Use the generic pci_configure_slot() rather than the acpiphp-specific
decode_hpp() and program_hpp().

Unlike the previous acpiphp-specific code, pci_configure_slot() programs
PCIe settings when an _HPX method provides them, so acpiphp-managed PCIe
devices can now be configured.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/acpi_pcihp.c   |  9 ++---
 drivers/pci/hotplug/acpiphp_glue.c | 81 +-------------------------------------
 include/linux/pci_hotplug.h        |  4 +-
 3 files changed, 7 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index 14d2d8a01f68..ee24de1c5fae 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -322,13 +322,12 @@ static acpi_status acpi_run_oshp(acpi_handle handle)
 	return status;
 }
 
-/* acpi_get_hp_params_from_firmware
+/* pci_get_hp_params
  *
  * @dev - the pci_dev for which we want parameters
  * @hpp - allocated by the caller
  */
-int acpi_get_hp_params_from_firmware(struct pci_dev *dev,
-		struct hotplug_params *hpp)
+int pci_get_hp_params(struct pci_dev *dev, struct hotplug_params *hpp)
 {
 	acpi_status status;
 	acpi_handle handle, phandle;
@@ -345,7 +344,7 @@ int acpi_get_hp_params_from_firmware(struct pci_dev *dev,
 	 * _HPP settings apply to all child buses, until another _HPP is
 	 * encountered. If we don't find an _HPP for the input pci dev,
 	 * look for it in the parent device scope since that would apply to
-	 * this pci dev. If we don't find any _HPP, use hardcoded defaults
+	 * this pci dev.
 	 */
 	while (handle) {
 		status = acpi_run_hpx(handle, hpp);
@@ -363,7 +362,7 @@ int acpi_get_hp_params_from_firmware(struct pci_dev *dev,
 	}
 	return -ENODEV;
 }
-EXPORT_SYMBOL_GPL(acpi_get_hp_params_from_firmware);
+EXPORT_SYMBOL_GPL(pci_get_hp_params);
 
 /**
  * acpi_get_hp_hw_control_from_firmware
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 14e6f1a17f28..58d25a163a8b 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -271,29 +271,6 @@ static int detect_ejectable_slots(acpi_handle handle)
 	return found;
 }
 
-
-/* decode ACPI 2.0 _HPP hot plug parameters */
-static void decode_hpp(struct pci_dev *dev, struct hotplug_params *hpp)
-{
-	int ret;
-
-	ret = acpi_get_hp_params_from_firmware(dev, hpp);
-	if (ret || !hpp->t0 || (hpp->t0->revision > 1)) {
-		/* use default numbers */
-		printk(KERN_WARNING
-		       "%s: Could not get hotplug parameters. Use defaults\n",
-		       __func__);
-		hpp->t0 = &hpp->type0_data;
-		hpp->t0->revision = 0;
-		hpp->t0->cache_line_size = 0x10;
-		hpp->t0->latency_timer = 0x40;
-		hpp->t0->enable_serr = 0;
-		hpp->t0->enable_perr = 0;
-	}
-}
-
-
-
 /* initialize miscellaneous stuff for both root and PCI-to-PCI bridge */
 static void init_bridge_misc(struct acpiphp_bridge *bridge)
 {
@@ -1247,66 +1224,12 @@ static int acpiphp_check_bridge(struct acpiphp_bridge *bridge)
 	return retval;
 }
 
-static void program_hpp(struct pci_dev *dev, struct hotplug_params *hpp)
-{
-	u16 pci_cmd, pci_bctl;
-	struct pci_dev *cdev;
-
-	/* Program hpp values for this device */
-	if (!(dev->hdr_type == PCI_HEADER_TYPE_NORMAL ||
-			(dev->hdr_type == PCI_HEADER_TYPE_BRIDGE &&
-			(dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
-		return;
-
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
-		return;
-
-	pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE,
-			hpp->t0->cache_line_size);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER,
-			hpp->t0->latency_timer);
-	pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
-	if (hpp->t0->enable_serr)
-		pci_cmd |= PCI_COMMAND_SERR;
-	else
-		pci_cmd &= ~PCI_COMMAND_SERR;
-	if (hpp->t0->enable_perr)
-		pci_cmd |= PCI_COMMAND_PARITY;
-	else
-		pci_cmd &= ~PCI_COMMAND_PARITY;
-	pci_write_config_word(dev, PCI_COMMAND, pci_cmd);
-
-	/* Program bridge control value and child devices */
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
-		pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER,
-				hpp->t0->latency_timer);
-		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl);
-		if (hpp->t0->enable_serr)
-			pci_bctl |= PCI_BRIDGE_CTL_SERR;
-		else
-			pci_bctl &= ~PCI_BRIDGE_CTL_SERR;
-		if (hpp->t0->enable_perr)
-			pci_bctl |= PCI_BRIDGE_CTL_PARITY;
-		else
-			pci_bctl &= ~PCI_BRIDGE_CTL_PARITY;
-		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl);
-		if (dev->subordinate) {
-			list_for_each_entry(cdev, &dev->subordinate->devices,
-					bus_list)
-				program_hpp(cdev, hpp);
-		}
-	}
-}
-
 static void acpiphp_set_hpp_values(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
-	struct hotplug_params hpp;
 
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		decode_hpp(dev, &hpp);
-		program_hpp(dev, &hpp);
-	}
+	list_for_each_entry(dev, &bus->devices, bus_list)
+		pci_configure_slot(dev);
 }
 
 /*
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 1cdef8317377..652ba797696d 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -227,12 +227,10 @@ struct hotplug_params {
 #ifdef CONFIG_ACPI
 #include <acpi/acpi.h>
 #include <acpi/acpi_bus.h>
-int acpi_get_hp_params_from_firmware(struct pci_dev *dev,
-				struct hotplug_params *hpp);
+int pci_get_hp_params(struct pci_dev *dev, struct hotplug_params *hpp);
 int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags);
 int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle);
 int acpi_pci_detect_ejectable(acpi_handle handle);
-#define pci_get_hp_params(dev, hpp)  acpi_get_hp_params_from_firmware(dev, hpp)
 #else
 static inline int pci_get_hp_params(struct pci_dev *dev,
 				    struct hotplug_params *hpp)
-- 
cgit v1.2.3-71-gd317


From 353f6dd2dec992ddd34620a94b051b0f76227379 Mon Sep 17 00:00:00 2001
From: Anirban Sinha <asinha@zeugmasystems.com>
Date: Mon, 14 Sep 2009 11:13:37 -0700
Subject: cleanup console_print()

console_print() is an old legacy interface mostly unused in the entire
kernel tree. It's best to clean up its existing use and let developers
use their own implementation of it as they feel fit.

Signed-off-by: Anirban Sinha <asinha@zeugmasystems.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/head.S    |  1 +
 arch/ia64/kernel/head.h    |  1 +
 arch/ia64/kernel/process.c |  7 +++++++
 drivers/char/serial167.c   |  5 ++---
 include/linux/dtlk.h       | 19 -------------------
 include/linux/tty.h        |  4 ----
 kernel/printk.c            |  6 ------
 7 files changed, 11 insertions(+), 32 deletions(-)
 create mode 100644 arch/ia64/kernel/head.h

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index 23f846de62d5..e6c5c3d5e1f8 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -34,6 +34,7 @@
 #include <asm/mca_asm.h>
 #include <linux/init.h>
 #include <linux/linkage.h>
+#include "head.h"
 
 #ifdef CONFIG_HOTPLUG_CPU
 #define SAL_PSR_BITS_TO_SET				\
diff --git a/arch/ia64/kernel/head.h b/arch/ia64/kernel/head.h
new file mode 100644
index 000000000000..2e2ac6824e65
--- /dev/null
+++ b/arch/ia64/kernel/head.h
@@ -0,0 +1 @@
+extern void console_print(const char *s);
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 89969e950045..9bcec9945c12 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -161,6 +161,13 @@ show_regs (struct pt_regs *regs)
 		show_stack(NULL, NULL);
 }
 
+/* local support for deprecated console_print */
+void
+console_print(const char *s)
+{
+	printk(KERN_EMERG "%s", s);
+}
+
 void
 do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
 {
diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c
index 51e7a46787be..5942a9d674c0 100644
--- a/drivers/char/serial167.c
+++ b/drivers/char/serial167.c
@@ -171,7 +171,6 @@ static int startup(struct cyclades_port *);
 static void cy_throttle(struct tty_struct *);
 static void cy_unthrottle(struct tty_struct *);
 static void config_setup(struct cyclades_port *);
-extern void console_print(const char *);
 #ifdef CYCLOM_SHOW_STATUS
 static void show_status(int);
 #endif
@@ -245,7 +244,7 @@ void SP(char *data)
 {
 	unsigned long flags;
 	local_irq_save(flags);
-	console_print(data);
+	printk(KERN_EMERG "%s", data);
 	local_irq_restore(flags);
 }
 
@@ -255,7 +254,7 @@ void CP(char data)
 	unsigned long flags;
 	local_irq_save(flags);
 	scrn[0] = data;
-	console_print(scrn);
+	printk(KERN_EMERG "%c", scrn);
 	local_irq_restore(flags);
 }				/* CP */
 
diff --git a/include/linux/dtlk.h b/include/linux/dtlk.h
index 2896d90118a9..22a7b9a5f5d1 100644
--- a/include/linux/dtlk.h
+++ b/include/linux/dtlk.h
@@ -1,22 +1,3 @@
-#if 0
-
-#define TRACE_TXT(text) \
-	{ \
-	  if(dtlk_trace) \
-	  { \
-	    console_print(text); \
-	    console_print("\n"); \
-	  } \
-	}
-
-#define TRACE_CHR(chr) \
-	{ \
-	  if(dtlk_trace) \
-	    console_print(chr); \
-	} \
-
-#endif
-
 #define DTLK_MINOR	0
 #define DTLK_IO_EXTENT	0x02
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 0d3974f59c53..a916a318004e 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -519,10 +519,6 @@ extern void serial_console_init(void);
 
 extern int pcxe_open(struct tty_struct *tty, struct file *filp);
 
-/* printk.c */
-
-extern void console_print(const char *);
-
 /* vt.c */
 
 extern int vt_ioctl(struct tty_struct *tty, struct file *file,
diff --git a/kernel/printk.c b/kernel/printk.c
index e10d193a833a..602033acd6c7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1075,12 +1075,6 @@ void __sched console_conditional_schedule(void)
 }
 EXPORT_SYMBOL(console_conditional_schedule);
 
-void console_print(const char *s)
-{
-	printk(KERN_EMERG "%s", s);
-}
-EXPORT_SYMBOL(console_print);
-
 void console_unblank(void)
 {
 	struct console *c;
-- 
cgit v1.2.3-71-gd317


From 8a478905adbb2e09a59644e76f7fe7e0ab644204 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 14 Sep 2009 20:59:48 -0400
Subject: SELinux: inline selinux_is_enabled in !CONFIG_SECURITY_SELINUX

Without this patch building a kernel emits millions of warning like:

include/linux/selinux.h:92: warning: ?selinux_is_enabled? defined but not used

When it is build without CONFIG_SECURITY_SELINUX.  This is harmless, but
the function should be inlined, so it gets compiled out.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/selinux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 223d06a6feb1..82e0f26a1299 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -89,7 +89,7 @@ static inline void selinux_secmark_refcount_dec(void)
 	return;
 }
 
-static bool selinux_is_enabled(void)
+static inline bool selinux_is_enabled(void)
 {
 	return false;
 }
-- 
cgit v1.2.3-71-gd317


From 12e09337fe238981cb0c87543306e23775d1a143 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Sep 2009 23:37:40 +0200
Subject: time: Prevent 32 bit overflow with set_normalized_timespec()

set_normalized_timespec() nsec argument is of type long. The recent
timekeeping changes of ktime_get_ts() feed

	ts->tv_nsec + tomono.tv_nsec + nsecs

to set_normalized_timespec(). On 32 bit machines that sum can be
larger than (1 << 31) and therefor result in a negative value which
screws up the result completely.

Make the nsec argument of set_normalized_timespec() s64 to fix the
problem at hand. This also prevents similar problems for future users
of set_normalized_timespec().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Carsten Emde <carsten.emde@osadl.org>
LKML-Reference: <new-submission>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
---
 include/linux/time.h | 2 +-
 kernel/time.c        | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 256232f7e5e6..56787c093345 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -75,7 +75,7 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon,
 			    const unsigned int day, const unsigned int hour,
 			    const unsigned int min, const unsigned int sec);
 
-extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
+extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec);
 extern struct timespec timespec_add_safe(const struct timespec lhs,
 					 const struct timespec rhs);
 
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
  *	0 <= tv_nsec < NSEC_PER_SEC
  * For negative values only the tv_sec field is negative !
  */
-void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
+void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
 {
 	while (nsec >= NSEC_PER_SEC) {
+		/*
+		 * The following asm() prevents the compiler from
+		 * optimising this loop into a modulo operation. See
+		 * also __iter_div_u64_rem() in include/linux/time.h
+		 */
+		asm("" : "+rm"(nsec));
 		nsec -= NSEC_PER_SEC;
 		++sec;
 	}
 	while (nsec < 0) {
+		asm("" : "+rm"(nsec));
 		nsec += NSEC_PER_SEC;
 		--sec;
 	}
-- 
cgit v1.2.3-71-gd317


From 5f3edc1b1ead6d9bd45a85c551f44eff8fe76b9f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 13:42:00 +0200
Subject: sched: Hook sched_balance_self() into sched_class::select_task_rq()

Rather ugly patch to fully place the sched_balance_self() code
inside the fair class.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  3 ++-
 kernel/sched.c          | 14 +++++++-------
 kernel/sched_fair.c     |  7 ++++++-
 kernel/sched_idletask.c |  2 +-
 kernel/sched_rt.c       |  5 ++++-
 5 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3d74bd04d18..5d3c9900943e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -811,6 +811,7 @@ enum cpu_idle_type {
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_WAKE_IDLE_FAR	0x0800	/* Gain latency sacrificing cache hit */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
+#define SD_BALANCE_WAKE		0x2000  /* Balance on wakeup */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
@@ -1032,7 +1033,7 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct task_struct *p, int sync);
+	int  (*select_task_rq)(struct task_struct *p, int flag, int sync);
 
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
 			struct rq *busiest, unsigned long max_load_move,
diff --git a/kernel/sched.c b/kernel/sched.c
index 60400a22401f..32b7a81230c2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2350,7 +2350,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-	cpu = p->sched_class->select_task_rq(p, sync);
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync);
 	if (cpu != orig_cpu) {
 		set_task_cpu(p, cpu);
 		task_rq_unlock(rq, &flags);
@@ -2525,11 +2525,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
 	__sched_fork(p);
 
-#ifdef CONFIG_SMP
-	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
-#endif
-	set_task_cpu(p, cpu);
-
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
 	 */
@@ -2560,6 +2555,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
+#ifdef CONFIG_SMP
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+#endif
+	set_task_cpu(p, cpu);
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -3114,7 +3114,7 @@ out:
 void sched_exec(void)
 {
 	int new_cpu, this_cpu = get_cpu();
-	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
+	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
 	put_cpu();
 	if (new_cpu != this_cpu)
 		sched_migrate_task(current, new_cpu);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a82d71d3afed..f2eb5b934715 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1300,7 +1300,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	return 0;
 }
 
-static int select_task_rq_fair(struct task_struct *p, int sync)
+static int sched_balance_self(int cpu, int flag);
+
+static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
 	struct sched_domain *sd, *this_sd = NULL;
 	int prev_cpu, this_cpu, new_cpu;
@@ -1314,6 +1316,9 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
+	if (flag != SD_BALANCE_WAKE)
+		return sched_balance_self(this_cpu, flag);
+
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..99b2f0337609 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sync)
+static int select_task_rq_idle(struct task_struct *p, int flag, int sync)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2eb4bd6a526c..438380810ac4 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sync)
+static int select_task_rq_rt(struct task_struct *p, int flag, int sync)
 {
 	struct rq *rq = task_rq(p);
 
+	if (flag != SD_BALANCE_WAKE)
+		return smp_processor_id();
+
 	/*
 	 * If the current task is an RT task, then
 	 * try to see if we can wake this RT task up on another
-- 
cgit v1.2.3-71-gd317


From e9c8431185d6c406887190519f6dbdd112641686 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Sep 2009 14:43:03 +0200
Subject: sched: Add TASK_WAKING

We're going to want to drop rq->lock in try_to_wake_up() for a
longer period of time, however we also want to deal with concurrent
waking of the same task, which is currently handled by holding
rq->lock.

So introduce a new TASK state, namely TASK_WAKING, which indicates
someone is already waking the task (other wakers will fail p->state
& state).

We also keep preemption disabled over the whole ttwu().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        | 31 +++++++++++++++----------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d3c9900943e..3b0ca66bd6ce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -190,6 +190,7 @@ extern unsigned long long time_sync_thresh;
 /* in tsk->state again */
 #define TASK_DEAD		64
 #define TASK_WAKEKILL		128
+#define TASK_WAKING		256
 
 /* Convenience macros for the sake of set_task_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
diff --git a/kernel/sched.c b/kernel/sched.c
index 32b7a81230c2..fc6fda881d2e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2310,7 +2310,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	long old_state;
 	struct rq *rq;
 
 	if (!sched_feat(SYNC_WAKEUPS))
@@ -2332,11 +2331,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	}
 #endif
 
+	this_cpu = get_cpu();
+
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	update_rq_clock(rq);
-	old_state = p->state;
-	if (!(old_state & state))
+	if (!(p->state & state))
 		goto out;
 
 	if (p->se.on_rq)
@@ -2344,27 +2344,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	cpu = task_cpu(p);
 	orig_cpu = cpu;
-	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
+	/*
+	 * In order to handle concurrent wakeups and release the rq->lock
+	 * we put the task in TASK_WAKING state.
+	 */
+	p->state = TASK_WAKING;
+	task_rq_unlock(rq, &flags);
+
 	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync);
-	if (cpu != orig_cpu) {
+	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
-		task_rq_unlock(rq, &flags);
-		/* might preempt at this point */
-		rq = task_rq_lock(p, &flags);
-		old_state = p->state;
-		if (!(old_state & state))
-			goto out;
-		if (p->se.on_rq)
-			goto out_running;
 
-		this_cpu = smp_processor_id();
-		cpu = task_cpu(p);
-	}
+	rq = task_rq_lock(p, &flags);
+	WARN_ON(p->state != TASK_WAKING);
+	cpu = task_cpu(p);
 
 #ifdef CONFIG_SCHEDSTATS
 	schedstat_inc(rq, ttwu_count);
@@ -2422,6 +2420,7 @@ out_running:
 #endif
 out:
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 
 	return success;
 }
-- 
cgit v1.2.3-71-gd317


From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 13:50:02 +0200
Subject: sched: Merge select_task_rq_fair() and sched_balance_self()

The problem with wake_idle() is that is doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.

To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().

Modify sched_balance_self() to:

  - update_shares() when walking up the domain tree,
    (it only called it for the top domain, but it should
     have done this anyway), which allows us to remove
    this ugly bit from try_to_wake_up().

  - do wake_affine() on the smallest domain that contains
    both this (the waking) and the prev (the wakee) cpu for
    WAKE invocations.

Then use the top-down balance steps it had to replace wake_idle().

This leads to the dissapearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.

SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.

Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, magny-core and small nehalem systems would want this
enabled, systems with slow interconnects would not.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/include/asm/topology.h           |   5 +-
 arch/mips/include/asm/mach-ip27/topology.h |   2 +-
 arch/powerpc/include/asm/topology.h        |   5 +-
 arch/sh/include/asm/topology.h             |   4 +-
 arch/sparc/include/asm/topology_64.h       |   4 +-
 arch/x86/include/asm/topology.h            |   4 +-
 include/linux/sched.h                      |   7 +-
 include/linux/topology.h                   |  16 +-
 kernel/sched.c                             |  41 +----
 kernel/sched_fair.c                        | 233 ++++++++---------------------
 10 files changed, 84 insertions(+), 237 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 7b4c8c70b2d1..cf6053b226c3 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -67,6 +67,7 @@ void build_cpu_to_node_map(void);
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_WAKE	\
 				| SD_WAKE_AFFINE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -91,8 +92,8 @@ void build_cpu_to_node_map(void);
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 64,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 07547231e078..d8332398f5be 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -48,7 +48,7 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 	.cache_nice_tries	= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 054a16d68082..c6343313ff59 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -62,9 +62,8 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_NEWIDLE	\
-				| SD_WAKE_IDLE		\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index b69ee850906d..dc1531e2f25f 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -21,8 +21,8 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index e5ea8d332421..1d091abd2d13 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -57,8 +57,8 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 }
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 26d06e052a18..966d58dc6274 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -145,14 +145,12 @@ extern unsigned long node_remap_size[];
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 0*SD_WAKE_IDLE			\
+				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
-				| 1*SD_WAKE_BALANCE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 1*SD_SERIALIZE			\
-				| 1*SD_WAKE_IDLE_FAR			\
 				| 0*SD_PREFER_SIBLING			\
 				,					\
 	.last_balance		= jiffies,				\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b0ca66bd6ce..c30bf3d516d1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -803,16 +803,15 @@ enum cpu_idle_type {
 #define SD_BALANCE_NEWIDLE	0x0002	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		0x0004	/* Balance on exec */
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
-#define SD_WAKE_IDLE		0x0010	/* Wake to idle CPU on task wakeup */
+#define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_WAKE_BALANCE		0x0040	/* Perform balancing at task wakeup */
+
 #define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-#define SD_WAKE_IDLE_FAR	0x0800	/* Gain latency sacrificing cache hit */
+
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
-#define SD_BALANCE_WAKE		0x2000  /* Balance on wakeup */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 85e8cf7d393c..6a8cd15555bb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -95,14 +95,12 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 0*SD_WAKE_IDLE			\
+				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
-				| 1*SD_WAKE_BALANCE			\
 				| 1*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
-				| 0*SD_WAKE_IDLE_FAR			\
 				| 0*SD_PREFER_SIBLING			\
 				,					\
 	.last_balance		= jiffies,				\
@@ -129,13 +127,11 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 1*SD_WAKE_IDLE			\
+				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
-				| 1*SD_WAKE_BALANCE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
-				| 0*SD_WAKE_IDLE_FAR			\
 				| sd_balance_for_mc_power()		\
 				| sd_power_saving_flags()		\
 				,					\
@@ -163,13 +159,11 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 1*SD_WAKE_IDLE			\
+				| 1*SD_BALANCE_WAKE			\
 				| 0*SD_WAKE_AFFINE			\
-				| 1*SD_WAKE_BALANCE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
-				| 0*SD_WAKE_IDLE_FAR			\
 				| sd_balance_for_package_power()	\
 				| sd_power_saving_flags()		\
 				,					\
@@ -191,14 +185,12 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 0*SD_BALANCE_EXEC			\
 				| 0*SD_BALANCE_FORK			\
-				| 0*SD_WAKE_IDLE			\
+				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
-				| 0*SD_WAKE_BALANCE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 1*SD_SERIALIZE			\
-				| 1*SD_WAKE_IDLE_FAR			\
 				| 0*SD_PREFER_SIBLING			\
 				,					\
 	.last_balance		= jiffies,				\
diff --git a/kernel/sched.c b/kernel/sched.c
index fc6fda881d2e..6c819f338b11 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -512,14 +512,6 @@ struct root_domain {
 #ifdef CONFIG_SMP
 	struct cpupri cpupri;
 #endif
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	/*
-	 * Preferred wake up cpu nominated by sched_mc balance that will be
-	 * used when most cpus are idle in the system indicating overall very
-	 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
-	 */
-	unsigned int sched_mc_preferred_wakeup_cpu;
-#endif
 };
 
 /*
@@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
-#ifdef CONFIG_SMP
-	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
-		struct sched_domain *sd;
-
-		this_cpu = raw_smp_processor_id();
-		cpu = task_cpu(p);
-
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				update_shares(sd);
-				break;
-			}
-		}
-	}
-#endif
-
 	this_cpu = get_cpu();
 
 	smp_wmb();
@@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 	*imbalance = sds->min_load_per_task;
 	sds->busiest = sds->group_min;
 
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(sds->group_leader);
-	}
-
 	return 1;
 
 }
@@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd)
 	}
 
 	/* Following flags don't use groups */
-	if (sd->flags & (SD_WAKE_IDLE |
-			 SD_WAKE_AFFINE |
-			 SD_WAKE_BALANCE))
+	if (sd->flags & (SD_WAKE_AFFINE))
 		return 0;
 
 	return 1;
@@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;
 
-	/* Does parent contain flags not in child? */
-	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
-	if (cflags & SD_WAKE_AFFINE)
-		pflags &= ~SD_WAKE_BALANCE;
 	/* Flags needing groups don't count if only 1 group in parent */
 	if (parent->groups == parent->groups->next) {
 		pflags &= ~(SD_LOAD_BALANCE |
@@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd,
 		request = attr->relax_domain_level;
 	if (request < sd->level) {
 		/* turn off idle balance on this domain */
-		sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	} else {
 		/* turn on idle balance on this domain */
-		sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2eb5b934715..09d19f77eb3a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq)
 	se->vruntime = rightmost->vruntime + 1;
 }
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-
-#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-
-static int wake_idle(int cpu, struct task_struct *p)
-{
-	struct sched_domain *sd;
-	int i;
-	unsigned int chosen_wakeup_cpu;
-	int this_cpu;
-	struct rq *task_rq = task_rq(p);
-
-	/*
-	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-	 * are idle and this is not a kernel thread and this task's affinity
-	 * allows it to be moved to preferred cpu, then just move!
-	 */
-
-	this_cpu = smp_processor_id();
-	chosen_wakeup_cpu =
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-		idle_cpu(cpu) && idle_cpu(this_cpu) &&
-		p->mm && !(p->flags & PF_KTHREAD) &&
-		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-		return chosen_wakeup_cpu;
-
-	/*
-	 * If it is idle, then it is the best cpu to run this task.
-	 *
-	 * This cpu is also the best, if it has more than one task already.
-	 * Siblings must be also busy(in most cases) as they didn't already
-	 * pickup the extra load from this cpu and hence we need not check
-	 * sibling runqueue info. This will avoid the checks and cache miss
-	 * penalities associated with that.
-	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-		return cpu;
-
-	for_each_domain(cpu, sd) {
-		if ((sd->flags & SD_WAKE_IDLE)
-		    || ((sd->flags & SD_WAKE_IDLE_FAR)
-			&& !task_hot(p, task_rq->clock, sd))) {
-			for_each_cpu_and(i, sched_domain_span(sd),
-					 &p->cpus_allowed) {
-				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-					if (i != task_cpu(p)) {
-						schedstat_inc(p,
-						       se.nr_wakeups_idle);
-					}
-					return i;
-				}
-			}
-		} else {
-			break;
-		}
-	}
-	return cpu;
-}
-#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-	return cpu;
-}
-#endif
-
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
-static int
-wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-	    int idx, unsigned long load, unsigned long this_load,
-	    unsigned int imbalance)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	struct task_struct *curr = this_rq->curr;
-	struct task_group *tg;
-	unsigned long tl = this_load;
+	struct task_struct *curr = current;
+	unsigned long this_load, load;
+	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
+	unsigned int imbalance;
+	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
-	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-		return 0;
+	idx	  = sd->wake_idx;
+	this_cpu  = smp_processor_id();
+	prev_cpu  = task_cpu(p);
+	load	  = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
 
 	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
 			p->se.avg_overlap > sysctl_sched_migration_cost))
@@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 		tg = task_group(current);
 		weight = current->se.load.weight;
 
-		tl += effective_load(tg, this_cpu, -weight, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
+	imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped tl to 0, we'll always have
-	 * an imbalance, but there's really nothing you can do about that, so
-	 * that's good too.
+	 * due to the sync cause above having dropped this_load to 0, we'll
+	 * always have an imbalance, but there's really nothing you can do
+	 * about that, so that's good too.
 	 *
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	balanced = !tl ||
-		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+	balanced = !this_load ||
+		100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
@@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-			tl_per_task)) {
+	if (balanced ||
+	    (this_load <= load &&
+	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
 		 * there is no bad imbalance.
 		 */
-		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(sd, ttwu_move_affine);
 		schedstat_inc(p, se.nr_wakeups_affine);
 
 		return 1;
@@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	return 0;
 }
 
-static int sched_balance_self(int cpu, int flag);
-
-static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
-{
-	struct sched_domain *sd, *this_sd = NULL;
-	int prev_cpu, this_cpu, new_cpu;
-	unsigned long load, this_load;
-	struct rq *this_rq;
-	unsigned int imbalance;
-	int idx;
-
-	prev_cpu	= task_cpu(p);
-	this_cpu	= smp_processor_id();
-	this_rq		= cpu_rq(this_cpu);
-	new_cpu		= prev_cpu;
-
-	if (flag != SD_BALANCE_WAKE)
-		return sched_balance_self(this_cpu, flag);
-
-	/*
-	 * 'this_sd' is the first domain that both
-	 * this_cpu and prev_cpu are present in:
-	 */
-	for_each_domain(this_cpu, sd) {
-		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-			this_sd = sd;
-			break;
-		}
-	}
-
-	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-		goto out;
-
-	/*
-	 * Check for affine wakeup and passive balancing possibilities.
-	 */
-	if (!this_sd)
-		goto out;
-
-	idx = this_sd->wake_idx;
-
-	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-	load = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
-
-	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-				     load, this_load, imbalance))
-		return this_cpu;
-
-	/*
-	 * Start passive balancing when half the imbalance_pct
-	 * limit is reached.
-	 */
-	if (this_sd->flags & SD_WAKE_BALANCE) {
-		if (imbalance*this_load <= 100*load) {
-			schedstat_inc(this_sd, ttwu_move_balance);
-			schedstat_inc(p, se.nr_wakeups_passive);
-			return this_cpu;
-		}
-	}
-
-out:
-	return wake_idle(new_cpu, p);
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int sched_balance_self(int cpu, int flag)
+static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
 	struct task_struct *t = current;
 	struct sched_domain *tmp, *sd = NULL;
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int new_cpu = cpu;
+	int want_affine = 0;
+
+	if (flag & SD_BALANCE_WAKE) {
+		if (sched_feat(AFFINE_WAKEUPS))
+			want_affine = 1;
+		new_cpu = prev_cpu;
+	}
 
 	for_each_domain(cpu, tmp) {
 		/*
@@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag)
 		 */
 		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
 			break;
-		if (tmp->flags & flag)
-			sd = tmp;
-	}
 
-	if (sd)
-		update_shares(sd);
+		switch (flag) {
+		case SD_BALANCE_WAKE:
+			if (!sched_feat(LB_WAKEUP_UPDATE))
+				break;
+		case SD_BALANCE_FORK:
+		case SD_BALANCE_EXEC:
+			if (root_task_group_empty())
+				break;
+			update_shares(tmp);
+		default:
+			break;
+		}
+
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+
+			if (wake_affine(tmp, p, sync))
+				return cpu;
+
+			want_affine = 0;
+		}
+
+		if (!(tmp->flags & flag))
+			continue;
+
+		sd = tmp;
+	}
 
 	while (sd) {
 		struct sched_group *group;
-		int new_cpu, weight;
+		int weight;
 
 		if (!(sd->flags & flag)) {
 			sd = sd->child;
@@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag)
 		/* while loop will break here if sd == NULL */
 	}
 
-	return cpu;
+	return new_cpu;
 }
 #endif /* CONFIG_SMP */
 
-- 
cgit v1.2.3-71-gd317


From 78e7ed53c9f42f04f9401ada6f7047db60781676 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 3 Sep 2009 13:16:51 +0200
Subject: sched: Tweak wake_idx

When merging select_task_rq_fair() and sched_balance_self() we lost
the use of wake_idx, restore that and set them to 0 to make wake
balancing more aggressive.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/include/asm/topology.h     |  5 +++--
 arch/powerpc/include/asm/topology.h  |  3 ++-
 arch/sh/include/asm/topology.h       |  2 +-
 arch/sparc/include/asm/topology_64.h |  2 +-
 arch/x86/include/asm/topology.h      |  2 +-
 include/linux/topology.h             |  4 ++--
 kernel/sched_fair.c                  | 21 ++++++++++++++++++---
 7 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index cf6053b226c3..47f3c51d5e27 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -62,11 +62,12 @@ void build_cpu_to_node_map(void);
 	.busy_idx		= 2,			\
 	.idle_idx		= 1,			\
 	.newidle_idx		= 2,			\
-	.wake_idx		= 1,			\
+	.wake_idx		= 0,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_FORK	\
 				| SD_BALANCE_WAKE	\
 				| SD_WAKE_AFFINE,	\
 	.last_balance		= jiffies,		\
@@ -87,7 +88,7 @@ void build_cpu_to_node_map(void);
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
 	.newidle_idx		= 2,			\
-	.wake_idx		= 1,			\
+	.wake_idx		= 0,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c6343313ff59..a6b220ab56db 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -58,9 +58,10 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 	.busy_idx		= 3,			\
 	.idle_idx		= 1,			\
 	.newidle_idx		= 2,			\
-	.wake_idx		= 1,			\
+	.wake_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_FORK	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_WAKE	\
 				| SD_SERIALIZE,		\
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index dc1531e2f25f..9054e5c0ad54 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -16,7 +16,7 @@
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
 	.newidle_idx		= 2,			\
-	.wake_idx		= 1,			\
+	.wake_idx		= 0,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 1d091abd2d13..bc3a0930ed64 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -52,7 +52,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
 	.newidle_idx		= 0, 			\
-	.wake_idx		= 1,			\
+	.wake_idx		= 0,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 966d58dc6274..4b1b335097b5 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -138,7 +138,7 @@ extern unsigned long node_remap_size[];
 	.busy_idx		= 3,					\
 	.idle_idx		= SD_IDLE_IDX,				\
 	.newidle_idx		= SD_NEWIDLE_IDX,			\
-	.wake_idx		= 1,					\
+	.wake_idx		= 0,					\
 	.forkexec_idx		= SD_FORKEXEC_IDX,			\
 									\
 	.flags			= 1*SD_LOAD_BALANCE			\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 6a8cd15555bb..fef57040a4e2 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -120,7 +120,7 @@ int arch_update_cpu_topology(void);
 	.imbalance_pct		= 125,					\
 	.cache_nice_tries	= 1,					\
 	.busy_idx		= 2,					\
-	.wake_idx		= 1,					\
+	.wake_idx		= 0,					\
 	.forkexec_idx		= 1,					\
 									\
 	.flags			= 1*SD_LOAD_BALANCE			\
@@ -152,7 +152,7 @@ int arch_update_cpu_topology(void);
 	.busy_idx		= 2,					\
 	.idle_idx		= 1,					\
 	.newidle_idx		= 2,					\
-	.wake_idx		= 1,					\
+	.wake_idx		= 0,					\
 	.forkexec_idx		= 1,					\
 									\
 	.flags			= 1*SD_LOAD_BALANCE			\
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8b3eddbcf9a4..19593568031a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1232,12 +1232,27 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  * domain.
  */
 static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+		  int this_cpu, int flag)
 {
 	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
-	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+	int load_idx = 0;
+
+	switch (flag) {
+	case SD_BALANCE_FORK:
+	case SD_BALANCE_EXEC:
+		load_idx = sd->forkexec_idx;
+		break;
+
+	case SD_BALANCE_WAKE:
+		load_idx = sd->wake_idx;
+		break;
+
+	default:
+		break;
+	}
 
 	do {
 		unsigned long load, avg_load;
@@ -1392,7 +1407,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu);
+		group = find_idlest_group(sd, p, cpu, flag);
 		if (!group) {
 			sd = sd->child;
 			continue;
-- 
cgit v1.2.3-71-gd317


From 6bd7821f905a8d6c471f0d6675f5cb7ea448d791 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Sep 2009 18:42:15 +0200
Subject: sched: Fix some domain tunings

CPU level should have WAKE_AFFINE, whereas ALLNODES is dubious.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/topology.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index fef57040a4e2..c87edcd87967 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -160,7 +160,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
 				| 1*SD_BALANCE_WAKE			\
-				| 0*SD_WAKE_AFFINE			\
+				| 1*SD_WAKE_AFFINE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
@@ -186,7 +186,7 @@ int arch_update_cpu_topology(void);
 				| 0*SD_BALANCE_EXEC			\
 				| 0*SD_BALANCE_FORK			\
 				| 0*SD_BALANCE_WAKE			\
-				| 1*SD_WAKE_AFFINE			\
+				| 0*SD_WAKE_AFFINE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
 				| 0*SD_SHARE_PKG_RESOURCES		\
-- 
cgit v1.2.3-71-gd317


From 0ec9fab3d186d9cbb00c0f694d4a260d07c198d9 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 15 Sep 2009 15:07:03 +0200
Subject: sched: Improve latencies and throughput

Make the idle balancer more agressive, to improve a
x264 encoding workload provided by Jason Garrett-Glaser:

 NEXT_BUDDY NO_LB_BIAS
 encoded 600 frames, 252.82 fps, 22096.60 kb/s
 encoded 600 frames, 250.69 fps, 22096.60 kb/s
 encoded 600 frames, 245.76 fps, 22096.60 kb/s

 NO_NEXT_BUDDY LB_BIAS
 encoded 600 frames, 344.44 fps, 22096.60 kb/s
 encoded 600 frames, 346.66 fps, 22096.60 kb/s
 encoded 600 frames, 352.59 fps, 22096.60 kb/s

 NO_NEXT_BUDDY NO_LB_BIAS
 encoded 600 frames, 425.75 fps, 22096.60 kb/s
 encoded 600 frames, 425.45 fps, 22096.60 kb/s
 encoded 600 frames, 422.49 fps, 22096.60 kb/s

Peter pointed out that this is better done via newidle_idx,
not via LB_BIAS, newidle balancing should look for where
there is load _now_, not where there was load 2 ticks ago.

Worst-case latencies are improved as well as no buddies
means less vruntime spread. (as per prior lkml discussions)

This change improves kbuild-peak parallelism as well.

Reported-by: Jason Garrett-Glaser <darkshikari@gmail.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1253011667.9128.16.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/include/asm/topology.h    | 5 +++--
 arch/powerpc/include/asm/topology.h | 2 +-
 arch/sh/include/asm/topology.h      | 3 ++-
 arch/x86/include/asm/topology.h     | 4 +---
 include/linux/topology.h            | 2 +-
 kernel/sched_features.h             | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 47f3c51d5e27..42f1673ec83f 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -61,7 +61,7 @@ void build_cpu_to_node_map(void);
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 2,			\
 	.idle_idx		= 1,			\
-	.newidle_idx		= 2,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
@@ -87,10 +87,11 @@ void build_cpu_to_node_map(void);
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
-	.newidle_idx		= 2,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
+				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_WAKE	\
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index a6b220ab56db..1a2c9eb42a03 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -57,7 +57,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 1,			\
-	.newidle_idx		= 2,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 9054e5c0ad54..c8436771e31d 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -15,13 +15,14 @@
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
-	.newidle_idx		= 2,			\
+	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_WAKE	\
+				| SD_BALANCE_NEWIDLE	\
 				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 4b1b335097b5..7fafd1bc4149 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,14 +116,12 @@ extern unsigned long node_remap_size[];
 
 # define SD_CACHE_NICE_TRIES	1
 # define SD_IDLE_IDX		1
-# define SD_NEWIDLE_IDX		2
 # define SD_FORKEXEC_IDX	0
 
 #else
 
 # define SD_CACHE_NICE_TRIES	2
 # define SD_IDLE_IDX		2
-# define SD_NEWIDLE_IDX		2
 # define SD_FORKEXEC_IDX	1
 
 #endif
@@ -137,7 +135,7 @@ extern unsigned long node_remap_size[];
 	.cache_nice_tries	= SD_CACHE_NICE_TRIES,			\
 	.busy_idx		= 3,					\
 	.idle_idx		= SD_IDLE_IDX,				\
-	.newidle_idx		= SD_NEWIDLE_IDX,			\
+	.newidle_idx		= 0,					\
 	.wake_idx		= 0,					\
 	.forkexec_idx		= SD_FORKEXEC_IDX,			\
 									\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index c87edcd87967..4298745615a5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -151,7 +151,7 @@ int arch_update_cpu_topology(void);
 	.cache_nice_tries	= 1,					\
 	.busy_idx		= 2,					\
 	.idle_idx		= 1,					\
-	.newidle_idx		= 2,					\
+	.newidle_idx		= 0,					\
 	.wake_idx		= 0,					\
 	.forkexec_idx		= 1,					\
 									\
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 891ea0f72b46..e98c2e8de1d5 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -67,7 +67,7 @@ SCHED_FEAT(AFFINE_WAKEUPS, 1)
  * wakeup-preemption), since its likely going to consume data we
  * touched, increases cache locality.
  */
-SCHED_FEAT(NEXT_BUDDY, 1)
+SCHED_FEAT(NEXT_BUDDY, 0)
 
 /*
  * Prefer to schedule the task that ran last (when we did
-- 
cgit v1.2.3-71-gd317


From b8a543ea5a5896830a9969bacfd047f9d15940b2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Sep 2009 15:22:03 +0200
Subject: sched: Reduce forkexec_idx

If we're looking to place a new task, we might as well find the
idlest position _now_, not 1 tick ago.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/include/asm/topology.h     | 4 ++--
 arch/sh/include/asm/topology.h       | 2 +-
 arch/sparc/include/asm/topology_64.h | 2 +-
 arch/x86/include/asm/topology.h      | 4 +---
 include/linux/topology.h             | 4 ++--
 5 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 42f1673ec83f..569b9dafc78c 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -63,7 +63,7 @@ void build_cpu_to_node_map(void);
 	.idle_idx		= 1,			\
 	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
-	.forkexec_idx		= 1,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
@@ -89,7 +89,7 @@ void build_cpu_to_node_map(void);
 	.idle_idx		= 2,			\
 	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
-	.forkexec_idx		= 1,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index c8436771e31d..a8cc564b703d 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -17,7 +17,7 @@
 	.idle_idx		= 2,			\
 	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
-	.forkexec_idx		= 1,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index bc3a0930ed64..10b979d1de20 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -53,7 +53,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 	.idle_idx		= 2,			\
 	.newidle_idx		= 0, 			\
 	.wake_idx		= 0,			\
-	.forkexec_idx		= 1,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 7fafd1bc4149..589f12383d78 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,13 +116,11 @@ extern unsigned long node_remap_size[];
 
 # define SD_CACHE_NICE_TRIES	1
 # define SD_IDLE_IDX		1
-# define SD_FORKEXEC_IDX	0
 
 #else
 
 # define SD_CACHE_NICE_TRIES	2
 # define SD_IDLE_IDX		2
-# define SD_FORKEXEC_IDX	1
 
 #endif
 
@@ -137,7 +135,7 @@ extern unsigned long node_remap_size[];
 	.idle_idx		= SD_IDLE_IDX,				\
 	.newidle_idx		= 0,					\
 	.wake_idx		= 0,					\
-	.forkexec_idx		= SD_FORKEXEC_IDX,			\
+	.forkexec_idx		= 0,					\
 									\
 	.flags			= 1*SD_LOAD_BALANCE			\
 				| 1*SD_BALANCE_NEWIDLE			\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4298745615a5..936ab2b37683 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -121,7 +121,7 @@ int arch_update_cpu_topology(void);
 	.cache_nice_tries	= 1,					\
 	.busy_idx		= 2,					\
 	.wake_idx		= 0,					\
-	.forkexec_idx		= 1,					\
+	.forkexec_idx		= 0,					\
 									\
 	.flags			= 1*SD_LOAD_BALANCE			\
 				| 1*SD_BALANCE_NEWIDLE			\
@@ -153,7 +153,7 @@ int arch_update_cpu_topology(void);
 	.idle_idx		= 1,					\
 	.newidle_idx		= 0,					\
 	.wake_idx		= 0,					\
-	.forkexec_idx		= 1,					\
+	.forkexec_idx		= 0,					\
 									\
 	.flags			= 1*SD_LOAD_BALANCE			\
 				| 1*SD_BALANCE_NEWIDLE			\
-- 
cgit v1.2.3-71-gd317


From 47fe38fcff0517e67d395c039d2e26d2de688a60 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 2 Sep 2009 13:49:18 +0200
Subject: x86: sched: Provide arch implementations using aperf/mperf

APERF/MPERF support for cpu_power.

APERF/MPERF is arch defined to be a relative scale of work capacity
per logical cpu, this is assumed to include SMT and Turbo mode.

APERF/MPERF are specified to both reset to 0 when either counter
wraps, which is highly inconvenient, since that'll give a blimp
when that happens. The manual specifies writing 0 to the counters
after each read, but that's 1) too expensive, and 2) destroys the
possibility of sharing these counters with other users, so we live
with the blimp - the other existing user does too.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/Makefile |  2 +-
 arch/x86/kernel/cpu/sched.c  | 55 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h        |  4 ++++
 3 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/sched.c

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac155..8dd30638fe44 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o		:= $(nostackp)
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o
+obj-y			+= vmware.o hypervisor.o sched.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 000000000000..6c00a8f3cce5
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
+#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/irqflags.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct aperfmperf, old_perf);
+
+static unsigned long scale_aperfmperf(void)
+{
+	struct aperfmperf val, *old = &__get_cpu_var(old_perf);
+	unsigned long ratio, flags;
+
+	local_irq_save(flags);
+	get_aperfmperf(&val);
+	local_irq_restore(flags);
+
+	ratio = calc_aperfmperf_ratio(old, &val);
+	*old = val;
+
+	return ratio;
+}
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	/*
+	 * do aperf/mperf on the cpu level because it includes things
+	 * like turbo mode, which are relevant to full cores.
+	 */
+	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+		return scale_aperfmperf();
+
+	/*
+	 * maybe have something cpufreq here
+	 */
+
+	return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+	/*
+	 * aperf/mperf already includes the smt gain
+	 */
+	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+		return SCHED_LOAD_SCALE;
+
+	return default_scale_smt_power(sd, cpu);
+}
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c30bf3d516d1..fc4c0f9393d2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -992,6 +992,9 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag)
 	return 0;
 }
 
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
+
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
@@ -1003,6 +1006,7 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 }
 #endif	/* !CONFIG_SMP */
 
+
 struct io_context;			/* See blkdev.h */
 
 
-- 
cgit v1.2.3-71-gd317


From 0763a660a84220cc3900fd32abdd7ad109e2278d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 14 Sep 2009 19:37:39 +0200
Subject: sched: Rename select_task_rq() argument

In order to be able to rename the sync argument, we need to rename
the current flag argument.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  2 +-
 kernel/sched_fair.c     | 14 +++++++-------
 kernel/sched_idletask.c |  2 +-
 kernel/sched_rt.c       |  4 ++--
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc4c0f9393d2..5c116f03d74c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1037,7 +1037,7 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct task_struct *p, int flag, int sync);
+	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int sync);
 
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
 			struct rq *busiest, unsigned long max_load_move,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 19593568031a..b554e63c521a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,7 +1331,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -1339,7 +1339,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 	int new_cpu = cpu;
 	int want_affine = 0;
 
-	if (flag & SD_BALANCE_WAKE) {
+	if (sd_flag & SD_BALANCE_WAKE) {
 		if (sched_feat(AFFINE_WAKEUPS))
 			want_affine = 1;
 		new_cpu = prev_cpu;
@@ -1368,7 +1368,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 				break;
 		}
 
-		switch (flag) {
+		switch (sd_flag) {
 		case SD_BALANCE_WAKE:
 			if (!sched_feat(LB_WAKEUP_UPDATE))
 				break;
@@ -1392,7 +1392,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 			want_affine = 0;
 		}
 
-		if (!(tmp->flags & flag))
+		if (!(tmp->flags & sd_flag))
 			continue;
 
 		sd = tmp;
@@ -1402,12 +1402,12 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 		struct sched_group *group;
 		int weight;
 
-		if (!(sd->flags & flag)) {
+		if (!(sd->flags & sd_flag)) {
 			sd = sd->child;
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu, flag);
+		group = find_idlest_group(sd, p, cpu, sd_flag);
 		if (!group) {
 			sd = sd->child;
 			continue;
@@ -1427,7 +1427,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 		for_each_domain(cpu, tmp) {
 			if (weight <= cpumask_weight(sched_domain_span(tmp)))
 				break;
-			if (tmp->flags & flag)
+			if (tmp->flags & sd_flag)
 				sd = tmp;
 		}
 		/* while loop will break here if sd == NULL */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 99b2f0337609..9ff7697e5dc4 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int flag, int sync)
+static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 438380810ac4..97c53f3f51a7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,11 +938,11 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int flag, int sync)
+static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync)
 {
 	struct rq *rq = task_rq(p);
 
-	if (flag != SD_BALANCE_WAKE)
+	if (sd_flag != SD_BALANCE_WAKE)
 		return smp_processor_id();
 
 	/*
-- 
cgit v1.2.3-71-gd317


From 7d47872146398dbede13223299fe1cb368ebc781 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 14 Sep 2009 19:55:44 +0200
Subject: sched: Rename sync arguments

In order to extend the functions to have more than 1 flag (sync),
rename the argument to flags, and explicitly define a WF_ space for
individual flags.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  9 +++++++--
 include/linux/wait.h    |  4 ++--
 kernel/sched.c          | 30 ++++++++++++++++--------------
 kernel/sched_fair.c     |  6 ++++--
 kernel/sched_idletask.c |  4 ++--
 kernel/sched_rt.c       |  4 ++--
 6 files changed, 33 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c116f03d74c..3b07168b6f03 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1024,6 +1024,11 @@ struct uts_namespace;
 struct rq;
 struct sched_domain;
 
+/*
+ * wake flags
+ */
+#define WF_SYNC		0x01		/* waker goes to sleep after wakup */
+
 struct sched_class {
 	const struct sched_class *next;
 
@@ -1031,13 +1036,13 @@ struct sched_class {
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
 
-	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
+	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
 	struct task_struct * (*pick_next_task) (struct rq *rq);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int sync);
+	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
 
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
 			struct rq *busiest, unsigned long max_load_move,
diff --git a/include/linux/wait.h b/include/linux/wait.h
index cf3c2f5dba51..a48e16b77d5e 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -26,8 +26,8 @@
 #include <asm/current.h>
 
 typedef struct __wait_queue wait_queue_t;
-typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int sync, void *key);
-int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
+typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
+int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
 
 struct __wait_queue {
 	unsigned int flags;
diff --git a/kernel/sched.c b/kernel/sched.c
index e8e603bf8761..4da335cec8ee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -636,9 +636,10 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
+static inline
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
+	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -2318,14 +2319,15 @@ void task_oncpu_function_call(struct task_struct *p,
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+static int try_to_wake_up(struct task_struct *p, unsigned int state,
+			  int wake_flags)
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
 	struct rq *rq;
 
 	if (!sched_feat(SYNC_WAKEUPS))
-		sync = 0;
+		wake_flags &= ~WF_SYNC;
 
 	this_cpu = get_cpu();
 
@@ -2352,7 +2354,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	p->state = TASK_WAKING;
 	task_rq_unlock(rq, &flags);
 
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync);
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
 	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
 
@@ -2378,7 +2380,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 out_activate:
 #endif /* CONFIG_SMP */
 	schedstat_inc(p, se.nr_wakeups);
-	if (sync)
+	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.nr_wakeups_sync);
 	if (orig_cpu != cpu)
 		schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2407,7 +2409,7 @@ out_activate:
 
 out_running:
 	trace_sched_wakeup(rq, p, success);
-	check_preempt_curr(rq, p, sync);
+	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -5562,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+int default_wake_function(wait_queue_t *curr, unsigned mode, int flags,
 			  void *key)
 {
-	return try_to_wake_up(curr->private, mode, sync);
+	return try_to_wake_up(curr->private, mode, flags);
 }
 EXPORT_SYMBOL(default_wake_function);
 
@@ -5579,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int sync, void *key)
+			int nr_exclusive, int flags, void *key)
 {
 	wait_queue_t *curr, *next;
 
 	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
 		unsigned flags = curr->flags;
 
-		if (curr->func(curr, mode, sync, key) &&
+		if (curr->func(curr, mode, flags, key) &&
 				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
 	}
@@ -5647,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
 {
 	unsigned long flags;
-	int sync = 1;
+	int wake_flags = WF_SYNC;
 
 	if (unlikely(!q))
 		return;
 
 	if (unlikely(!nr_exclusive))
-		sync = 0;
+		wake_flags = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, sync, key);
+	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b554e63c521a..007958e3c93a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,13 +1331,14 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync)
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
 	int want_affine = 0;
+	int sync = flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		if (sched_feat(AFFINE_WAKEUPS))
@@ -1548,11 +1549,12 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	int sync = flags & WF_SYNC;
 
 	update_curr(cfs_rq);
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9ff7697e5dc4..a8b448af004b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync)
+static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync)
 /*
  * Idle tasks are unconditionally rescheduled:
  */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	resched_task(rq->idle);
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97c53f3f51a7..13de7126a6ab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,7 +938,7 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync)
+static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 {
 	struct rq *rq = task_rq(p);
 
@@ -1002,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
-- 
cgit v1.2.3-71-gd317


From a7558e01056f5191ff2ecff53b075dcb9e484188 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 14 Sep 2009 20:02:34 +0200
Subject: sched: Add WF_FORK

Avoid the cache buddies from biasing the time distribution away
from fork()ers. Normally the next buddy will be the preferred
scheduling target, but this makes fork()s prefer to run the new
child, whereas we prefer to run the parent, since that will
generate more work.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 +
 kernel/sched.c        | 2 +-
 kernel/sched_fair.c   | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b07168b6f03..ee1f88993097 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1028,6 +1028,7 @@ struct sched_domain;
  * wake flags
  */
 #define WF_SYNC		0x01		/* waker goes to sleep after wakup */
+#define WF_FORK		0x02		/* child wakeup after fork */
 
 struct sched_class {
 	const struct sched_class *next;
diff --git a/kernel/sched.c b/kernel/sched.c
index 4da335cec8ee..0d4c4fea3317 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2602,7 +2602,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		inc_nr_running(rq);
 	}
 	trace_sched_wakeup_new(rq, p, 1);
-	check_preempt_curr(rq, p, 0);
+	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 007958e3c93a..6766959c7f44 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1580,7 +1580,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags
 	 */
 	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
 		set_last_buddy(se);
-	if (sched_feat(NEXT_BUDDY))
+	if (sched_feat(NEXT_BUDDY) && !(flags & WF_FORK))
 		set_next_buddy(pse);
 
 	/*
-- 
cgit v1.2.3-71-gd317


From 2f82af08fcc7dc01a7e98a49a5995a77e32a2925 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@fluxnic.net>
Date: Mon, 14 Sep 2009 03:25:28 -0400
Subject: Nicolas Pitre has a new email address

Due to problems at cam.org, my nico@cam.org email address is no longer
valid.  FRom now on, nico@fluxnic.net should be used instead.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 CREDITS                                        | 2 +-
 Documentation/arm/SA1100/ADSBitsy              | 2 +-
 Documentation/arm/SA1100/Assabet               | 2 +-
 Documentation/arm/SA1100/Brutus                | 2 +-
 Documentation/arm/SA1100/GraphicsClient        | 4 ++--
 Documentation/arm/SA1100/GraphicsMaster        | 4 ++--
 Documentation/arm/SA1100/Victor                | 2 +-
 MAINTAINERS                                    | 4 ++--
 arch/arm/boot/compressed/head-sa1100.S         | 2 +-
 arch/arm/lib/lib1funcs.S                       | 2 +-
 arch/arm/lib/sha1.S                            | 2 +-
 arch/arm/mach-sa1100/include/mach/assabet.h    | 2 +-
 arch/arm/mach-sa1100/include/mach/hardware.h   | 2 +-
 arch/arm/mach-sa1100/include/mach/memory.h     | 2 +-
 arch/arm/mach-sa1100/include/mach/neponset.h   | 2 +-
 arch/arm/mach-sa1100/include/mach/system.h     | 2 +-
 arch/arm/mach-sa1100/include/mach/uncompress.h | 2 +-
 arch/arm/mach-sa1100/pm.c                      | 2 +-
 arch/arm/mach-sa1100/time.c                    | 2 +-
 arch/arm/mm/proc-xscale.S                      | 2 +-
 arch/arm/plat-iop/setup.c                      | 2 +-
 arch/arm/plat-omap/include/mach/system.h       | 2 +-
 drivers/input/keyboard/pxa27x_keypad.c         | 2 +-
 drivers/mtd/chips/cfi_cmdset_0001.c            | 2 +-
 drivers/mtd/chips/cfi_cmdset_0020.c            | 2 +-
 drivers/mtd/maps/bfin-async-flash.c            | 2 +-
 drivers/mtd/maps/ceiva.c                       | 2 +-
 drivers/mtd/maps/dc21285.c                     | 4 ++--
 drivers/mtd/maps/ipaq-flash.c                  | 2 +-
 drivers/mtd/maps/pxa2xx-flash.c                | 2 +-
 drivers/mtd/maps/sa1100-flash.c                | 2 +-
 drivers/mtd/mtdblock.c                         | 4 ++--
 drivers/mtd/mtdpart.c                          | 2 +-
 drivers/net/smc91x.c                           | 4 ++--
 drivers/net/smc91x.h                           | 2 +-
 drivers/rtc/rtc-sa1100.c                       | 2 +-
 drivers/video/sa1100fb.c                       | 2 +-
 include/linux/mtd/partitions.h                 | 2 +-
 lib/inflate.c                                  | 2 +-
 39 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/CREDITS b/CREDITS
index 1a41bf4addd0..72b487869788 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2800,7 +2800,7 @@ D: Starter of Linux1394 effort
 S: ask per mail for current address
 
 N: Nicolas Pitre
-E: nico@cam.org
+E: nico@fluxnic.net
 D: StrongARM SA1100 support integrator & hacker
 D: Xscale PXA architecture
 D: unified SMC 91C9x/91C11x ethernet driver (smc91x)
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy
index ab47c3833908..7197a9e958ee 100644
--- a/Documentation/arm/SA1100/ADSBitsy
+++ b/Documentation/arm/SA1100/ADSBitsy
@@ -40,4 +40,4 @@ Notes:
   mode, the timing is off so the image is corrupted.  This will be
   fixed soon.
 
-Any contribution can be sent to nico@cam.org and will be greatly welcome!
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/Assabet b/Documentation/arm/SA1100/Assabet
index 78bc1c1b04e5..91f7ce7ba426 100644
--- a/Documentation/arm/SA1100/Assabet
+++ b/Documentation/arm/SA1100/Assabet
@@ -240,7 +240,7 @@ Then, rebooting the Assabet is just a matter of waiting for the login prompt.
 
 
 Nicolas Pitre
-nico@cam.org
+nico@fluxnic.net
 June 12, 2001
 
 
diff --git a/Documentation/arm/SA1100/Brutus b/Documentation/arm/SA1100/Brutus
index 2254c8f0b326..b1cfd405dccc 100644
--- a/Documentation/arm/SA1100/Brutus
+++ b/Documentation/arm/SA1100/Brutus
@@ -60,7 +60,7 @@ little modifications.
 
 Any contribution is welcome.
 
-Please send patches to nico@cam.org
+Please send patches to nico@fluxnic.net
 
 Have Fun !
 
diff --git a/Documentation/arm/SA1100/GraphicsClient b/Documentation/arm/SA1100/GraphicsClient
index 8fa7e8027ff1..6c9c4f5a36e1 100644
--- a/Documentation/arm/SA1100/GraphicsClient
+++ b/Documentation/arm/SA1100/GraphicsClient
@@ -4,7 +4,7 @@ For more details, contact Applied Data Systems or see
 http://www.applieddata.net/products.html
 
 The original Linux support for this product has been provided by 
-Nicolas Pitre <nico@cam.org>. Continued development work by
+Nicolas Pitre <nico@fluxnic.net>. Continued development work by
 Woojung Huh <whuh@applieddata.net>
 
 It's currently possible to mount a root filesystem via NFS providing a
@@ -94,5 +94,5 @@ Notes:
   mode, the timing is off so the image is corrupted.  This will be
   fixed soon.
 
-Any contribution can be sent to nico@cam.org and will be greatly welcome!
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
 
diff --git a/Documentation/arm/SA1100/GraphicsMaster b/Documentation/arm/SA1100/GraphicsMaster
index dd28745ac521..ee7c6595f23f 100644
--- a/Documentation/arm/SA1100/GraphicsMaster
+++ b/Documentation/arm/SA1100/GraphicsMaster
@@ -4,7 +4,7 @@ For more details, contact Applied Data Systems or see
 http://www.applieddata.net/products.html
 
 The original Linux support for this product has been provided by
-Nicolas Pitre <nico@cam.org>. Continued development work by
+Nicolas Pitre <nico@fluxnic.net>. Continued development work by
 Woojung Huh <whuh@applieddata.net>
 
 Use 'make graphicsmaster_config' before any 'make config'.
@@ -50,4 +50,4 @@ Notes:
   mode, the timing is off so the image is corrupted.  This will be
   fixed soon.
 
-Any contribution can be sent to nico@cam.org and will be greatly welcome!
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/Victor b/Documentation/arm/SA1100/Victor
index 01e81fc49461..f938a29fdc20 100644
--- a/Documentation/arm/SA1100/Victor
+++ b/Documentation/arm/SA1100/Victor
@@ -9,7 +9,7 @@ Of course Victor is using Linux as its main operating system.
 The Victor implementation for Linux is maintained by Nicolas Pitre:
 
 	nico@visuaide.com
-	nico@cam.org
+	nico@fluxnic.net
 
 For any comments, please feel free to contact me through the above
 addresses.
diff --git a/MAINTAINERS b/MAINTAINERS
index 837b5985ac40..64b9e447545c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3317,7 +3317,7 @@ S:	Supported
 F:	drivers/net/wireless/mwl8k.c
 
 MARVELL SOC MMC/SD/SDIO CONTROLLER DRIVER
-M:	Nicolas Pitre <nico@cam.org>
+M:	Nicolas Pitre <nico@fluxnic.net>
 S:	Maintained
 
 MARVELL YUKON / SYSKONNECT DRIVER
@@ -4689,7 +4689,7 @@ F:	include/linux/sl?b*.h
 F:	mm/sl?b.c
 
 SMC91x ETHERNET DRIVER
-M:	Nicolas Pitre <nico@cam.org>
+M:	Nicolas Pitre <nico@fluxnic.net>
 S:	Maintained
 F:	drivers/net/smc91x.*
 
diff --git a/arch/arm/boot/compressed/head-sa1100.S b/arch/arm/boot/compressed/head-sa1100.S
index 4c8c0e46027d..6179d94dd5c6 100644
--- a/arch/arm/boot/compressed/head-sa1100.S
+++ b/arch/arm/boot/compressed/head-sa1100.S
@@ -1,7 +1,7 @@
 /* 
  * linux/arch/arm/boot/compressed/head-sa1100.S
  * 
- * Copyright (C) 1999 Nicolas Pitre <nico@cam.org>
+ * Copyright (C) 1999 Nicolas Pitre <nico@fluxnic.net>
  * 
  * SA1100 specific tweaks.  This is merged into head.S by the linker.
  *
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index 67964bcfc854..6dc06487f3c3 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines
  *
- * Author: Nicolas Pitre <nico@cam.org>
+ * Author: Nicolas Pitre <nico@fluxnic.net>
  *   - contributed to gcc-3.4 on Sep 30, 2003
  *   - adapted for the Linux kernel on Oct 2, 2003
  */
diff --git a/arch/arm/lib/sha1.S b/arch/arm/lib/sha1.S
index 09b548cac1a4..eb0edb80d7b8 100644
--- a/arch/arm/lib/sha1.S
+++ b/arch/arm/lib/sha1.S
@@ -3,7 +3,7 @@
  *
  *  SHA transform optimized for ARM
  *
- *  Copyright:	(C) 2005 by Nicolas Pitre <nico@cam.org>
+ *  Copyright:	(C) 2005 by Nicolas Pitre <nico@fluxnic.net>
  *  Created:	September 17, 2005
  *
  *  This program is free software; you can redistribute it and/or modify
diff --git a/arch/arm/mach-sa1100/include/mach/assabet.h b/arch/arm/mach-sa1100/include/mach/assabet.h
index 3959b20d5d1c..28c2cf50c259 100644
--- a/arch/arm/mach-sa1100/include/mach/assabet.h
+++ b/arch/arm/mach-sa1100/include/mach/assabet.h
@@ -1,7 +1,7 @@
 /*
  * arch/arm/mach-sa1100/include/mach/assabet.h
  *
- * Created 2000/06/05 by Nicolas Pitre <nico@cam.org>
+ * Created 2000/06/05 by Nicolas Pitre <nico@fluxnic.net>
  *
  * This file contains the hardware specific definitions for Assabet
  * Only include this file from SA1100-specific files.
diff --git a/arch/arm/mach-sa1100/include/mach/hardware.h b/arch/arm/mach-sa1100/include/mach/hardware.h
index 60711822b125..99f5856d8de4 100644
--- a/arch/arm/mach-sa1100/include/mach/hardware.h
+++ b/arch/arm/mach-sa1100/include/mach/hardware.h
@@ -1,7 +1,7 @@
 /*
  * arch/arm/mach-sa1100/include/mach/hardware.h
  *
- * Copyright (C) 1998 Nicolas Pitre <nico@cam.org>
+ * Copyright (C) 1998 Nicolas Pitre <nico@fluxnic.net>
  *
  * This file contains the hardware definitions for SA1100 architecture
  *
diff --git a/arch/arm/mach-sa1100/include/mach/memory.h b/arch/arm/mach-sa1100/include/mach/memory.h
index e9f8eed900f5..d5277f9bee77 100644
--- a/arch/arm/mach-sa1100/include/mach/memory.h
+++ b/arch/arm/mach-sa1100/include/mach/memory.h
@@ -1,7 +1,7 @@
 /*
  * arch/arm/mach-sa1100/include/mach/memory.h
  *
- * Copyright (C) 1999-2000 Nicolas Pitre <nico@cam.org>
+ * Copyright (C) 1999-2000 Nicolas Pitre <nico@fluxnic.net>
  */
 
 #ifndef __ASM_ARCH_MEMORY_H
diff --git a/arch/arm/mach-sa1100/include/mach/neponset.h b/arch/arm/mach-sa1100/include/mach/neponset.h
index d3f044f92c00..ffe2bc45eed0 100644
--- a/arch/arm/mach-sa1100/include/mach/neponset.h
+++ b/arch/arm/mach-sa1100/include/mach/neponset.h
@@ -1,7 +1,7 @@
 /*
  * arch/arm/mach-sa1100/include/mach/neponset.h
  *
- * Created 2000/06/05 by Nicolas Pitre <nico@cam.org>
+ * Created 2000/06/05 by Nicolas Pitre <nico@fluxnic.net>
  *
  * This file contains the hardware specific definitions for Assabet
  * Only include this file from SA1100-specific files.
diff --git a/arch/arm/mach-sa1100/include/mach/system.h b/arch/arm/mach-sa1100/include/mach/system.h
index 942b153e251d..ba9da9f7f183 100644
--- a/arch/arm/mach-sa1100/include/mach/system.h
+++ b/arch/arm/mach-sa1100/include/mach/system.h
@@ -1,7 +1,7 @@
 /*
  * arch/arm/mach-sa1100/include/mach/system.h
  *
- * Copyright (c) 1999 Nicolas Pitre <nico@cam.org>
+ * Copyright (c) 1999 Nicolas Pitre <nico@fluxnic.net>
  */
 #include <mach/hardware.h>
 
diff --git a/arch/arm/mach-sa1100/include/mach/uncompress.h b/arch/arm/mach-sa1100/include/mach/uncompress.h
index 714160b03d7a..6cb39ddde656 100644
--- a/arch/arm/mach-sa1100/include/mach/uncompress.h
+++ b/arch/arm/mach-sa1100/include/mach/uncompress.h
@@ -1,7 +1,7 @@
 /*
  * arch/arm/mach-sa1100/include/mach/uncompress.h
  *
- * (C) 1999 Nicolas Pitre <nico@cam.org>
+ * (C) 1999 Nicolas Pitre <nico@fluxnic.net>
  *
  * Reorganised to be machine independent.
  */
diff --git a/arch/arm/mach-sa1100/pm.c b/arch/arm/mach-sa1100/pm.c
index 111cce67ad2f..c83fdc80edfd 100644
--- a/arch/arm/mach-sa1100/pm.c
+++ b/arch/arm/mach-sa1100/pm.c
@@ -15,7 +15,7 @@
  * 			Save more value for the resume function! Support
  * 			Bitsy/Assabet/Freebird board
  *
- * 2001-08-29:	Nicolas Pitre <nico@cam.org>
+ * 2001-08-29:	Nicolas Pitre <nico@fluxnic.net>
  * 			Cleaned up, pushed platform dependent stuff
  * 			in the platform specific files.
  *
diff --git a/arch/arm/mach-sa1100/time.c b/arch/arm/mach-sa1100/time.c
index 711c0295c66f..95d92e8e56a8 100644
--- a/arch/arm/mach-sa1100/time.c
+++ b/arch/arm/mach-sa1100/time.c
@@ -4,7 +4,7 @@
  * Copyright (C) 1998 Deborah Wallach.
  * Twiddles  (C) 1999 Hugo Fiennes <hugo@empeg.com>
  *
- * 2000/03/29 (C) Nicolas Pitre <nico@cam.org>
+ * 2000/03/29 (C) Nicolas Pitre <nico@fluxnic.net>
  *	Rewritten: big cleanup, much simpler, better HZ accuracy.
  *
  */
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 0cce37b93937..423394260bcb 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -17,7 +17,7 @@
  *
  * 2001 Sep 08:
  *	Completely revisited, many important fixes
- *	Nicolas Pitre <nico@cam.org>
+ *	Nicolas Pitre <nico@fluxnic.net>
  */
 
 #include <linux/linkage.h>
diff --git a/arch/arm/plat-iop/setup.c b/arch/arm/plat-iop/setup.c
index 9e573e78176a..bade586fed0f 100644
--- a/arch/arm/plat-iop/setup.c
+++ b/arch/arm/plat-iop/setup.c
@@ -1,7 +1,7 @@
 /*
  * arch/arm/plat-iop/setup.c
  *
- * Author: Nicolas Pitre <nico@cam.org>
+ * Author: Nicolas Pitre <nico@fluxnic.net>
  * Copyright (C) 2001 MontaVista Software, Inc.
  * Copyright (C) 2004 Intel Corporation.
  *
diff --git a/arch/arm/plat-omap/include/mach/system.h b/arch/arm/plat-omap/include/mach/system.h
index 1060e345423b..ed8ec7477261 100644
--- a/arch/arm/plat-omap/include/mach/system.h
+++ b/arch/arm/plat-omap/include/mach/system.h
@@ -1,6 +1,6 @@
 /*
  * Copied from arch/arm/mach-sa1100/include/mach/system.h
- * Copyright (c) 1999 Nicolas Pitre <nico@cam.org>
+ * Copyright (c) 1999 Nicolas Pitre <nico@fluxnic.net>
  */
 #ifndef __ASM_ARCH_SYSTEM_H
 #define __ASM_ARCH_SYSTEM_H
diff --git a/drivers/input/keyboard/pxa27x_keypad.c b/drivers/input/keyboard/pxa27x_keypad.c
index 76f9668221a4..79cd3e9fdf2e 100644
--- a/drivers/input/keyboard/pxa27x_keypad.c
+++ b/drivers/input/keyboard/pxa27x_keypad.c
@@ -8,7 +8,7 @@
  *
  * Based on a previous implementations by Kevin O'Connor
  * <kevin_at_koconnor.net> and Alex Osborne <bobofdoom@gmail.com> and
- * on some suggestions by Nicolas Pitre <nico@cam.org>.
+ * on some suggestions by Nicolas Pitre <nico@fluxnic.net>.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 8664feebc93b..e7563a9872d0 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -5,7 +5,7 @@
  * (C) 2000 Red Hat. GPL'd
  *
  *
- * 10/10/2000	Nicolas Pitre <nico@cam.org>
+ * 10/10/2000	Nicolas Pitre <nico@fluxnic.net>
  * 	- completely revamped method functions so they are aware and
  * 	  independent of the flash geometry (buswidth, interleave, etc.)
  * 	- scalability vs code size is completely set at compile-time
diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c
index 6c740f346f91..0667a671525d 100644
--- a/drivers/mtd/chips/cfi_cmdset_0020.c
+++ b/drivers/mtd/chips/cfi_cmdset_0020.c
@@ -4,7 +4,7 @@
  *
  * (C) 2000 Red Hat. GPL'd
  *
- * 10/10/2000	Nicolas Pitre <nico@cam.org>
+ * 10/10/2000	Nicolas Pitre <nico@fluxnic.net>
  * 	- completely revamped method functions so they are aware and
  * 	  independent of the flash geometry (buswidth, interleave, etc.)
  * 	- scalability vs code size is completely set at compile-time
diff --git a/drivers/mtd/maps/bfin-async-flash.c b/drivers/mtd/maps/bfin-async-flash.c
index 365c77b1b871..a7c808b577d3 100644
--- a/drivers/mtd/maps/bfin-async-flash.c
+++ b/drivers/mtd/maps/bfin-async-flash.c
@@ -6,7 +6,7 @@
  * for example.  All board-specific configuration goes in your
  * board resources file.
  *
- * Copyright 2000 Nicolas Pitre <nico@cam.org>
+ * Copyright 2000 Nicolas Pitre <nico@fluxnic.net>
  * Copyright 2005-2008 Analog Devices Inc.
  *
  * Enter bugs at http://blackfin.uclinux.org/
diff --git a/drivers/mtd/maps/ceiva.c b/drivers/mtd/maps/ceiva.c
index 60e68bde0fea..d41f34766e53 100644
--- a/drivers/mtd/maps/ceiva.c
+++ b/drivers/mtd/maps/ceiva.c
@@ -9,7 +9,7 @@
  * Based on: sa1100-flash.c, which has the following copyright:
  * Flash memory access on SA11x0 based devices
  *
- * (C) 2000 Nicolas Pitre <nico@cam.org>
+ * (C) 2000 Nicolas Pitre <nico@fluxnic.net>
  *
  */
 
diff --git a/drivers/mtd/maps/dc21285.c b/drivers/mtd/maps/dc21285.c
index 42969fe051b2..b3cb3a183809 100644
--- a/drivers/mtd/maps/dc21285.c
+++ b/drivers/mtd/maps/dc21285.c
@@ -1,7 +1,7 @@
 /*
  * MTD map driver for flash on the DC21285 (the StrongARM-110 companion chip)
  *
- * (C) 2000  Nicolas Pitre <nico@cam.org>
+ * (C) 2000  Nicolas Pitre <nico@fluxnic.net>
  *
  * This code is GPL
  */
@@ -249,5 +249,5 @@ module_exit(cleanup_dc21285);
 
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Nicolas Pitre <nico@cam.org>");
+MODULE_AUTHOR("Nicolas Pitre <nico@fluxnic.net>");
 MODULE_DESCRIPTION("MTD map driver for DC21285 boards");
diff --git a/drivers/mtd/maps/ipaq-flash.c b/drivers/mtd/maps/ipaq-flash.c
index 748c85f635f1..76708e796b70 100644
--- a/drivers/mtd/maps/ipaq-flash.c
+++ b/drivers/mtd/maps/ipaq-flash.c
@@ -1,7 +1,7 @@
 /*
  * Flash memory access on iPAQ Handhelds (either SA1100 or PXA250 based)
  *
- * (C) 2000 Nicolas Pitre <nico@cam.org>
+ * (C) 2000 Nicolas Pitre <nico@fluxnic.net>
  * (C) 2002 Hewlett-Packard Company <jamey.hicks@hp.com>
  * (C) 2003 Christian Pellegrin <chri@ascensit.com>, <chri@infis.univ.ts.it>: concatenation of multiple flashes
  */
diff --git a/drivers/mtd/maps/pxa2xx-flash.c b/drivers/mtd/maps/pxa2xx-flash.c
index 643aa06b599e..74fa075c838a 100644
--- a/drivers/mtd/maps/pxa2xx-flash.c
+++ b/drivers/mtd/maps/pxa2xx-flash.c
@@ -175,5 +175,5 @@ module_init(init_pxa2xx_flash);
 module_exit(cleanup_pxa2xx_flash);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Nicolas Pitre <nico@cam.org>");
+MODULE_AUTHOR("Nicolas Pitre <nico@fluxnic.net>");
 MODULE_DESCRIPTION("MTD map driver for Intel XScale PXA2xx");
diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c
index c6210f5118d1..fdb97f3d30e9 100644
--- a/drivers/mtd/maps/sa1100-flash.c
+++ b/drivers/mtd/maps/sa1100-flash.c
@@ -1,7 +1,7 @@
 /*
  * Flash memory access on SA11x0 based devices
  *
- * (C) 2000 Nicolas Pitre <nico@cam.org>
+ * (C) 2000 Nicolas Pitre <nico@fluxnic.net>
  */
 #include <linux/module.h>
 #include <linux/types.h>
diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c
index 77db5ce24d92..2d70295a5fa3 100644
--- a/drivers/mtd/mtdblock.c
+++ b/drivers/mtd/mtdblock.c
@@ -1,7 +1,7 @@
 /*
  * Direct MTD block device access
  *
- * (C) 2000-2003 Nicolas Pitre <nico@cam.org>
+ * (C) 2000-2003 Nicolas Pitre <nico@fluxnic.net>
  * (C) 1999-2003 David Woodhouse <dwmw2@infradead.org>
  */
 
@@ -403,5 +403,5 @@ module_exit(cleanup_mtdblock);
 
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Nicolas Pitre <nico@cam.org> et al.");
+MODULE_AUTHOR("Nicolas Pitre <nico@fluxnic.net> et al.");
 MODULE_DESCRIPTION("Caching read/erase/writeback block device emulation access to MTD devices");
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 349fcbe5cc0f..742504ea96f5 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -1,7 +1,7 @@
 /*
  * Simple MTD partitioning layer
  *
- * (C) 2000 Nicolas Pitre <nico@cam.org>
+ * (C) 2000 Nicolas Pitre <nico@fluxnic.net>
  *
  * This code is GPL
  *
diff --git a/drivers/net/smc91x.c b/drivers/net/smc91x.c
index 61be6d7680f6..05c91ee6921e 100644
--- a/drivers/net/smc91x.c
+++ b/drivers/net/smc91x.c
@@ -35,7 +35,7 @@
  *
  * contributors:
  * 	Daris A Nevil <dnevil@snmc.com>
- *      Nicolas Pitre <nico@cam.org>
+ *      Nicolas Pitre <nico@fluxnic.net>
  *	Russell King <rmk@arm.linux.org.uk>
  *
  * History:
@@ -58,7 +58,7 @@
  *   22/09/04  Nicolas Pitre      big update (see commit log for details)
  */
 static const char version[] =
-	"smc91x.c: v1.1, sep 22 2004 by Nicolas Pitre <nico@cam.org>\n";
+	"smc91x.c: v1.1, sep 22 2004 by Nicolas Pitre <nico@fluxnic.net>\n";
 
 /* Debugging level */
 #ifndef SMC_DEBUG
diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index 57a159fac99f..784b631cfa3c 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -28,7 +28,7 @@
  . Authors
  .	Erik Stahlman		<erik@vt.edu>
  .	Daris A Nevil		<dnevil@snmc.com>
- .	Nicolas Pitre 		<nico@cam.org>
+ .	Nicolas Pitre 		<nico@fluxnic.net>
  .
  ---------------------------------------------------------------------------*/
 #ifndef _SMC91X_H_
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index 4f247e4dd3f9..021b2928f0b9 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -9,7 +9,7 @@
  *
  * Modifications from:
  *   CIH <cih@coventive.com>
- *   Nicolas Pitre <nico@cam.org>
+ *   Nicolas Pitre <nico@fluxnic.net>
  *   Andrew Christian <andrew.christian@hp.com>
  *
  * Converted to the RTC subsystem and Driver Model
diff --git a/drivers/video/sa1100fb.c b/drivers/video/sa1100fb.c
index 10ddad8e17d6..cdaa873a6054 100644
--- a/drivers/video/sa1100fb.c
+++ b/drivers/video/sa1100fb.c
@@ -66,7 +66,7 @@
  *	- FrameBuffer memory is now allocated at run-time when the
  *	  driver is initialized.    
  *
- * 2000/04/10: Nicolas Pitre <nico@cam.org>
+ * 2000/04/10: Nicolas Pitre <nico@fluxnic.net>
  *	- Big cleanup for dynamic selection of machine type at run time.
  *
  * 2000/07/19: Jamey Hicks <jamey@crl.dec.com>
diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index b70313d33ff8..274b6196091d 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -1,7 +1,7 @@
 /*
  * MTD partitioning layer definitions
  *
- * (C) 2000 Nicolas Pitre <nico@cam.org>
+ * (C) 2000 Nicolas Pitre <nico@fluxnic.net>
  *
  * This code is GPL
  */
diff --git a/lib/inflate.c b/lib/inflate.c
index 1a8e8a978128..d10255973a9f 100644
--- a/lib/inflate.c
+++ b/lib/inflate.c
@@ -7,7 +7,7 @@
  * Adapted for booting Linux by Hannu Savolainen 1993
  * based on gzip-1.0.3 
  *
- * Nicolas Pitre <nico@cam.org>, 1999/04/14 :
+ * Nicolas Pitre <nico@fluxnic.net>, 1999/04/14 :
  *   Little mods for all variable to reside either into rodata or bss segments
  *   by marking constant variables with 'const' and initializing all the others
  *   at run-time only.  This allows for the kernel uncompressor to run
-- 
cgit v1.2.3-71-gd317


From b4028437876866aba4747a655ede00f892089e14 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 11 May 2009 14:16:57 -0700
Subject: Driver core: move dev_get/set_drvdata to drivers/base/dd.c

No one should directly access the driver_data field, so remove the field
and make it private.  We dynamically create the private field now if it
is needed, to handle drivers that call get/set before they are
registered with the driver core.

Also update the copyright notices on these files while we are there.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/base.h    |  5 +++++
 drivers/base/core.c    | 20 ++++++++++++++------
 drivers/base/dd.c      | 31 +++++++++++++++++++++++++++++--
 include/linux/device.h | 16 ++++------------
 4 files changed, 52 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 1e52c125f437..503d59c57501 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -70,6 +70,8 @@ struct class_private {
  * @knode_parent - node in sibling list
  * @knode_driver - node in driver list
  * @knode_bus - node in bus list
+ * @driver_data - private pointer for driver specific info.  Will turn into a
+ * list soon.
  * @device - pointer back to the struct class that this structure is
  * associated with.
  *
@@ -80,6 +82,7 @@ struct device_private {
 	struct klist_node knode_parent;
 	struct klist_node knode_driver;
 	struct klist_node knode_bus;
+	void *driver_data;
 	struct device *device;
 };
 #define to_device_private_parent(obj)	\
@@ -89,6 +92,8 @@ struct device_private {
 #define to_device_private_bus(obj)	\
 	container_of(obj, struct device_private, knode_bus)
 
+extern int device_private_init(struct device *dev);
+
 /* initialisation functions */
 extern int devices_init(void);
 extern int buses_init(void);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index c34774d0b9d3..99dfe96fffcb 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -843,6 +843,17 @@ static void device_remove_sys_dev_entry(struct device *dev)
 	}
 }
 
+int device_private_init(struct device *dev)
+{
+	dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);
+	if (!dev->p)
+		return -ENOMEM;
+	dev->p->device = dev;
+	klist_init(&dev->p->klist_children, klist_children_get,
+		   klist_children_put);
+	return 0;
+}
+
 /**
  * device_add - add device to device hierarchy.
  * @dev: device.
@@ -868,14 +879,11 @@ int device_add(struct device *dev)
 	if (!dev)
 		goto done;
 
-	dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);
 	if (!dev->p) {
-		error = -ENOMEM;
-		goto done;
+		error = device_private_init(dev);
+		if (error)
+			goto done;
 	}
-	dev->p->device = dev;
-	klist_init(&dev->p->klist_children, klist_children_get,
-		   klist_children_put);
 
 	/*
 	 * for statically allocated devices, which should all be converted
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 7b34b3a48f67..979d159b5cd1 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -11,8 +11,8 @@
  *
  * Copyright (c) 2002-5 Patrick Mochel
  * Copyright (c) 2002-3 Open Source Development Labs
- * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de>
- * Copyright (c) 2007 Novell Inc.
+ * Copyright (c) 2007-2009 Greg Kroah-Hartman <gregkh@suse.de>
+ * Copyright (c) 2007-2009 Novell Inc.
  *
  * This file is released under the GPLv2
  */
@@ -391,3 +391,30 @@ void driver_detach(struct device_driver *drv)
 		put_device(dev);
 	}
 }
+
+/*
+ * These exports can't be _GPL due to .h files using this within them, and it
+ * might break something that was previously working...
+ */
+void *dev_get_drvdata(const struct device *dev)
+{
+	if (dev && dev->p)
+		return dev->p->driver_data;
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_drvdata);
+
+void dev_set_drvdata(struct device *dev, void *data)
+{
+	int error;
+
+	if (!dev)
+		return;
+	if (!dev->p) {
+		error = device_private_init(dev);
+		if (error)
+			return;
+	}
+	dev->p->driver_data = data;
+}
+EXPORT_SYMBOL(dev_set_drvdata);
diff --git a/include/linux/device.h b/include/linux/device.h
index a28642975053..c0bd23048be0 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -2,7 +2,8 @@
  * device.h - generic, centralized driver model
  *
  * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
- * Copyright (c) 2004-2007 Greg Kroah-Hartman <gregkh@suse.de>
+ * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
+ * Copyright (c) 2008-2009 Novell Inc.
  *
  * This file is released under the GPLv2
  *
@@ -381,7 +382,6 @@ struct device {
 	struct bus_type	*bus;		/* type of bus device is on */
 	struct device_driver *driver;	/* which driver has allocated this
 					   device */
-	void		*driver_data;	/* data private to the driver */
 	void		*platform_data;	/* Platform specific data, device
 					   core doesn't touch it */
 	struct dev_pm_info	power;
@@ -447,16 +447,6 @@ static inline void set_dev_node(struct device *dev, int node)
 }
 #endif
 
-static inline void *dev_get_drvdata(const struct device *dev)
-{
-	return dev->driver_data;
-}
-
-static inline void dev_set_drvdata(struct device *dev, void *data)
-{
-	dev->driver_data = data;
-}
-
 static inline unsigned int dev_get_uevent_suppress(const struct device *dev)
 {
 	return dev->kobj.uevent_suppress;
@@ -490,6 +480,8 @@ extern int device_rename(struct device *dev, char *new_name);
 extern int device_move(struct device *dev, struct device *new_parent,
 		       enum dpm_order dpm_order);
 extern const char *device_get_nodename(struct device *dev, const char **tmp);
+extern void *dev_get_drvdata(const struct device *dev);
+extern void dev_set_drvdata(struct device *dev, void *data);
 
 /*
  * Root device objects for grouping under /sys/devices
-- 
cgit v1.2.3-71-gd317


From a5b8b1ada6dd76503884f5492b995cd29eefae0f Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Fri, 17 Jul 2009 15:06:08 +0100
Subject: Driver core: Add accessor for device platform data

For consistency with driver data provide a dev_get_platdata() accessor
for reading the platform data from a device.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/device.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index c0bd23048be0..3f33f17f556c 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -494,6 +494,11 @@ static inline struct device *root_device_register(const char *name)
 }
 extern void root_device_unregister(struct device *root);
 
+static inline void *dev_get_platdata(const struct device *dev)
+{
+	return dev->platform_data;
+}
+
 /*
  * Manual binding of a device to driver. See drivers/base/bus.c
  * for information on use.
-- 
cgit v1.2.3-71-gd317


From a4dbd6740df0872cdf0a86841f75beec8381964d Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Wed, 24 Jun 2009 10:06:31 -0700
Subject: driver model: constify attribute groups

Let attribute group vectors be declared "const".  We'd
like to let most attribute metadata live in read-only
sections... this is a start.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c                              | 2 +-
 drivers/base/core.c                        | 4 ++--
 drivers/base/driver.c                      | 4 ++--
 drivers/block/cciss.c                      | 2 +-
 drivers/firewire/core-device.c             | 2 +-
 drivers/firmware/dmi-id.c                  | 2 +-
 drivers/infiniband/hw/ehca/ehca_main.c     | 2 +-
 drivers/infiniband/hw/ipath/ipath_kernel.h | 2 +-
 drivers/infiniband/hw/ipath/ipath_sysfs.c  | 2 +-
 drivers/input/input.c                      | 2 +-
 drivers/misc/enclosure.c                   | 4 ++--
 drivers/mmc/core/mmc.c                     | 2 +-
 drivers/mmc/core/sd.c                      | 2 +-
 drivers/mtd/mtdcore.c                      | 2 +-
 drivers/s390/cio/css.c                     | 2 +-
 drivers/s390/cio/device.c                  | 2 +-
 drivers/s390/net/netiucv.c                 | 2 +-
 drivers/scsi/scsi_priv.h                   | 2 +-
 drivers/scsi/scsi_sysfs.c                  | 4 ++--
 drivers/usb/core/endpoint.c                | 2 +-
 drivers/usb/core/sysfs.c                   | 4 ++--
 drivers/usb/core/usb.h                     | 4 ++--
 drivers/uwb/lc-dev.c                       | 2 +-
 fs/partitions/check.c                      | 2 +-
 include/linux/attribute_container.h        | 2 +-
 include/linux/device.h                     | 6 +++---
 include/linux/netdevice.h                  | 2 +-
 include/linux/transport_class.h            | 2 +-
 net/bluetooth/hci_sysfs.c                  | 4 ++--
 net/core/net-sysfs.c                       | 2 +-
 30 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 5b76bf55d05c..2ad91ddad8e2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -903,7 +903,7 @@ static struct attribute_group disk_attr_group = {
 	.attrs = disk_attrs,
 };
 
-static struct attribute_group *disk_attr_groups[] = {
+static const struct attribute_group *disk_attr_groups[] = {
 	&disk_attr_group,
 	NULL
 };
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 99dfe96fffcb..a992985d1fab 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -341,7 +341,7 @@ static void device_remove_attributes(struct device *dev,
 }
 
 static int device_add_groups(struct device *dev,
-			     struct attribute_group **groups)
+			     const struct attribute_group **groups)
 {
 	int error = 0;
 	int i;
@@ -361,7 +361,7 @@ static int device_add_groups(struct device *dev,
 }
 
 static void device_remove_groups(struct device *dev,
-				 struct attribute_group **groups)
+				 const struct attribute_group **groups)
 {
 	int i;
 
diff --git a/drivers/base/driver.c b/drivers/base/driver.c
index 8ae0f63602e0..ed2ebd3c287d 100644
--- a/drivers/base/driver.c
+++ b/drivers/base/driver.c
@@ -181,7 +181,7 @@ void put_driver(struct device_driver *drv)
 EXPORT_SYMBOL_GPL(put_driver);
 
 static int driver_add_groups(struct device_driver *drv,
-			     struct attribute_group **groups)
+			     const struct attribute_group **groups)
 {
 	int error = 0;
 	int i;
@@ -201,7 +201,7 @@ static int driver_add_groups(struct device_driver *drv,
 }
 
 static void driver_remove_groups(struct device_driver *drv,
-				 struct attribute_group **groups)
+				 const struct attribute_group **groups)
 {
 	int i;
 
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 0589dfbbd7db..d8372b432826 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -572,7 +572,7 @@ static struct attribute_group cciss_dev_attr_group = {
 	.attrs = cciss_dev_attrs,
 };
 
-static struct attribute_group *cciss_dev_attr_groups[] = {
+static const struct attribute_group *cciss_dev_attr_groups[] = {
 	&cciss_dev_attr_group,
 	NULL
 };
diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 97e656af2d22..9d0dfcbe2c1c 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -312,7 +312,7 @@ static void init_fw_attribute_group(struct device *dev,
 	group->groups[0] = &group->group;
 	group->groups[1] = NULL;
 	group->group.attrs = group->attrs;
-	dev->groups = group->groups;
+	dev->groups = (const struct attribute_group **) group->groups;
 }
 
 static ssize_t modalias_show(struct device *dev,
diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c
index 5a76d056b9d0..dbdf6fadfc79 100644
--- a/drivers/firmware/dmi-id.c
+++ b/drivers/firmware/dmi-id.c
@@ -139,7 +139,7 @@ static struct attribute_group sys_dmi_attribute_group = {
 	.attrs = sys_dmi_attributes,
 };
 
-static struct attribute_group* sys_dmi_attribute_groups[] = {
+static const struct attribute_group* sys_dmi_attribute_groups[] = {
 	&sys_dmi_attribute_group,
 	NULL
 };
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 5b635aa5947e..fb2d83c5bf01 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -623,7 +623,7 @@ static struct attribute_group ehca_drv_attr_grp = {
 	.attrs = ehca_drv_attrs
 };
 
-static struct attribute_group *ehca_drv_attr_groups[] = {
+static const struct attribute_group *ehca_drv_attr_groups[] = {
 	&ehca_drv_attr_grp,
 	NULL,
 };
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
index 6ba4861dd6ac..b3d7efcdf021 100644
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -1286,7 +1286,7 @@ struct device_driver;
 
 extern const char ib_ipath_version[];
 
-extern struct attribute_group *ipath_driver_attr_groups[];
+extern const struct attribute_group *ipath_driver_attr_groups[];
 
 int ipath_device_create_group(struct device *, struct ipath_devdata *);
 void ipath_device_remove_group(struct device *, struct ipath_devdata *);
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
index a6c8efbdc0c9..b8cb2f145ae4 100644
--- a/drivers/infiniband/hw/ipath/ipath_sysfs.c
+++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c
@@ -1069,7 +1069,7 @@ static ssize_t show_tempsense(struct device *dev,
 	return ret;
 }
 
-struct attribute_group *ipath_driver_attr_groups[] = {
+const struct attribute_group *ipath_driver_attr_groups[] = {
 	&driver_attr_group,
 	NULL,
 };
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 7c237e6ac711..851791d955f3 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1144,7 +1144,7 @@ static struct attribute_group input_dev_caps_attr_group = {
 	.attrs	= input_dev_caps_attrs,
 };
 
-static struct attribute_group *input_dev_attr_groups[] = {
+static const struct attribute_group *input_dev_attr_groups[] = {
 	&input_dev_attr_group,
 	&input_dev_id_attr_group,
 	&input_dev_caps_attr_group,
diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c
index 7b039306037f..e9eae4a78402 100644
--- a/drivers/misc/enclosure.c
+++ b/drivers/misc/enclosure.c
@@ -238,7 +238,7 @@ static void enclosure_component_release(struct device *dev)
 	put_device(dev->parent);
 }
 
-static struct attribute_group *enclosure_groups[];
+static const struct attribute_group *enclosure_groups[];
 
 /**
  * enclosure_component_register - add a particular component to an enclosure
@@ -536,7 +536,7 @@ static struct attribute_group enclosure_group = {
 	.attrs = enclosure_component_attrs,
 };
 
-static struct attribute_group *enclosure_groups[] = {
+static const struct attribute_group *enclosure_groups[] = {
 	&enclosure_group,
 	NULL
 };
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 06084dbf1277..2fb9d5f271ea 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -276,7 +276,7 @@ static struct attribute_group mmc_std_attr_group = {
 	.attrs = mmc_std_attrs,
 };
 
-static struct attribute_group *mmc_attr_groups[] = {
+static const struct attribute_group *mmc_attr_groups[] = {
 	&mmc_std_attr_group,
 	NULL,
 };
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index cd81c395e164..7ad646fe077e 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -314,7 +314,7 @@ static struct attribute_group sd_std_attr_group = {
 	.attrs = sd_std_attrs,
 };
 
-static struct attribute_group *sd_attr_groups[] = {
+static const struct attribute_group *sd_attr_groups[] = {
 	&sd_std_attr_group,
 	NULL,
 };
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 00ebf7af7467..69007a6eff50 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -217,7 +217,7 @@ struct attribute_group mtd_group = {
 	.attrs		= mtd_attrs,
 };
 
-struct attribute_group *mtd_groups[] = {
+const struct attribute_group *mtd_groups[] = {
 	&mtd_group,
 	NULL,
 };
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index e995123fd805..393c73c47f87 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -266,7 +266,7 @@ static struct attribute_group subch_attr_group = {
 	.attrs = subch_attrs,
 };
 
-static struct attribute_group *default_subch_attr_groups[] = {
+static const struct attribute_group *default_subch_attr_groups[] = {
 	&subch_attr_group,
 	NULL,
 };
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 0f95405c2c5e..6527f3f34493 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -656,7 +656,7 @@ static struct attribute_group ccwdev_attr_group = {
 	.attrs = ccwdev_attrs,
 };
 
-static struct attribute_group *ccwdev_attr_groups[] = {
+static const struct attribute_group *ccwdev_attr_groups[] = {
 	&ccwdev_attr_group,
 	NULL,
 };
diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c
index 9215fbbccc08..a4b2c576144b 100644
--- a/drivers/s390/net/netiucv.c
+++ b/drivers/s390/net/netiucv.c
@@ -2159,7 +2159,7 @@ static struct attribute_group netiucv_drv_attr_group = {
 	.attrs = netiucv_drv_attrs,
 };
 
-static struct attribute_group *netiucv_drv_attr_groups[] = {
+static const struct attribute_group *netiucv_drv_attr_groups[] = {
 	&netiucv_drv_attr_group,
 	NULL,
 };
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 021e503c8c44..1fbf7c78bba0 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -132,7 +132,7 @@ extern struct scsi_transport_template blank_transport_template;
 extern void __scsi_remove_device(struct scsi_device *);
 
 extern struct bus_type scsi_bus_type;
-extern struct attribute_group *scsi_sysfs_shost_attr_groups[];
+extern const struct attribute_group *scsi_sysfs_shost_attr_groups[];
 
 /* scsi_netlink.c */
 #ifdef CONFIG_SCSI_NETLINK
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 91482f2dcc50..fde54537d715 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -275,7 +275,7 @@ struct attribute_group scsi_shost_attr_group = {
 	.attrs =	scsi_sysfs_shost_attrs,
 };
 
-struct attribute_group *scsi_sysfs_shost_attr_groups[] = {
+const struct attribute_group *scsi_sysfs_shost_attr_groups[] = {
 	&scsi_shost_attr_group,
 	NULL
 };
@@ -745,7 +745,7 @@ static struct attribute_group scsi_sdev_attr_group = {
 	.attrs =	scsi_sdev_attrs,
 };
 
-static struct attribute_group *scsi_sdev_attr_groups[] = {
+static const struct attribute_group *scsi_sdev_attr_groups[] = {
 	&scsi_sdev_attr_group,
 	NULL
 };
diff --git a/drivers/usb/core/endpoint.c b/drivers/usb/core/endpoint.c
index bc39fc40bbde..fdfaa7885515 100644
--- a/drivers/usb/core/endpoint.c
+++ b/drivers/usb/core/endpoint.c
@@ -154,7 +154,7 @@ static struct attribute *ep_dev_attrs[] = {
 static struct attribute_group ep_dev_attr_grp = {
 	.attrs = ep_dev_attrs,
 };
-static struct attribute_group *ep_dev_groups[] = {
+static const struct attribute_group *ep_dev_groups[] = {
 	&ep_dev_attr_grp,
 	NULL
 };
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index b5c72e458943..7ec3041ae79e 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -573,7 +573,7 @@ static struct attribute_group dev_string_attr_grp = {
 	.is_visible =	dev_string_attrs_are_visible,
 };
 
-struct attribute_group *usb_device_groups[] = {
+const struct attribute_group *usb_device_groups[] = {
 	&dev_attr_grp,
 	&dev_string_attr_grp,
 	NULL
@@ -799,7 +799,7 @@ static struct attribute_group intf_assoc_attr_grp = {
 	.is_visible =	intf_assoc_attrs_are_visible,
 };
 
-struct attribute_group *usb_interface_groups[] = {
+const struct attribute_group *usb_interface_groups[] = {
 	&intf_attr_grp,
 	&intf_assoc_attr_grp,
 	NULL
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index e2a8cfaade1d..c0e0ae2bb8e7 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -152,8 +152,8 @@ static inline int is_active(const struct usb_interface *f)
 extern const char *usbcore_name;
 
 /* sysfs stuff */
-extern struct attribute_group *usb_device_groups[];
-extern struct attribute_group *usb_interface_groups[];
+extern const struct attribute_group *usb_device_groups[];
+extern const struct attribute_group *usb_interface_groups[];
 
 /* usbfs stuff */
 extern struct mutex usbfs_mutex;
diff --git a/drivers/uwb/lc-dev.c b/drivers/uwb/lc-dev.c
index e9fe1bb7eb23..1097e81b56d0 100644
--- a/drivers/uwb/lc-dev.c
+++ b/drivers/uwb/lc-dev.c
@@ -255,7 +255,7 @@ static struct attribute_group dev_attr_group = {
 	.attrs = dev_attrs,
 };
 
-static struct attribute_group *groups[] = {
+static const struct attribute_group *groups[] = {
 	&dev_attr_group,
 	NULL,
 };
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 619ba99dfe39..fbeaddf595d3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute_group part_attr_group = {
 	.attrs = part_attrs,
 };
 
-static struct attribute_group *part_attr_groups[] = {
+static const struct attribute_group *part_attr_groups[] = {
 	&part_attr_group,
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	&blk_trace_attr_group,
diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h
index 794ad74b1d61..c3ab81428c66 100644
--- a/include/linux/attribute_container.h
+++ b/include/linux/attribute_container.h
@@ -17,7 +17,7 @@ struct attribute_container {
 	struct list_head	node;
 	struct klist		containers;
 	struct class		*class;
-	struct attribute_group	*grp;
+	const struct attribute_group *grp;
 	struct device_attribute **attrs;
 	int (*match)(struct attribute_container *, struct device *);
 #define	ATTRIBUTE_CONTAINER_NO_CLASSDEVS	0x01
diff --git a/include/linux/device.h b/include/linux/device.h
index 3f33f17f556c..e19e40a3dcbe 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -131,7 +131,7 @@ struct device_driver {
 	void (*shutdown) (struct device *dev);
 	int (*suspend) (struct device *dev, pm_message_t state);
 	int (*resume) (struct device *dev);
-	struct attribute_group **groups;
+	const struct attribute_group **groups;
 
 	const struct dev_pm_ops *pm;
 
@@ -288,7 +288,7 @@ extern void class_destroy(struct class *cls);
  */
 struct device_type {
 	const char *name;
-	struct attribute_group **groups;
+	const struct attribute_group **groups;
 	int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
 	char *(*nodename)(struct device *dev);
 	void (*release)(struct device *dev);
@@ -412,7 +412,7 @@ struct device {
 
 	struct klist_node	knode_class;
 	struct class		*class;
-	struct attribute_group	**groups;	/* optional groups */
+	const struct attribute_group **groups;	/* optional groups */
 
 	void	(*release)(struct device *dev);
 };
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 65ee1929b2b1..a9aa4b5917d7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -895,7 +895,7 @@ struct net_device
 	/* class/net/name entry */
 	struct device		dev;
 	/* space for optional statistics and wireless sysfs groups */
-	struct attribute_group  *sysfs_groups[3];
+	const struct attribute_group *sysfs_groups[3];
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
diff --git a/include/linux/transport_class.h b/include/linux/transport_class.h
index eaec1ea9558e..9ae8da3e6407 100644
--- a/include/linux/transport_class.h
+++ b/include/linux/transport_class.h
@@ -55,7 +55,7 @@ struct anon_transport_class cls = {				\
 
 struct transport_container {
 	struct attribute_container ac;
-	struct attribute_group *statistics;
+	const struct attribute_group *statistics;
 };
 
 #define attribute_container_to_transport_container(x) \
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 95f7a7a544b4..7f939ce29801 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -68,7 +68,7 @@ static struct attribute_group bt_link_group = {
 	.attrs = bt_link_attrs,
 };
 
-static struct attribute_group *bt_link_groups[] = {
+static const struct attribute_group *bt_link_groups[] = {
 	&bt_link_group,
 	NULL
 };
@@ -392,7 +392,7 @@ static struct attribute_group bt_host_group = {
 	.attrs = bt_host_attrs,
 };
 
-static struct attribute_group *bt_host_groups[] = {
+static const struct attribute_group *bt_host_groups[] = {
 	&bt_host_group,
 	NULL
 };
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ad91e9e5f475..7d4c57523b09 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -493,7 +493,7 @@ void netdev_unregister_kobject(struct net_device * net)
 int netdev_register_kobject(struct net_device *net)
 {
 	struct device *dev = &(net->dev);
-	struct attribute_group **groups = net->sysfs_groups;
+	const struct attribute_group **groups = net->sysfs_groups;
 
 	dev->class = &net_class;
 	dev->platform_data = net;
-- 
cgit v1.2.3-71-gd317


From ccb86a6907c9ba7b5be5f521362fc308e80bed34 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 20 Jul 2009 10:29:34 +0300
Subject: uio: add generic driver for PCI 2.3 devices

This adds a generic uio driver that can bind to any PCI device.  First
user will be virtualization where a qemu userspace process needs to give
guest OS access to the device.

Interrupts are handled using the Interrupt Disable bit in the PCI
command register and Interrupt Status bit in the PCI status register.
All devices compliant to PCI 2.3 (circa 2002) and all compliant PCI
Express devices should support these bits.  Driver detects this support,
and won't bind to devices which do not support the Interrupt Disable Bit
in the command register.

It's expected that more features of interest to virtualization will be
added to this driver in the future. Possibilities are: mmap for device
resources, MSI/MSI-X, eventfd (to interface with kvm), iommu.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Hans J. Koch <hjk@linutronix.de>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/DocBook/uio-howto.tmpl | 163 +++++++++++++++++++++++++++
 MAINTAINERS                          |   7 ++
 drivers/uio/Kconfig                  |  10 ++
 drivers/uio/Makefile                 |   1 +
 drivers/uio/uio_pci_generic.c        | 207 +++++++++++++++++++++++++++++++++++
 include/linux/pci_regs.h             |   1 +
 6 files changed, 389 insertions(+)
 create mode 100644 drivers/uio/uio_pci_generic.c

(limited to 'include/linux')

diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl
index 8f6e3b2403c7..4d4ce0e61e42 100644
--- a/Documentation/DocBook/uio-howto.tmpl
+++ b/Documentation/DocBook/uio-howto.tmpl
@@ -25,6 +25,10 @@
 	<year>2006-2008</year>
 	<holder>Hans-Jürgen Koch.</holder>
 </copyright>
+<copyright>
+	<year>2009</year>
+	<holder>Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)</holder>
+</copyright>
 
 <legalnotice>
 <para>
@@ -41,6 +45,13 @@ GPL version 2.
 </abstract>
 
 <revhistory>
+	<revision>
+	<revnumber>0.9</revnumber>
+	<date>2009-07-16</date>
+	<authorinitials>mst</authorinitials>
+	<revremark>Added generic pci driver
+		</revremark>
+	</revision>
 	<revision>
 	<revnumber>0.8</revnumber>
 	<date>2008-12-24</date>
@@ -809,6 +820,158 @@ framework to set up sysfs files for this region. Simply leave it alone.
 
 </chapter>
 
+<chapter id="uio_pci_generic" xreflabel="Using Generic driver for PCI cards">
+<?dbhtml filename="uio_pci_generic.html"?>
+<title>Generic PCI UIO driver</title>
+	<para>
+	The generic driver is a kernel module named uio_pci_generic.
+	It can work with any device compliant to PCI 2.3 (circa 2002) and
+	any compliant PCI Express device. Using this, you only need to
+        write the userspace driver, removing the need to write
+        a hardware-specific kernel module.
+	</para>
+
+<sect1 id="uio_pci_generic_binding">
+<title>Making the driver recognize the device</title>
+	<para>
+Since the driver does not declare any device ids, it will not get loaded
+automatically and will not automatically bind to any devices, you must load it
+and allocate id to the driver yourself. For example:
+	<programlisting>
+ modprobe uio_pci_generic
+ echo &quot;8086 10f5&quot; &gt; /sys/bus/pci/drivers/uio_pci_generic/new_id
+	</programlisting>
+	</para>
+	<para>
+If there already is a hardware specific kernel driver for your device, the
+generic driver still won't bind to it, in this case if you want to use the
+generic driver (why would you?) you'll have to manually unbind the hardware
+specific driver and bind the generic driver, like this:
+	<programlisting>
+    echo -n 0000:00:19.0 &gt; /sys/bus/pci/drivers/e1000e/unbind
+    echo -n 0000:00:19.0 &gt; /sys/bus/pci/drivers/uio_pci_generic/bind
+	</programlisting>
+	</para>
+	<para>
+You can verify that the device has been bound to the driver
+by looking for it in sysfs, for example like the following:
+	<programlisting>
+    ls -l /sys/bus/pci/devices/0000:00:19.0/driver
+	</programlisting>
+Which if successful should print
+	<programlisting>
+  .../0000:00:19.0/driver -&gt; ../../../bus/pci/drivers/uio_pci_generic
+	</programlisting>
+Note that the generic driver will not bind to old PCI 2.2 devices.
+If binding the device failed, run the following command:
+	<programlisting>
+  dmesg
+	</programlisting>
+and look in the output for failure reasons
+	</para>
+</sect1>
+
+<sect1 id="uio_pci_generic_internals">
+<title>Things to know about uio_pci_generic</title>
+	<para>
+Interrupts are handled using the Interrupt Disable bit in the PCI command
+register and Interrupt Status bit in the PCI status register.  All devices
+compliant to PCI 2.3 (circa 2002) and all compliant PCI Express devices should
+support these bits.  uio_pci_generic detects this support, and won't bind to
+devices which do not support the Interrupt Disable Bit in the command register.
+	</para>
+	<para>
+On each interrupt, uio_pci_generic sets the Interrupt Disable bit.
+This prevents the device from generating further interrupts
+until the bit is cleared. The userspace driver should clear this
+bit before blocking and waiting for more interrupts.
+	</para>
+</sect1>
+<sect1 id="uio_pci_generic_userspace">
+<title>Writing userspace driver using uio_pci_generic</title>
+	<para>
+Userspace driver can use pci sysfs interface, or the
+libpci libray that wraps it, to talk to the device and to
+re-enable interrupts by writing to the command register.
+	</para>
+</sect1>
+<sect1 id="uio_pci_generic_example">
+<title>Example code using uio_pci_generic</title>
+	<para>
+Here is some sample userspace driver code using uio_pci_generic:
+<programlisting>
+#include &lt;stdlib.h&gt;
+#include &lt;stdio.h&gt;
+#include &lt;unistd.h&gt;
+#include &lt;sys/types.h&gt;
+#include &lt;sys/stat.h&gt;
+#include &lt;fcntl.h&gt;
+#include &lt;errno.h&gt;
+
+int main()
+{
+	int uiofd;
+	int configfd;
+	int err;
+	int i;
+	unsigned icount;
+	unsigned char command_high;
+
+	uiofd = open(&quot;/dev/uio0&quot;, O_RDONLY);
+	if (uiofd &lt; 0) {
+		perror(&quot;uio open:&quot;);
+		return errno;
+	}
+	configfd = open(&quot;/sys/class/uio/uio0/device/config&quot;, O_RDWR);
+	if (uiofd &lt; 0) {
+		perror(&quot;config open:&quot;);
+		return errno;
+	}
+
+	/* Read and cache command value */
+	err = pread(configfd, &amp;command_high, 1, 5);
+	if (err != 1) {
+		perror(&quot;command config read:&quot;);
+		return errno;
+	}
+	command_high &amp;= ~0x4;
+
+	for(i = 0;; ++i) {
+		/* Print out a message, for debugging. */
+		if (i == 0)
+			fprintf(stderr, &quot;Started uio test driver.\n&quot;);
+		else
+			fprintf(stderr, &quot;Interrupts: %d\n&quot;, icount);
+
+		/****************************************/
+		/* Here we got an interrupt from the
+		   device. Do something to it. */
+		/****************************************/
+
+		/* Re-enable interrupts. */
+		err = pwrite(configfd, &amp;command_high, 1, 5);
+		if (err != 1) {
+			perror(&quot;config write:&quot;);
+			break;
+		}
+
+		/* Wait for next interrupt. */
+		err = read(uiofd, &amp;icount, 4);
+		if (err != 4) {
+			perror(&quot;uio read:&quot;);
+			break;
+		}
+
+	}
+	return errno;
+}
+
+</programlisting>
+	</para>
+</sect1>
+
+</chapter>
+
 <appendix id="app1">
 <title>Further information</title>
 <itemizedlist>
diff --git a/MAINTAINERS b/MAINTAINERS
index 837b5985ac40..01193a4fe30e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2218,6 +2218,13 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git
 S:	Maintained
 F:	include/asm-generic
 
+GENERIC UIO DRIVER FOR PCI DEVICES
+M:	Michael S. Tsirkin <mst@redhat.com>
+L:	kvm@vger.kernel.org
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+F:	drivers/uio/uio_pci_generic.c
+
 GFS2 FILE SYSTEM
 M:	Steven Whitehouse <swhiteho@redhat.com>
 L:	cluster-devel@redhat.com
diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index 45200fd68534..8aa1955f35ed 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -84,4 +84,14 @@ config UIO_SERCOS3
 
 	  If you compile this as a module, it will be called uio_sercos3.
 
+config UIO_PCI_GENERIC
+	tristate "Generic driver for PCI 2.3 and PCI Express cards"
+	depends on PCI
+	default n
+	help
+	  Generic driver that you can bind, dynamically, to any
+	  PCI 2.3 compliant and PCI Express card. It is useful,
+	  primarily, for virtualization scenarios.
+	  If you compile this as a module, it will be called uio_pci_generic.
+
 endif
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index 5c2586d75797..73b2e7516729 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_UIO_PDRV_GENIRQ)	+= uio_pdrv_genirq.o
 obj-$(CONFIG_UIO_SMX)	+= uio_smx.o
 obj-$(CONFIG_UIO_AEC)	+= uio_aec.o
 obj-$(CONFIG_UIO_SERCOS3)	+= uio_sercos3.o
+obj-$(CONFIG_UIO_PCI_GENERIC)	+= uio_pci_generic.o
diff --git a/drivers/uio/uio_pci_generic.c b/drivers/uio/uio_pci_generic.c
new file mode 100644
index 000000000000..313da35984af
--- /dev/null
+++ b/drivers/uio/uio_pci_generic.c
@@ -0,0 +1,207 @@
+/* uio_pci_generic - generic UIO driver for PCI 2.3 devices
+ *
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Since the driver does not declare any device ids, you must allocate
+ * id and bind the device to the driver yourself.  For example:
+ *
+ * # echo "8086 10f5" > /sys/bus/pci/drivers/uio_pci_generic/new_id
+ * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/e1000e/unbind
+ * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/uio_pci_generic/bind
+ * # ls -l /sys/bus/pci/devices/0000:00:19.0/driver
+ * .../0000:00:19.0/driver -> ../../../bus/pci/drivers/uio_pci_generic
+ *
+ * Driver won't bind to devices which do not support the Interrupt Disable Bit
+ * in the command register. All devices compliant to PCI 2.3 (circa 2002) and
+ * all compliant PCI Express devices should support this bit.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/uio_driver.h>
+#include <linux/spinlock.h>
+
+#define DRIVER_VERSION	"0.01.0"
+#define DRIVER_AUTHOR	"Michael S. Tsirkin <mst@redhat.com>"
+#define DRIVER_DESC	"Generic UIO driver for PCI 2.3 devices"
+
+struct uio_pci_generic_dev {
+	struct uio_info info;
+	struct pci_dev *pdev;
+	spinlock_t lock; /* guards command register accesses */
+};
+
+static inline struct uio_pci_generic_dev *
+to_uio_pci_generic_dev(struct uio_info *info)
+{
+	return container_of(info, struct uio_pci_generic_dev, info);
+}
+
+/* Interrupt handler. Read/modify/write the command register to disable
+ * the interrupt. */
+static irqreturn_t irqhandler(int irq, struct uio_info *info)
+{
+	struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info);
+	struct pci_dev *pdev = gdev->pdev;
+	irqreturn_t ret = IRQ_NONE;
+	u32 cmd_status_dword;
+	u16 origcmd, newcmd, status;
+
+	/* We do a single dword read to retrieve both command and status.
+	 * Document assumptions that make this possible. */
+	BUILD_BUG_ON(PCI_COMMAND % 4);
+	BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
+
+	spin_lock_irq(&gdev->lock);
+	pci_block_user_cfg_access(pdev);
+
+	/* Read both command and status registers in a single 32-bit operation.
+	 * Note: we could cache the value for command and move the status read
+	 * out of the lock if there was a way to get notified of user changes
+	 * to command register through sysfs. Should be good for shared irqs. */
+	pci_read_config_dword(pdev, PCI_COMMAND, &cmd_status_dword);
+	origcmd = cmd_status_dword;
+	status = cmd_status_dword >> 16;
+
+	/* Check interrupt status register to see whether our device
+	 * triggered the interrupt. */
+	if (!(status & PCI_STATUS_INTERRUPT))
+		goto done;
+
+	/* We triggered the interrupt, disable it. */
+	newcmd = origcmd | PCI_COMMAND_INTX_DISABLE;
+	if (newcmd != origcmd)
+		pci_write_config_word(pdev, PCI_COMMAND, newcmd);
+
+	/* UIO core will signal the user process. */
+	ret = IRQ_HANDLED;
+done:
+
+	pci_unblock_user_cfg_access(pdev);
+	spin_unlock_irq(&gdev->lock);
+	return ret;
+}
+
+/* Verify that the device supports Interrupt Disable bit in command register,
+ * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
+ * in PCI 2.2. */
+static int __devinit verify_pci_2_3(struct pci_dev *pdev)
+{
+	u16 orig, new;
+	int err = 0;
+
+	pci_block_user_cfg_access(pdev);
+	pci_read_config_word(pdev, PCI_COMMAND, &orig);
+	pci_write_config_word(pdev, PCI_COMMAND,
+			      orig ^ PCI_COMMAND_INTX_DISABLE);
+	pci_read_config_word(pdev, PCI_COMMAND, &new);
+	/* There's no way to protect against
+	 * hardware bugs or detect them reliably, but as long as we know
+	 * what the value should be, let's go ahead and check it. */
+	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
+		err = -EBUSY;
+		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
+			"driver or HW bug?\n", orig, new);
+		goto err;
+	}
+	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
+		dev_warn(&pdev->dev, "Device does not support "
+			 "disabling interrupts: unable to bind.\n");
+		err = -ENODEV;
+		goto err;
+	}
+	/* Now restore the original value. */
+	pci_write_config_word(pdev, PCI_COMMAND, orig);
+err:
+	pci_unblock_user_cfg_access(pdev);
+	return err;
+}
+
+static int __devinit probe(struct pci_dev *pdev,
+			   const struct pci_device_id *id)
+{
+	struct uio_pci_generic_dev *gdev;
+	int err;
+
+	if (!pdev->irq) {
+		dev_warn(&pdev->dev, "No IRQ assigned to device: "
+			 "no support for interrupts?\n");
+		return -ENODEV;
+	}
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "%s: pci_enable_device failed: %d\n",
+			__func__, err);
+		return err;
+	}
+
+	err = verify_pci_2_3(pdev);
+	if (err)
+		goto err_verify;
+
+	gdev = kzalloc(sizeof(struct uio_pci_generic_dev), GFP_KERNEL);
+	if (!gdev) {
+		err = -ENOMEM;
+		goto err_alloc;
+	}
+
+	gdev->info.name = "uio_pci_generic";
+	gdev->info.version = DRIVER_VERSION;
+	gdev->info.irq = pdev->irq;
+	gdev->info.irq_flags = IRQF_SHARED;
+	gdev->info.handler = irqhandler;
+	gdev->pdev = pdev;
+	spin_lock_init(&gdev->lock);
+
+	if (uio_register_device(&pdev->dev, &gdev->info))
+		goto err_register;
+	pci_set_drvdata(pdev, gdev);
+
+	return 0;
+err_register:
+	kfree(gdev);
+err_alloc:
+err_verify:
+	pci_disable_device(pdev);
+	return err;
+}
+
+static void remove(struct pci_dev *pdev)
+{
+	struct uio_pci_generic_dev *gdev = pci_get_drvdata(pdev);
+
+	uio_unregister_device(&gdev->info);
+	pci_disable_device(pdev);
+	kfree(gdev);
+}
+
+static struct pci_driver driver = {
+	.name = "uio_pci_generic",
+	.id_table = NULL, /* only dynamic id's */
+	.probe = probe,
+	.remove = remove,
+};
+
+static int __init init(void)
+{
+	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
+	return pci_register_driver(&driver);
+}
+
+static void __exit cleanup(void)
+{
+	pci_unregister_driver(&driver);
+}
+
+module_init(init);
+module_exit(cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index fcaee42c7ac2..dd0bed4f1cf0 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -42,6 +42,7 @@
 #define  PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */
 
 #define PCI_STATUS		0x06	/* 16 bits */
+#define  PCI_STATUS_INTERRUPT	0x08	/* Interrupt status */
 #define  PCI_STATUS_CAP_LIST	0x10	/* Support Capability List */
 #define  PCI_STATUS_66MHZ	0x20	/* Support 66 Mhz PCI 2.1 bus */
 #define  PCI_STATUS_UDF		0x40	/* Support User Definable Features [obsolete] */
-- 
cgit v1.2.3-71-gd317


From 4622709445705c1e423d2addcfd8ccae052fe0ba Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Tue, 4 Aug 2009 12:55:34 +0200
Subject: Driver core: Add support for compatibility classes

When turning class devices into bus devices, we may need to
temporarily add links in sysfs so that user-space applications
are not confused. This is done by adding the following API:

* Functions to register and unregister compatibility classes.
  These appear in sysfs at the same location as regular classes, but
  instead of class devices, they contain links to bus devices.
* Functions to create and delete such links. Additionally, the caller
  can optionally pass a target device to which a "device" link should
  point (typically that would be the device's parent), to fully emulate
  the original class device.

The i2c subsystem will be the first user of this API, as i2c adapters
are being converted from class devices to bus devices.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
---
 drivers/base/class.c   | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/device.h |  8 +++++
 2 files changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index eb85e4312301..161746deab4b 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -488,6 +488,93 @@ void class_interface_unregister(struct class_interface *class_intf)
 	class_put(parent);
 }
 
+struct class_compat {
+	struct kobject *kobj;
+};
+
+/**
+ * class_compat_register - register a compatibility class
+ * @name: the name of the class
+ *
+ * Compatibility class are meant as a temporary user-space compatibility
+ * workaround when converting a family of class devices to a bus devices.
+ */
+struct class_compat *class_compat_register(const char *name)
+{
+	struct class_compat *cls;
+
+	cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL);
+	if (!cls)
+		return NULL;
+	cls->kobj = kobject_create_and_add(name, &class_kset->kobj);
+	if (!cls->kobj) {
+		kfree(cls);
+		return NULL;
+	}
+	return cls;
+}
+EXPORT_SYMBOL_GPL(class_compat_register);
+
+/**
+ * class_compat_unregister - unregister a compatibility class
+ * @cls: the class to unregister
+ */
+void class_compat_unregister(struct class_compat *cls)
+{
+	kobject_put(cls->kobj);
+	kfree(cls);
+}
+EXPORT_SYMBOL_GPL(class_compat_unregister);
+
+/**
+ * class_compat_create_link - create a compatibility class device link to
+ *			      a bus device
+ * @cls: the compatibility class
+ * @dev: the target bus device
+ * @device_link: an optional device to which a "device" link should be created
+ */
+int class_compat_create_link(struct class_compat *cls, struct device *dev,
+			     struct device *device_link)
+{
+	int error;
+
+	error = sysfs_create_link(cls->kobj, &dev->kobj, dev_name(dev));
+	if (error)
+		return error;
+
+	/*
+	 * Optionally add a "device" link (typically to the parent), as a
+	 * class device would have one and we want to provide as much
+	 * backwards compatibility as possible.
+	 */
+	if (device_link) {
+		error = sysfs_create_link(&dev->kobj, &device_link->kobj,
+					  "device");
+		if (error)
+			sysfs_remove_link(cls->kobj, dev_name(dev));
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(class_compat_create_link);
+
+/**
+ * class_compat_remove_link - remove a compatibility class device link to
+ *			      a bus device
+ * @cls: the compatibility class
+ * @dev: the target bus device
+ * @device_link: an optional device to which a "device" link was previously
+ * 		 created
+ */
+void class_compat_remove_link(struct class_compat *cls, struct device *dev,
+			      struct device *device_link)
+{
+	if (device_link)
+		sysfs_remove_link(&dev->kobj, "device");
+	sysfs_remove_link(cls->kobj, dev_name(dev));
+}
+EXPORT_SYMBOL_GPL(class_compat_remove_link);
+
 int __init classes_init(void)
 {
 	class_kset = kset_create_and_add("class", NULL, NULL);
diff --git a/include/linux/device.h b/include/linux/device.h
index e19e40a3dcbe..62ff53a67931 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -225,6 +225,14 @@ extern void class_unregister(struct class *class);
 	__class_register(class, &__key);	\
 })
 
+struct class_compat;
+struct class_compat *class_compat_register(const char *name);
+void class_compat_unregister(struct class_compat *cls);
+int class_compat_create_link(struct class_compat *cls, struct device *dev,
+			     struct device *device_link);
+void class_compat_remove_link(struct class_compat *cls, struct device *dev,
+			      struct device *device_link);
+
 extern void class_dev_iter_init(struct class_dev_iter *iter,
 				struct class *class,
 				struct device *start,
-- 
cgit v1.2.3-71-gd317


From 2b2af54a5bb6f7e80ccf78f20084b93c398c3a8b Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 30 Apr 2009 15:23:42 +0200
Subject: Driver Core: devtmpfs - kernel-maintained tmpfs-based /dev

Devtmpfs lets the kernel create a tmpfs instance called devtmpfs
very early at kernel initialization, before any driver-core device
is registered. Every device with a major/minor will provide a
device node in devtmpfs.

Devtmpfs can be changed and altered by userspace at any time,
and in any way needed - just like today's udev-mounted tmpfs.
Unmodified udev versions will run just fine on top of it, and will
recognize an already existing kernel-created device node and use it.
The default node permissions are root:root 0600. Proper permissions
and user/group ownership, meaningful symlinks, all other policy still
needs to be applied by userspace.

If a node is created by devtmps, devtmpfs will remove the device node
when the device goes away. If the device node was created by
userspace, or the devtmpfs created node was replaced by userspace, it
will no longer be removed by devtmpfs.

If it is requested to auto-mount it, it makes init=/bin/sh work
without any further userspace support. /dev will be fully populated
and dynamic, and always reflect the current device state of the kernel.
With the commonly used dynamic device numbers, it solves the problem
where static devices nodes may point to the wrong devices.

It is intended to make the initial bootup logic simpler and more robust,
by de-coupling the creation of the inital environment, to reliably run
userspace processes, from a complex userspace bootstrap logic to provide
a working /dev.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jan Blunck <jblunck@suse.de>
Tested-By: Harald Hoyer <harald@redhat.com>
Tested-By: Scott James Remnant <scott@ubuntu.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/Kconfig     |  25 ++++
 drivers/base/Makefile    |   1 +
 drivers/base/base.h      |   6 +
 drivers/base/core.c      |   3 +
 drivers/base/devtmpfs.c  | 367 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/base/init.c      |   1 +
 include/linux/device.h   |  10 ++
 include/linux/shmem_fs.h |   3 +
 init/do_mounts.c         |   2 +-
 init/main.c              |   2 +
 mm/shmem.c               |   9 +-
 11 files changed, 422 insertions(+), 7 deletions(-)
 create mode 100644 drivers/base/devtmpfs.c

(limited to 'include/linux')

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 8f006f96ff53..ee377270beb9 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -8,6 +8,31 @@ config UEVENT_HELPER_PATH
 	  Path to uevent helper program forked by the kernel for
 	  every uevent.
 
+config DEVTMPFS
+	bool "Create a kernel maintained /dev tmpfs (EXPERIMENTAL)"
+	depends on HOTPLUG && SHMEM && TMPFS
+	help
+	  This creates a tmpfs filesystem, and mounts it at bootup
+	  and mounts it at /dev. The kernel driver core creates device
+	  nodes for all registered devices in that filesystem. All device
+	  nodes are owned by root and have the default mode of 0600.
+	  Userspace can add and delete the nodes as needed. This is
+	  intended to simplify bootup, and make it possible to delay
+	  the initial coldplug at bootup done by udev in userspace.
+	  It should also provide a simpler way for rescue systems
+	  to bring up a kernel with dynamic major/minor numbers.
+	  Meaningful symlinks, permissions and device ownership must
+	  still be handled by userspace.
+	  If unsure, say N here.
+
+config DEVTMPFS_MOUNT
+	bool "Automount devtmpfs at /dev"
+	depends on DEVTMPFS
+	help
+	  This will mount devtmpfs at /dev if the kernel mounts the root
+	  filesystem. It will not affect initramfs based mounting.
+	  If unsure, say N here.
+
 config STANDALONE
 	bool "Select only drivers that don't need compile-time external firmware" if EXPERIMENTAL
 	default y
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 1b2640ce74f0..c12c7f2f2a6f 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -4,6 +4,7 @@ obj-y			:= core.o sys.o bus.o dd.o \
 			   driver.o class.o platform.o \
 			   cpu.o firmware.o init.o map.o devres.o \
 			   attribute_container.o transport_class.o
+obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
 obj-y			+= power/
 obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 503d59c57501..2ca7f5b7b824 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -139,3 +139,9 @@ static inline void module_add_driver(struct module *mod,
 				     struct device_driver *drv) { }
 static inline void module_remove_driver(struct device_driver *drv) { }
 #endif
+
+#ifdef CONFIG_DEVTMPFS
+extern int devtmpfs_init(void);
+#else
+static inline int devtmpfs_init(void) { return 0; }
+#endif
diff --git a/drivers/base/core.c b/drivers/base/core.c
index a992985d1fab..390e664ec1c7 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -929,6 +929,8 @@ int device_add(struct device *dev)
 		error = device_create_sys_dev_entry(dev);
 		if (error)
 			goto devtattrError;
+
+		devtmpfs_create_node(dev);
 	}
 
 	error = device_add_class_symlinks(dev);
@@ -1075,6 +1077,7 @@ void device_del(struct device *dev)
 	if (parent)
 		klist_del(&dev->p->knode_parent);
 	if (MAJOR(dev->devt)) {
+		devtmpfs_delete_node(dev);
 		device_remove_sys_dev_entry(dev);
 		device_remove_file(dev, &devt_attr);
 	}
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
new file mode 100644
index 000000000000..fd488ad4263a
--- /dev/null
+++ b/drivers/base/devtmpfs.c
@@ -0,0 +1,367 @@
+/*
+ * devtmpfs - kernel-maintained tmpfs-based /dev
+ *
+ * Copyright (C) 2009, Kay Sievers <kay.sievers@vrfy.org>
+ *
+ * During bootup, before any driver core device is registered,
+ * devtmpfs, a tmpfs-based filesystem is created. Every driver-core
+ * device which requests a device node, will add a node in this
+ * filesystem. The node is named after the the name of the device,
+ * or the susbsytem can provide a custom name. All devices are
+ * owned by root and have a mode of 0600.
+ */
+
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/cred.h>
+#include <linux/init_task.h>
+
+static struct vfsmount *dev_mnt;
+
+#if defined CONFIG_DEVTMPFS_MOUNT
+static int dev_mount = 1;
+#else
+static int dev_mount;
+#endif
+
+static int __init mount_param(char *str)
+{
+	dev_mount = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("devtmpfs.mount=", mount_param);
+
+static int dev_get_sb(struct file_system_type *fs_type, int flags,
+		      const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_single(fs_type, flags, data, shmem_fill_super, mnt);
+}
+
+static struct file_system_type dev_fs_type = {
+	.name = "devtmpfs",
+	.get_sb = dev_get_sb,
+	.kill_sb = kill_litter_super,
+};
+
+#ifdef CONFIG_BLOCK
+static inline int is_blockdev(struct device *dev)
+{
+	return dev->class == &block_class;
+}
+#else
+static inline int is_blockdev(struct device *dev) { return 0; }
+#endif
+
+static int dev_mkdir(const char *name, mode_t mode)
+{
+	struct nameidata nd;
+	struct dentry *dentry;
+	int err;
+
+	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
+			      name, LOOKUP_PARENT, &nd);
+	if (err)
+		return err;
+
+	dentry = lookup_create(&nd, 1);
+	if (!IS_ERR(dentry)) {
+		err = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+		dput(dentry);
+	} else {
+		err = PTR_ERR(dentry);
+	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+
+	path_put(&nd.path);
+	return err;
+}
+
+static int create_path(const char *nodepath)
+{
+	char *path;
+	struct nameidata nd;
+	int err = 0;
+
+	path = kstrdup(nodepath, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
+			      path, LOOKUP_PARENT, &nd);
+	if (err == 0) {
+		struct dentry *dentry;
+
+		/* create directory right away */
+		dentry = lookup_create(&nd, 1);
+		if (!IS_ERR(dentry)) {
+			err = vfs_mkdir(nd.path.dentry->d_inode,
+					dentry, 0755);
+			dput(dentry);
+		}
+		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+
+		path_put(&nd.path);
+	} else if (err == -ENOENT) {
+		char *s;
+
+		/* parent directories do not exist, create them */
+		s = path;
+		while (1) {
+			s = strchr(s, '/');
+			if (!s)
+				break;
+			s[0] = '\0';
+			err = dev_mkdir(path, 0755);
+			if (err && err != -EEXIST)
+				break;
+			s[0] = '/';
+			s++;
+		}
+	}
+
+	kfree(path);
+	return err;
+}
+
+int devtmpfs_create_node(struct device *dev)
+{
+	const char *tmp = NULL;
+	const char *nodename;
+	const struct cred *curr_cred;
+	mode_t mode;
+	struct nameidata nd;
+	struct dentry *dentry;
+	int err;
+
+	if (!dev_mnt)
+		return 0;
+
+	nodename = device_get_nodename(dev, &tmp);
+	if (!nodename)
+		return -ENOMEM;
+
+	if (is_blockdev(dev))
+		mode = S_IFBLK|0600;
+	else
+		mode = S_IFCHR|0600;
+
+	curr_cred = override_creds(&init_cred);
+	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
+			      nodename, LOOKUP_PARENT, &nd);
+	if (err == -ENOENT) {
+		/* create missing parent directories */
+		create_path(nodename);
+		err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
+				      nodename, LOOKUP_PARENT, &nd);
+		if (err)
+			goto out;
+	}
+
+	dentry = lookup_create(&nd, 0);
+	if (!IS_ERR(dentry)) {
+		err = vfs_mknod(nd.path.dentry->d_inode,
+				dentry, mode, dev->devt);
+		/* mark as kernel created inode */
+		if (!err)
+			dentry->d_inode->i_private = &dev_mnt;
+		dput(dentry);
+	} else {
+		err = PTR_ERR(dentry);
+	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+
+	path_put(&nd.path);
+out:
+	kfree(tmp);
+	revert_creds(curr_cred);
+	return err;
+}
+
+static int dev_rmdir(const char *name)
+{
+	struct nameidata nd;
+	struct dentry *dentry;
+	int err;
+
+	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
+			      name, LOOKUP_PARENT, &nd);
+	if (err)
+		return err;
+
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
+	if (!IS_ERR(dentry)) {
+		if (dentry->d_inode)
+			err = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+		else
+			err = -ENOENT;
+		dput(dentry);
+	} else {
+		err = PTR_ERR(dentry);
+	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+
+	path_put(&nd.path);
+	return err;
+}
+
+static int delete_path(const char *nodepath)
+{
+	const char *path;
+	int err = 0;
+
+	path = kstrdup(nodepath, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		char *base;
+
+		base = strrchr(path, '/');
+		if (!base)
+			break;
+		base[0] = '\0';
+		err = dev_rmdir(path);
+		if (err)
+			break;
+	}
+
+	kfree(path);
+	return err;
+}
+
+static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat)
+{
+	/* did we create it */
+	if (inode->i_private != &dev_mnt)
+		return 0;
+
+	/* does the dev_t match */
+	if (is_blockdev(dev)) {
+		if (!S_ISBLK(stat->mode))
+			return 0;
+	} else {
+		if (!S_ISCHR(stat->mode))
+			return 0;
+	}
+	if (stat->rdev != dev->devt)
+		return 0;
+
+	/* ours */
+	return 1;
+}
+
+int devtmpfs_delete_node(struct device *dev)
+{
+	const char *tmp = NULL;
+	const char *nodename;
+	const struct cred *curr_cred;
+	struct nameidata nd;
+	struct dentry *dentry;
+	struct kstat stat;
+	int deleted = 1;
+	int err;
+
+	if (!dev_mnt)
+		return 0;
+
+	nodename = device_get_nodename(dev, &tmp);
+	if (!nodename)
+		return -ENOMEM;
+
+	curr_cred = override_creds(&init_cred);
+	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
+			      nodename, LOOKUP_PARENT, &nd);
+	if (err)
+		goto out;
+
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
+	if (!IS_ERR(dentry)) {
+		if (dentry->d_inode) {
+			err = vfs_getattr(nd.path.mnt, dentry, &stat);
+			if (!err && dev_mynode(dev, dentry->d_inode, &stat)) {
+				err = vfs_unlink(nd.path.dentry->d_inode,
+						 dentry);
+				if (!err || err == -ENOENT)
+					deleted = 1;
+			}
+		} else {
+			err = -ENOENT;
+		}
+		dput(dentry);
+	} else {
+		err = PTR_ERR(dentry);
+	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+
+	path_put(&nd.path);
+	if (deleted && strchr(nodename, '/'))
+		delete_path(nodename);
+out:
+	kfree(tmp);
+	revert_creds(curr_cred);
+	return err;
+}
+
+/*
+ * If configured, or requested by the commandline, devtmpfs will be
+ * auto-mounted after the kernel mounted the root filesystem.
+ */
+int devtmpfs_mount(const char *mountpoint)
+{
+	struct path path;
+	int err;
+
+	if (!dev_mount)
+		return 0;
+
+	if (!dev_mnt)
+		return 0;
+
+	err = kern_path(mountpoint, LOOKUP_FOLLOW, &path);
+	if (err)
+		return err;
+	err = do_add_mount(dev_mnt, &path, 0, NULL);
+	if (err)
+		printk(KERN_INFO "devtmpfs: error mounting %i\n", err);
+	else
+		printk(KERN_INFO "devtmpfs: mounted\n");
+	path_put(&path);
+	return err;
+}
+
+/*
+ * Create devtmpfs instance, driver-core devices will add their device
+ * nodes here.
+ */
+int __init devtmpfs_init(void)
+{
+	int err;
+	struct vfsmount *mnt;
+
+	err = register_filesystem(&dev_fs_type);
+	if (err) {
+		printk(KERN_ERR "devtmpfs: unable to register devtmpfs "
+		       "type %i\n", err);
+		return err;
+	}
+
+	mnt = kern_mount(&dev_fs_type);
+	if (IS_ERR(mnt)) {
+		err = PTR_ERR(mnt);
+		printk(KERN_ERR "devtmpfs: unable to create devtmpfs %i\n", err);
+		unregister_filesystem(&dev_fs_type);
+		return err;
+	}
+	dev_mnt = mnt;
+
+	printk(KERN_INFO "devtmpfs: initialized\n");
+	return 0;
+}
diff --git a/drivers/base/init.c b/drivers/base/init.c
index 7bd9b6a5b01f..c8a934e79421 100644
--- a/drivers/base/init.c
+++ b/drivers/base/init.c
@@ -20,6 +20,7 @@
 void __init driver_init(void)
 {
 	/* These are the core pieces */
+	devtmpfs_init();
 	devices_init();
 	buses_init();
 	classes_init();
diff --git a/include/linux/device.h b/include/linux/device.h
index 62ff53a67931..847b763e40e9 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -552,6 +552,16 @@ extern void put_device(struct device *dev);
 
 extern void wait_for_device_probe(void);
 
+#ifdef CONFIG_DEVTMPFS
+extern int devtmpfs_create_node(struct device *dev);
+extern int devtmpfs_delete_node(struct device *dev);
+extern int devtmpfs_mount(const char *mountpoint);
+#else
+static inline int devtmpfs_create_node(struct device *dev) { return 0; }
+static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
+static inline int devtmpfs_mount(const char *mountpoint) { return 0; }
+#endif
+
 /* drivers/base/power/shutdown.c */
 extern void device_shutdown(void);
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6d3f2f449ead..deee7afd8d66 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -38,6 +38,9 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
 	return container_of(inode, struct shmem_inode_info, vfs_inode);
 }
 
+extern int init_tmpfs(void);
+extern int shmem_fill_super(struct super_block *sb, void *data, int silent);
+
 #ifdef CONFIG_TMPFS_POSIX_ACL
 int shmem_check_acl(struct inode *, int);
 int shmem_acl_init(struct inode *, struct inode *);
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 093f65915501..bb008d064c1a 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -415,7 +415,7 @@ void __init prepare_namespace(void)
 
 	mount_root();
 out:
+	devtmpfs_mount("dev");
 	sys_mount(".", "/", NULL, MS_MOVE, NULL);
 	sys_chroot(".");
 }
-
diff --git a/init/main.c b/init/main.c
index b34fd8e5edef..8e6a7846bd07 100644
--- a/init/main.c
+++ b/init/main.c
@@ -68,6 +68,7 @@
 #include <linux/async.h>
 #include <linux/kmemcheck.h>
 #include <linux/kmemtrace.h>
+#include <linux/shmem_fs.h>
 #include <trace/boot.h>
 
 #include <asm/io.h>
@@ -809,6 +810,7 @@ static void __init do_basic_setup(void)
 	init_workqueues();
 	cpuset_init_smp();
 	usermodehelper_init();
+	init_tmpfs();
 	driver_init();
 	init_irq_proc();
 	do_ctors();
diff --git a/mm/shmem.c b/mm/shmem.c
index 5a0b3d4055f3..bd20f8bb02aa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2298,8 +2298,7 @@ static void shmem_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-static int shmem_fill_super(struct super_block *sb,
-			    void *data, int silent)
+int shmem_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *inode;
 	struct dentry *root;
@@ -2519,7 +2518,7 @@ static struct file_system_type tmpfs_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
-static int __init init_tmpfs(void)
+int __init init_tmpfs(void)
 {
 	int error;
 
@@ -2576,7 +2575,7 @@ static struct file_system_type tmpfs_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
-static int __init init_tmpfs(void)
+int __init init_tmpfs(void)
 {
 	BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
 
@@ -2687,5 +2686,3 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	vma->vm_ops = &shmem_vm_ops;
 	return 0;
 }
-
-module_init(init_tmpfs)
-- 
cgit v1.2.3-71-gd317


From 5d351754fcf58d1a604aa7cf95c2805e8a098ad9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Sep 2009 13:32:13 -0400
Subject: SUNRPC: Defer the auth_gss upcall when the RPC call is asynchronous

Otherwise, the upcall is going to be synchronous, which may not be what the
caller wants...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/sunrpc/auth.h |  4 ++--
 net/sunrpc/auth.c           | 20 ++++++++++++--------
 net/sunrpc/auth_generic.c   |  4 ++--
 3 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 3f632182d8eb..996df4dac7d4 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -111,7 +111,7 @@ struct rpc_credops {
 	void			(*crdestroy)(struct rpc_cred *);
 
 	int			(*crmatch)(struct auth_cred *, struct rpc_cred *, int);
-	void			(*crbind)(struct rpc_task *, struct rpc_cred *);
+	void			(*crbind)(struct rpc_task *, struct rpc_cred *, int);
 	__be32 *		(*crmarshal)(struct rpc_task *, __be32 *);
 	int			(*crrefresh)(struct rpc_task *);
 	__be32 *		(*crvalidate)(struct rpc_task *, __be32 *);
@@ -140,7 +140,7 @@ struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
 void			rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int);
-void			rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *);
+void			rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
 void			put_rpccred(struct rpc_cred *);
 void			rpcauth_unbindcred(struct rpc_task *);
 __be32 *		rpcauth_marshcred(struct rpc_task *, __be32 *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 0c431c277af5..54a4e042f104 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -385,7 +385,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
 
 void
-rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred)
+rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
 {
 	task->tk_msg.rpc_cred = get_rpccred(cred);
 	dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
@@ -394,7 +394,7 @@ rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred)
 EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred);
 
 static void
-rpcauth_bind_root_cred(struct rpc_task *task)
+rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct auth_cred acred = {
@@ -405,7 +405,7 @@ rpcauth_bind_root_cred(struct rpc_task *task)
 
 	dprintk("RPC: %5u looking up %s cred\n",
 		task->tk_pid, task->tk_client->cl_auth->au_ops->au_name);
-	ret = auth->au_ops->lookup_cred(auth, &acred, 0);
+	ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
 	if (!IS_ERR(ret))
 		task->tk_msg.rpc_cred = ret;
 	else
@@ -413,14 +413,14 @@ rpcauth_bind_root_cred(struct rpc_task *task)
 }
 
 static void
-rpcauth_bind_new_cred(struct rpc_task *task)
+rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct rpc_cred *ret;
 
 	dprintk("RPC: %5u looking up %s cred\n",
 		task->tk_pid, auth->au_ops->au_name);
-	ret = rpcauth_lookupcred(auth, 0);
+	ret = rpcauth_lookupcred(auth, lookupflags);
 	if (!IS_ERR(ret))
 		task->tk_msg.rpc_cred = ret;
 	else
@@ -430,12 +430,16 @@ rpcauth_bind_new_cred(struct rpc_task *task)
 void
 rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 {
+	int lookupflags = 0;
+
+	if (flags & RPC_TASK_ASYNC)
+		lookupflags |= RPCAUTH_LOOKUP_NEW;
 	if (cred != NULL)
-		cred->cr_ops->crbind(task, cred);
+		cred->cr_ops->crbind(task, cred, lookupflags);
 	else if (flags & RPC_TASK_ROOTCREDS)
-		rpcauth_bind_root_cred(task);
+		rpcauth_bind_root_cred(task, lookupflags);
 	else
-		rpcauth_bind_new_cred(task);
+		rpcauth_bind_new_cred(task, lookupflags);
 }
 
 void
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 4028502f0528..bf88bf8e9365 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -55,13 +55,13 @@ struct rpc_cred *rpc_lookup_machine_cred(void)
 EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
 
 static void
-generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred)
+generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred;
 	struct rpc_cred *ret;
 
-	ret = auth->au_ops->lookup_cred(auth, acred, 0);
+	ret = auth->au_ops->lookup_cred(auth, acred, lookupflags);
 	if (!IS_ERR(ret))
 		task->tk_msg.rpc_cred = ret;
 	else
-- 
cgit v1.2.3-71-gd317


From 29ab23cc5d351658d01a4327d55e9106a73fd04f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 15 Sep 2009 15:56:50 -0400
Subject: nfsd4: allow nfs4 state startup to fail

The failure here is pretty unlikely, but we should handle it anyway.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c       | 17 ++++++++++++-----
 fs/nfsd/nfssvc.c          |  4 +++-
 include/linux/nfsd/nfsd.h |  4 ++--
 3 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 46e9ac526872..11db40cb2f2b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4004,7 +4004,7 @@ set_max_delegations(void)
 
 /* initialization to perform when the nfsd service is started: */
 
-static void
+static int
 __nfs4_state_start(void)
 {
 	unsigned long grace_time;
@@ -4016,19 +4016,26 @@ __nfs4_state_start(void)
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
 	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
+	if (laundry_wq == NULL)
+		return -ENOMEM;
 	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
 	set_max_delegations();
+	return 0;
 }
 
-void
+int
 nfs4_state_start(void)
 {
+	int ret;
+
 	if (nfs4_init)
-		return;
+		return 0;
 	nfsd4_load_reboot_recovery_data();
-	__nfs4_state_start();
+	ret = __nfs4_state_start();
+	if (ret)
+		return ret;
 	nfs4_init = 1;
-	return;
+	return 0;
 }
 
 time_t
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 4472449c0937..fcc001088261 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -411,7 +411,9 @@ nfsd_svc(unsigned short port, int nrservs)
 	error =	nfsd_racache_init(2*nrservs);
 	if (error<0)
 		goto out;
-	nfs4_state_start();
+	error = nfs4_state_start();
+	if (error)
+		goto out;
 
 	nfsd_reset_versions();
 
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 2812ed52669d..24fdf89cea83 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -166,7 +166,7 @@ extern int nfsd_max_blksize;
 extern unsigned int max_delegations;
 int nfs4_state_init(void);
 void nfsd4_free_slabs(void);
-void nfs4_state_start(void);
+int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
 time_t nfs4_lease_time(void);
 void nfs4_reset_lease(time_t leasetime);
@@ -174,7 +174,7 @@ int nfs4_reset_recoverydir(char *recdir);
 #else
 static inline int nfs4_state_init(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
-static inline void nfs4_state_start(void) { }
+static inline int nfs4_state_start(void) { }
 static inline void nfs4_state_shutdown(void) { }
 static inline time_t nfs4_lease_time(void) { return 0; }
 static inline void nfs4_reset_lease(time_t leasetime) { }
-- 
cgit v1.2.3-71-gd317


From 80fc015bdfe1f5b870c1e1ee02d78e709523fee7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 15 Sep 2009 18:07:35 -0400
Subject: nfsd4: use common rpc_cred for all callbacks

Callbacks are always made using the machine's identity, so we can use a
single auth_generic credential shared among callbacks to all clients and
let the rpc code take care of the rest.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 33 ++++++++++-----------------------
 fs/nfsd/nfs4state.c        |  6 +-----
 include/linux/nfsd/state.h |  2 +-
 3 files changed, 12 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4abb88264c72..128519769ea8 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,42 +432,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
 	.rpc_call_done = nfsd4_cb_probe_done,
 };
 
-static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb)
+static struct rpc_cred *callback_cred;
+
+int set_callback_cred(void)
 {
-	struct auth_cred acred = {
-		.machine_cred = 1
-	};
-	struct rpc_auth *auth = cb->cb_client->cl_auth;
-
-	/*
-	 * Note in the gss case this doesn't actually have to wait for a
-	 * gss upcall (or any calls to the client); this just creates a
-	 * non-uptodate cred which the rpc state machine will fill in with
-	 * a refresh_upcall later.
-	 */
-	return auth->au_ops->lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
+	callback_cred = rpc_lookup_machine_cred();
+	if (!callback_cred)
+		return -ENOMEM;
+	return 0;
 }
 
+
 void do_probe_callback(struct nfs4_client *clp)
 {
 	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
 		.rpc_argp       = clp,
+		.rpc_cred	= callback_cred
 	};
-	struct rpc_cred *cred;
 	int status;
 
-	cred = lookup_cb_cred(cb);
-	if (IS_ERR(cred)) {
-		status = PTR_ERR(cred);
-		goto out;
-	}
-	cb->cb_cred = cred;
-	msg.rpc_cred = cb->cb_cred;
 	status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
 				&nfsd4_cb_probe_ops, (void *)clp);
-out:
 	if (status) {
 		warn_no_callback_path(clp, status);
 		put_nfs4_client(clp);
@@ -550,7 +537,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
 		.rpc_argp = dp,
-		.rpc_cred = clp->cl_cb_conn.cb_cred
+		.rpc_cred = callback_cred
 	};
 	int status;
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 11db40cb2f2b..0445192d660d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -696,10 +696,6 @@ shutdown_callback_client(struct nfs4_client *clp)
 		clp->cl_cb_conn.cb_client = NULL;
 		rpc_shutdown_client(clnt);
 	}
-	if (clp->cl_cb_conn.cb_cred) {
-		put_rpccred(clp->cl_cb_conn.cb_cred);
-		clp->cl_cb_conn.cb_cred = NULL;
-	}
 }
 
 static inline void
@@ -4020,7 +4016,7 @@ __nfs4_state_start(void)
 		return -ENOMEM;
 	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
 	set_max_delegations();
-	return 0;
+	return set_callback_cred();
 }
 
 int
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 70ef5f4abbbc..9bf3aa8c5aea 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -89,7 +89,6 @@ struct nfs4_cb_conn {
 	/* RPC client info */
 	atomic_t		cb_set;     /* successful CB_NULL call */
 	struct rpc_clnt *       cb_client;
-	struct rpc_cred	*	cb_cred;
 };
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -362,6 +361,7 @@ extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void put_nfs4_client(struct nfs4_client *clp);
 extern void nfs4_free_stateowner(struct kref *kref);
+extern int set_callback_cred(void);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
-- 
cgit v1.2.3-71-gd317


From 38524ab38f2752beee262a97403d871665838172 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 10 Sep 2009 12:25:59 +0300
Subject: nfsd41: Backchannel: callback infrastructure

Keep the xprt used for create_session in cl_cb_xprt.
Mark cl_callback.cb_minorversion = 1 and remember
the client provided cl_callback.cb_prog rpc program number.
Use it to probe the callback path.

Use the client's network address to initialize as the
callback's address as expected by the xprt creation
routines.

Define xdr sizes and code nfs4_cb_compound header to be able
to send a null callback rpc.

Signed-off-by: Andy Adamson<andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
[get callback minorversion from fore channel's]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: change bc_sock to bc_xprt]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[pulled definition for cl_cb_xprt]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: set up backchannel's cb_addr]
[moved rpc_create_args init to "nfsd: modify nfsd4.1 backchannel to use new xprt class"]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 17 +++++++++++++++--
 fs/nfsd/nfs4state.c        | 14 ++++++++++++++
 include/linux/nfsd/state.h |  3 +++
 3 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 128519769ea8..db4188ce9b00 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -43,6 +43,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svcsock.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/state.h>
 #include <linux/sunrpc/sched.h>
@@ -52,16 +53,19 @@
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
+#define NFS4_STATEID_SIZE 16
 
 /* Index of predefined Linux callback client operations */
 
 enum {
 	NFSPROC4_CLNT_CB_NULL = 0,
 	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_SEQUENCE,
 };
 
 enum nfs_cb_opnum4 {
 	OP_CB_RECALL            = 4,
+	OP_CB_SEQUENCE          = 11,
 };
 
 #define NFS4_MAXTAGLEN		20
@@ -70,15 +74,22 @@ enum nfs_cb_opnum4 {
 #define NFS4_dec_cb_null_sz		0
 #define cb_compound_enc_hdr_sz		4
 #define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
+					1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
+
 #define op_enc_sz			1
 #define op_dec_sz			2
 #define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
 #define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
 #define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
 					1 + enc_stateid_sz +            \
 					enc_nfs4_fh_sz)
 
 #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
 					op_dec_sz)
 
 /*
@@ -137,11 +148,13 @@ xdr_error:                                      \
 } while (0)
 
 struct nfs4_cb_compound_hdr {
-	int		status;
-	u32		ident;
+	/* args */
+	u32		ident;	/* minorversion 0 only */
 	u32		nops;
 	__be32		*nops_p;
 	u32		minorversion;
+	/* res */
+	int		status;
 	u32		taglen;
 	char		*tag;
 };
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0445192d660d..d8196b453f61 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -702,6 +702,8 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
+	if (clp->cl_cb_xprt)
+		svc_xprt_put(clp->cl_cb_xprt);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -1317,6 +1319,18 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cr_ses->flags &= ~SESSION4_PERSIST;
 		cr_ses->flags &= ~SESSION4_RDMA;
 
+		if (cr_ses->flags & SESSION4_BACK_CHAN) {
+			unconf->cl_cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(unconf->cl_cb_xprt);
+			rpc_copy_addr(
+				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
+				sa);
+			unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+			unconf->cl_cb_conn.cb_minorversion =
+				cstate->minorversion;
+			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
+			nfsd4_probe_callback(unconf);
+		}
 		conf = unconf;
 	} else {
 		status = nfserr_stale_clientid;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 9bf3aa8c5aea..c916032570c4 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -211,6 +211,9 @@ struct nfs4_client {
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
 	struct nfs4_sessionid	cl_sessionid;
+
+	/* for nfs41 callbacks */
+	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
 };
 
 /* struct nfs4_client_reset
-- 
cgit v1.2.3-71-gd317


From 132f97715c098393fb8de3c26b07b9fdbd2334f1 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Thu, 10 Sep 2009 12:26:12 +0300
Subject: nfsd41: Backchannel: Add sequence arguments to callback RPC arguments

Follow the model we use in the client. Make the sequence arguments
part of the regular RPC arguments.  None of the callbacks that are
soon to be implemented expect results that need to be passed back
to the caller, so we don't define a separate RPC results structure.
For session validation, the cb_sequence decoding will use a pointer
to the sequence arguments that are part of the RPC argument.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
[define struct nfsd4_cb_sequence here]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 5 +++++
 include/linux/nfsd/state.h | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index db4188ce9b00..f31175717c1a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -92,6 +92,11 @@ enum nfs_cb_opnum4 {
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
 
+struct nfs4_rpc_args {
+	void				*args_op;
+	struct nfsd4_cb_sequence	args_seq;
+};
+
 /*
 * Generic encode routines from fs/nfs/nfs4xdr.c
 */
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index c916032570c4..0e5b5aecde03 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -60,6 +60,12 @@ typedef struct {
 #define si_stateownerid   si_opaque.so_stateownerid
 #define si_fileid         si_opaque.so_fileid
 
+struct nfsd4_cb_sequence {
+	/* args/res */
+	u32			cbs_minorversion;
+	struct nfs4_client	*cbs_clp;
+};
+
 struct nfs4_delegation {
 	struct list_head	dl_perfile;
 	struct list_head	dl_perclnt;
-- 
cgit v1.2.3-71-gd317


From 199ff35e1c8724871e157c2e48556c2794946e82 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Thu, 10 Sep 2009 12:26:25 +0300
Subject: nfsd41: Backchannel: Server backchannel RPC wait queue

RPC callback requests will wait on this wait queue if the backchannel
is out of slots.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c        | 2 ++
 include/linux/nfsd/state.h | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d8196b453f61..f4cebd9016bc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -775,6 +775,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	clear_bit(0, &clp->cl_cb_slot_busy);
+	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
 }
 
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 0e5b5aecde03..9cc40a137c34 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -219,7 +219,11 @@ struct nfs4_client {
 	struct nfs4_sessionid	cl_sessionid;
 
 	/* for nfs41 callbacks */
+	/* We currently support a single back channel with a single slot */
+	unsigned long		cl_cb_slot_busy;
 	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
+	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
+						/* wait here for slots */
 };
 
 /* struct nfs4_client_reset
-- 
cgit v1.2.3-71-gd317


From 2a1d1b593803d7c18a369bf148f3b48c5a3260fc Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Thu, 10 Sep 2009 12:26:38 +0300
Subject: nfsd41: Backchannel: Setup sequence information

Follows the model used by the NFS client.  Setup the RPC prepare and done
function pointers so that we can populate the sequence information if
minorversion == 1.  rpc_run_task() is then invoked directly just like
existing NFS client operations do.

nfsd4_cb_prepare() determines if the sequence information needs to be setup.
If the slot is in use, it adds itself to the wait queue.

nfsd4_cb_done() wakes anyone sleeping on the callback channel wait queue
after our RPC reply has been received.  It also sets the task message
result pointer to NULL to clearly indicate we're done using it.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
[define and initialize cl_cb_seq_nr here]
[pulled out unused defintion of nfsd4_cb_done]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4state.c        |  1 +
 include/linux/nfsd/state.h |  1 +
 3 files changed, 64 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index f31175717c1a..25a09069e458 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -501,6 +501,67 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 	do_probe_callback(clp);
 }
 
+/*
+ * There's currently a single callback channel slot.
+ * If the slot is available, then mark it busy.  Otherwise, set the
+ * thread for sleeping on the callback RPC wait queue.
+ */
+static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
+		struct rpc_task *task)
+{
+	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+	u32 *ptr = (u32 *)clp->cl_sessionid.data;
+	int status = 0;
+
+	dprintk("%s: %u:%u:%u:%u\n", __func__,
+		ptr[0], ptr[1], ptr[2], ptr[3]);
+
+	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
+		dprintk("%s slot is busy\n", __func__);
+		status = -EAGAIN;
+		goto out;
+	}
+
+	/*
+	 * We'll need the clp during XDR encoding and decoding,
+	 * and the sequence during decoding to verify the reply
+	 */
+	args->args_seq.cbs_clp = clp;
+	task->tk_msg.rpc_resp = &args->args_seq;
+
+out:
+	dprintk("%s status=%d\n", __func__, status);
+	return status;
+}
+
+/*
+ * TODO: cb_sequence should support referring call lists, cachethis, multiple
+ * slots, and mark callback channel down on communication errors.
+ */
+static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_delegation *dp = calldata;
+	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
+	int status = 0;
+
+	args->args_seq.cbs_minorversion = minorversion;
+	if (minorversion) {
+		status = nfsd41_cb_setup_sequence(clp, task);
+		if (status) {
+			if (status != -EAGAIN) {
+				/* terminate rpc task */
+				task->tk_status = status;
+				task->tk_action = NULL;
+			}
+			return;
+		}
+	}
+	rpc_call_start(task);
+}
+
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
@@ -540,6 +601,7 @@ static void nfsd4_cb_recall_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+	.rpc_call_prepare = nfsd4_cb_prepare,
 	.rpc_call_done = nfsd4_cb_recall_done,
 	.rpc_release = nfsd4_cb_recall_release,
 };
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f4cebd9016bc..76b7bcbb3f20 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1331,6 +1331,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			unconf->cl_cb_conn.cb_minorversion =
 				cstate->minorversion;
 			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
+			unconf->cl_cb_seq_nr = 1;
 			nfsd4_probe_callback(unconf);
 		}
 		conf = unconf;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 9cc40a137c34..b38d11324189 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -221,6 +221,7 @@ struct nfs4_client {
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
+	u32			cl_cb_seq_nr;
 	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
-- 
cgit v1.2.3-71-gd317


From fe832a3a48737b24f95fab202b1c67fb588b071d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 15 Sep 2009 23:51:31 -0400
Subject: tracing: remove notrace from __kprobes annotation

When ftrace had issues with NMIs, it was needed to annotate all
the areas that kprobes had issues with notrace. Now that ftrace is
NMI safe, the functions that limit ftrace from tracing are just a
small few.

Kprobes is too big of a set for ftrace not to trace. Remove the
coupling.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bcd9c07848be..3a46b7b7abb2 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -48,13 +48,13 @@
 #define KPROBE_HIT_SSDONE	0x00000008
 
 /* Attach to insert probes on any functions which should be ignored*/
-#define __kprobes	__attribute__((__section__(".kprobes.text"))) notrace
+#define __kprobes	__attribute__((__section__(".kprobes.text")))
 #else /* CONFIG_KPROBES */
 typedef int kprobe_opcode_t;
 struct arch_specific_insn {
 	int dummy;
 };
-#define __kprobes	notrace
+#define __kprobes
 #endif /* CONFIG_KPROBES */
 
 struct kprobe;
-- 
cgit v1.2.3-71-gd317


From 59abf02644c45f1591e1374ee7bb45dc757fcb88 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Sep 2009 08:28:30 +0200
Subject: sched: Add SD_PREFER_LOCAL

And turn it on for NUMA and MC domains. This improves
locality in balancing decisions by keeping up to
capacity amount of tasks local before looking for idle
CPUs. (and twice the capacity if SD_POWERSAVINGS_BALANCE
is set.)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    | 2 +-
 include/linux/topology.h | 2 ++
 kernel/sched_fair.c      | 7 +++++--
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee1f88993097..b4a39bb2b4a4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -805,7 +805,7 @@ enum cpu_idle_type {
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-
+#define SD_PREFER_LOCAL		0x0040  /* Prefer to keep tasks local to this domain */
 #define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 936ab2b37683..a6614b0242a9 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -129,6 +129,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_FORK			\
 				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
+				| 1*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
@@ -161,6 +162,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_FORK			\
 				| 1*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
+				| 1*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 280892e9d85e..a37f311f436e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1360,7 +1360,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 		 * If power savings logic is enabled for a domain, see if we
 		 * are not overloaded, if so, don't balance wider.
 		 */
-		if (tmp->flags & SD_POWERSAVINGS_BALANCE) {
+		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
 			unsigned long power = 0;
 			unsigned long nr_running = 0;
 			unsigned long capacity;
@@ -1373,7 +1373,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 
 			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
 
-			if (nr_running/2 < capacity)
+			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+				nr_running /= 2;
+
+			if (nr_running < capacity)
 				break;
 		}
 
-- 
cgit v1.2.3-71-gd317


From d466f2fcb32cd97fd586bfa33f5dba3ac78aadb0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:03 +0200
Subject: HWPOISON: Add page flag for poisoned pages

Hardware poisoned pages need special handling in the VM and shouldn't be
touched again. This requires a new page flag. Define it here.

The page flags wars seem to be over, so it shouldn't be a problem
to get a new one.

v2: Add TestSetHWPoison (suggested by Johannes Weiner)

Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/page-flags.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2b87acfc5f87..9bc5fd9fdbf6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -51,6 +51,9 @@
  * PG_buddy is set to indicate that the page is free and in the buddy system
  * (see mm/page_alloc.c).
  *
+ * PG_hwpoison indicates that a page got corrupted in hardware and contains
+ * data with incorrect ECC bits that triggered a machine check. Accessing is
+ * not safe since it may cause another machine check. Don't touch!
  */
 
 /*
@@ -101,6 +104,9 @@ enum pageflags {
 #endif
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
 	PG_uncached,		/* Page has been mapped as uncached */
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
 	__NR_PAGEFLAGS,
 
@@ -263,6 +269,15 @@ PAGEFLAG(Uncached, uncached)
 PAGEFLAG_FALSE(Uncached)
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+PAGEFLAG(HWPoison, hwpoison)
+TESTSETFLAG(HWPoison, hwpoison)
+#define __PG_HWPOISON (1UL << PG_hwpoison)
+#else
+PAGEFLAG_FALSE(HWPoison)
+#define __PG_HWPOISON 0
+#endif
+
 static inline int PageUptodate(struct page *page)
 {
 	int ret = test_bit(PG_uptodate, &(page)->flags);
@@ -387,7 +402,7 @@ static inline void __ClearPageTail(struct page *page)
 	 1 << PG_private | 1 << PG_private_2 | \
 	 1 << PG_buddy	 | 1 << PG_writeback | 1 << PG_reserved | \
 	 1 << PG_slab	 | 1 << PG_swapcache | 1 << PG_active | \
-	 1 << PG_unevictable | __PG_MLOCKED)
+	 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
-- 
cgit v1.2.3-71-gd317


From 10be22dfe1e6ad978269dc275147e0ed049187bb Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:04 +0200
Subject: HWPOISON: Export some rmap vma locking to outside world

Needed for later patch that walks rmap entries on its own.

This used to be very frowned upon, but memory-failure.c does
some rather specialized rmap walking and rmap has been stable
for quite some time, so I think it's ok now to export it.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/rmap.h | 6 ++++++
 mm/rmap.c            | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf116d0dbf23..8dff2ffab82c 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -112,6 +112,12 @@ int page_mkclean(struct page *);
  */
 int try_to_munlock(struct page *);
 
+/*
+ * Called by memory-failure.c to kill processes.
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page);
+void page_unlock_anon_vma(struct anon_vma *anon_vma);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
diff --git a/mm/rmap.c b/mm/rmap.c
index 0895b5c7cbff..5a35c030e779 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -191,7 +191,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	unsigned long anon_mapping;
@@ -211,7 +211,7 @@ out:
 	return NULL;
 }
 
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	spin_unlock(&anon_vma->lock);
 	rcu_read_unlock();
-- 
cgit v1.2.3-71-gd317


From a7420aa54dbf699a5a05feba3c859b6baaa3938c Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:05 +0200
Subject: HWPOISON: Add support for poison swap entries v2

Memory migration uses special swap entry types to trigger special actions on
page faults. Extend this mechanism to also support poisoned swap entries, to
trigger poison handling on page faults. This allows follow-on patches to
prevent processes from faulting in poisoned pages again.

v2: Fix overflow in MAX_SWAPFILES (Fengguang Wu)
v3: Better overflow fix (Hidehiro Kawai)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/swap.h    | 34 ++++++++++++++++++++++++++++------
 include/linux/swapops.h | 38 ++++++++++++++++++++++++++++++++++++++
 mm/swapfile.c           |  4 ++--
 3 files changed, 68 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7c15334f3ff2..f077e454c659 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -34,15 +34,37 @@ static inline int current_is_kswapd(void)
  * the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT	5
-#ifndef CONFIG_MIGRATION
-#define MAX_SWAPFILES		(1 << MAX_SWAPFILES_SHIFT)
+
+/*
+ * Use some of the swap files numbers for other purposes. This
+ * is a convenient way to hook into the VM to trigger special
+ * actions on faults.
+ */
+
+/*
+ * NUMA node memory migration support
+ */
+#ifdef CONFIG_MIGRATION
+#define SWP_MIGRATION_NUM 2
+#define SWP_MIGRATION_READ	(MAX_SWAPFILES + SWP_HWPOISON_NUM)
+#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
 #else
-/* Use last two entries for page migration swap entries */
-#define MAX_SWAPFILES		((1 << MAX_SWAPFILES_SHIFT)-2)
-#define SWP_MIGRATION_READ	MAX_SWAPFILES
-#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)
+#define SWP_MIGRATION_NUM 0
 #endif
 
+/*
+ * Handling of hardware poisoned pages with memory corruption.
+ */
+#ifdef CONFIG_MEMORY_FAILURE
+#define SWP_HWPOISON_NUM 1
+#define SWP_HWPOISON		MAX_SWAPFILES
+#else
+#define SWP_HWPOISON_NUM 0
+#endif
+
+#define MAX_SWAPFILES \
+	((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+
 /*
  * Magic header for a swap area. The first part of the union is
  * what the swap magic looks like for the old (limited to 128MB)
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 6ec39ab27b4b..cd42e30b7c6e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -131,3 +131,41 @@ static inline int is_write_migration_entry(swp_entry_t entry)
 
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Support for hardware poisoned pages
+ */
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	return swp_entry(SWP_HWPOISON, page_to_pfn(page));
+}
+
+static inline int is_hwpoison_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_HWPOISON;
+}
+#else
+
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+	return swp_entry(0, 0);
+}
+
+static inline int is_hwpoison_entry(swp_entry_t swp)
+{
+	return 0;
+}
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
+static inline int non_swap_entry(swp_entry_t entry)
+{
+	return swp_type(entry) >= MAX_SWAPFILES;
+}
+#else
+static inline int non_swap_entry(swp_entry_t entry)
+{
+	return 0;
+}
+#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 74f1102e8749..ce5dda6d604b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
 	struct swap_info_struct *p;
 	struct page *page = NULL;
 
-	if (is_migration_entry(entry))
+	if (non_swap_entry(entry))
 		return 1;
 
 	p = swap_info_get(entry);
@@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
 	int count;
 	bool has_cache;
 
-	if (is_migration_entry(entry))
+	if (non_swap_entry(entry))
 		return -EINVAL;
 
 	type = swp_type(entry);
-- 
cgit v1.2.3-71-gd317


From d1737fdbec7f90edc52dd0c5c3767457f28e78d8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:06 +0200
Subject: HWPOISON: Add basic support for poisoned pages in fault handler v3

- Add a new VM_FAULT_HWPOISON error code to handle_mm_fault. Right now
architectures have to explicitely enable poison page support, so
this is forward compatible to all architectures. They only need
to add it when they enable poison page support.
- Add poison page handling in swap in fault code

v2: Add missing delayacct_clear_flag (Hidehiro Kawai)
v3: Really use delayacct_clear_flag (Hidehiro Kawai)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/mm.h |  3 ++-
 mm/memory.c        | 18 +++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9a72cc78e6b8..082b68cb5ffe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -685,11 +685,12 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
+#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned page */
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 
-#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
 
 /*
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce3aba4..02bae2d540d4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1319,7 +1319,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				if (ret & VM_FAULT_ERROR) {
 					if (ret & VM_FAULT_OOM)
 						return i ? i : -ENOMEM;
-					else if (ret & VM_FAULT_SIGBUS)
+					if (ret &
+					    (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
 						return i ? i : -EFAULT;
 					BUG();
 				}
@@ -2511,8 +2512,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
-	if (is_migration_entry(entry)) {
-		migration_entry_wait(mm, pmd, address);
+	if (unlikely(non_swap_entry(entry))) {
+		if (is_migration_entry(entry)) {
+			migration_entry_wait(mm, pmd, address);
+		} else if (is_hwpoison_entry(entry)) {
+			ret = VM_FAULT_HWPOISON;
+		} else {
+			print_bad_pte(vma, address, orig_pte, NULL);
+			ret = VM_FAULT_OOM;
+		}
 		goto out;
 	}
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2536,6 +2544,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
+	} else if (PageHWPoison(page)) {
+		ret = VM_FAULT_HWPOISON;
+		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		goto out;
 	}
 
 	lock_page(page);
-- 
cgit v1.2.3-71-gd317


From 14fa31b89c5ae79e4131da41761378a6df674352 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:10 +0200
Subject: HWPOISON: Use bitmask/action code for try_to_unmap behaviour

try_to_unmap currently has multiple modi (migration, munlock, normal unmap)
which are selected by magic flag variables. The logic is not very straight
forward, because each of these flag change multiple behaviours (e.g.
migration turns off aging, not only sets up migration ptes etc.)
Also the different flags interact in magic ways.

A later patch in this series adds another mode to try_to_unmap, so
this becomes quickly unmanageable.

Replace the different flags with a action code (migration, munlock, munmap)
and some additional flags as modifiers (ignore mlock, ignore aging).
This makes the logic more straight forward and allows easier extension
to new behaviours. Change all the caller to declare what they want to
do.

This patch is supposed to be a nop in behaviour. If anyone can prove
it is not that would be a bug.

Cc: Lee.Schermerhorn@hp.com
Cc: npiggin@suse.de

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/rmap.h | 13 ++++++++++++-
 mm/migrate.c         |  2 +-
 mm/rmap.c            | 40 ++++++++++++++++++++++------------------
 mm/vmscan.c          |  2 +-
 4 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 8dff2ffab82c..4c4a2d4d289e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -85,7 +85,18 @@ static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma,
  */
 int page_referenced(struct page *, int is_locked,
 			struct mem_cgroup *cnt, unsigned long *vm_flags);
-int try_to_unmap(struct page *, int ignore_refs);
+enum ttu_flags {
+	TTU_UNMAP = 0,			/* unmap mode */
+	TTU_MIGRATION = 1,		/* migration mode */
+	TTU_MUNLOCK = 2,		/* munlock mode */
+	TTU_ACTION_MASK = 0xff,
+
+	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
+	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
+};
+#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
+
+int try_to_unmap(struct page *, enum ttu_flags flags);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
diff --git a/mm/migrate.c b/mm/migrate.c
index 939888f9ddab..e3a0cd3859a9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -669,7 +669,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	}
 
 	/* Establish migration ptes or remove ptes */
-	try_to_unmap(page, 1);
+	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 
 	if (!page_mapped(page))
 		rc = move_to_new_page(newpage, page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 5a35c030e779..08c112a776a7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -774,7 +774,7 @@ void page_remove_rmap(struct page *page)
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-				int migration)
+				enum ttu_flags flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -796,11 +796,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration) {
+	if (!(flags & TTU_IGNORE_MLOCK)) {
 		if (vma->vm_flags & VM_LOCKED) {
 			ret = SWAP_MLOCK;
 			goto out_unmap;
 		}
+	}
+	if (!(flags & TTU_IGNORE_ACCESS)) {
 		if (ptep_clear_flush_young_notify(vma, address, pte)) {
 			ret = SWAP_FAIL;
 			goto out_unmap;
@@ -840,12 +842,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * pte. do_swap_page() will wait until the migration
 			 * pte is removed and then restart fault handling.
 			 */
-			BUG_ON(!migration);
+			BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
 			entry = make_migration_entry(page, pte_write(pteval));
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
-	} else if (PAGE_MIGRATION && migration) {
+	} else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
 		/* Establish migration entry for a file page */
 		swp_entry_t entry;
 		entry = make_migration_entry(page, pte_write(pteval));
@@ -1014,12 +1016,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	unsigned int mlocked = 0;
 	int ret = SWAP_AGAIN;
+	int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
 	if (MLOCK_PAGES && unlikely(unlock))
 		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
@@ -1035,7 +1038,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 				continue;  /* must visit all unlocked vmas */
 			ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
 		} else {
-			ret = try_to_unmap_one(page, vma, migration);
+			ret = try_to_unmap_one(page, vma, flags);
 			if (ret == SWAP_FAIL || !page_mapped(page))
 				break;
 		}
@@ -1059,8 +1062,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
- * @unlock:  request for unlock rather than unmap [unlikely]
- * @migration:  unmapping for migration - ignored if @unlock
+ * @flags: action and flags
  *
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
@@ -1072,7 +1074,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, int unlock, int migration)
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1084,6 +1086,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
 	unsigned int mlocked = 0;
+	int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
 	if (MLOCK_PAGES && unlikely(unlock))
 		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
@@ -1096,7 +1099,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 				continue;	/* must visit all vmas */
 			ret = SWAP_MLOCK;
 		} else {
-			ret = try_to_unmap_one(page, vma, migration);
+			ret = try_to_unmap_one(page, vma, flags);
 			if (ret == SWAP_FAIL || !page_mapped(page))
 				goto out;
 		}
@@ -1121,7 +1124,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 			ret = SWAP_MLOCK;	/* leave mlocked == 0 */
 			goto out;		/* no need to look further */
 		}
-		if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
+		if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
+			(vma->vm_flags & VM_LOCKED))
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -1155,7 +1159,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if (!MLOCK_PAGES && !migration &&
+			if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
 			    (vma->vm_flags & VM_LOCKED))
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
@@ -1195,7 +1199,7 @@ out:
 /**
  * try_to_unmap - try to remove all page table mappings to a page
  * @page: the page to get unmapped
- * @migration: migration flag
+ * @flags: action and flags
  *
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock.
@@ -1206,16 +1210,16 @@ out:
  * SWAP_FAIL	- the page is unswappable
  * SWAP_MLOCK	- page is mlocked.
  */
-int try_to_unmap(struct page *page, int migration)
+int try_to_unmap(struct page *page, enum ttu_flags flags)
 {
 	int ret;
 
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
-		ret = try_to_unmap_anon(page, 0, migration);
+		ret = try_to_unmap_anon(page, flags);
 	else
-		ret = try_to_unmap_file(page, 0, migration);
+		ret = try_to_unmap_file(page, flags);
 	if (ret != SWAP_MLOCK && !page_mapped(page))
 		ret = SWAP_SUCCESS;
 	return ret;
@@ -1240,8 +1244,8 @@ int try_to_munlock(struct page *page)
 	VM_BUG_ON(!PageLocked(page) || PageLRU(page));
 
 	if (PageAnon(page))
-		return try_to_unmap_anon(page, 1, 0);
+		return try_to_unmap_anon(page, TTU_MUNLOCK);
 	else
-		return try_to_unmap_file(page, 1, 0);
+		return try_to_unmap_file(page, TTU_MUNLOCK);
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ba8228e0a806..ab3b0ad3ce52 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -659,7 +659,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page, 0)) {
+			switch (try_to_unmap(page, TTU_UNMAP)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
-- 
cgit v1.2.3-71-gd317


From 888b9f7c58ebe8303bad817cd554df887a683957 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 16 Sep 2009 11:50:11 +0200
Subject: HWPOISON: Handle hardware poisoned pages in try_to_unmap

When a page has the poison bit set replace the PTE with a poison entry.
This causes the right error handling to be done later when a process runs
into it.

v2: add a new flag to not do that (needed for the memory-failure handler
later) (Fengguang)
v3: remove unnecessary is_migration_entry() test (Fengguang, Minchan)

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/rmap.h | 1 +
 mm/rmap.c            | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 4c4a2d4d289e..ce989f1fc2ed 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -93,6 +93,7 @@ enum ttu_flags {
 
 	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
 	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
+	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
 };
 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 08c112a776a7..7e72ca19d68b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -820,7 +820,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
 
-	if (PageAnon(page)) {
+	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+		if (PageAnon(page))
+			dec_mm_counter(mm, anon_rss);
+		else
+			dec_mm_counter(mm, file_rss);
+		set_pte_at(mm, address, pte,
+				swp_entry_to_pte(make_hwpoison_entry(page)));
+	} else if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
 
 		if (PageSwapCache(page)) {
-- 
cgit v1.2.3-71-gd317


From 750b4987b0cd4d408e54cb83a80a067cbe690feb Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 16 Sep 2009 11:50:12 +0200
Subject: HWPOISON: Refactor truncate to allow direct truncating of page v2

Extract out truncate_inode_page() out of the truncate path so that
it can be used by memory-failure.c

[AK: description, headers, fix typos]
v2: Some white space changes from Fengguang Wu

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/mm.h |  2 ++
 mm/truncate.c      | 29 +++++++++++++++--------------
 2 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 082b68cb5ffe..8cbc0aafd5bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -794,6 +794,8 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 
+int truncate_inode_page(struct address_space *mapping, struct page *page);
+
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..2519a7c92873 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
  */
-static void
+static int
 truncate_complete_page(struct address_space *mapping, struct page *page)
 {
 	if (page->mapping != mapping)
-		return;
+		return -EIO;
 
 	if (page_has_private(page))
 		do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	remove_from_page_cache(page);
 	ClearPageMappedToDisk(page);
 	page_cache_release(page);	/* pagecache ref */
+	return 0;
 }
 
 /*
@@ -135,6 +136,16 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	return ret;
 }
 
+int truncate_inode_page(struct address_space *mapping, struct page *page)
+{
+	if (page_mapped(page)) {
+		unmap_mapping_range(mapping,
+				   (loff_t)page->index << PAGE_CACHE_SHIFT,
+				   PAGE_CACHE_SIZE, 0);
+	}
+	return truncate_complete_page(mapping, page);
+}
+
 /**
  * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
  * @mapping: mapping to truncate
@@ -196,12 +207,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				unlock_page(page);
 				continue;
 			}
-			if (page_mapped(page)) {
-				unmap_mapping_range(mapping,
-				  (loff_t)page_index<<PAGE_CACHE_SHIFT,
-				  PAGE_CACHE_SIZE, 0);
-			}
-			truncate_complete_page(mapping, page);
+			truncate_inode_page(mapping, page);
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
@@ -238,15 +244,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
-			if (page_mapped(page)) {
-				unmap_mapping_range(mapping,
-				  (loff_t)page->index<<PAGE_CACHE_SHIFT,
-				  PAGE_CACHE_SIZE, 0);
-			}
+			truncate_inode_page(mapping, page);
 			if (page->index > next)
 				next = page->index;
 			next++;
-			truncate_complete_page(mapping, page);
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
-- 
cgit v1.2.3-71-gd317


From 83f786680aec8d030184f7ced1a0a3dd8ac81764 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 16 Sep 2009 11:50:13 +0200
Subject: HWPOISON: Add invalidate_inode_page

Add a simple way to invalidate a single page
This is just a refactoring of the truncate.c code.
Originally from Fengguang, modified by Andi Kleen.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/mm.h |  2 ++
 mm/truncate.c      | 26 ++++++++++++++++++++------
 2 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8cbc0aafd5bd..b05bbde0296d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -796,6 +796,8 @@ extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 
 int truncate_inode_page(struct address_space *mapping, struct page *page);
 
+int invalidate_inode_page(struct page *page);
+
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
diff --git a/mm/truncate.c b/mm/truncate.c
index 2519a7c92873..ea132f7ea2d2 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -146,6 +146,24 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
 	return truncate_complete_page(mapping, page);
 }
 
+/*
+ * Safely invalidate one page from its pagecache mapping.
+ * It only drops clean, unused pages. The page must be locked.
+ *
+ * Returns 1 if the page is successfully invalidated, otherwise 0.
+ */
+int invalidate_inode_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+	if (page_mapped(page))
+		return 0;
+	return invalidate_complete_page(mapping, page);
+}
+
 /**
  * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
  * @mapping: mapping to truncate
@@ -312,12 +330,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 			if (lock_failed)
 				continue;
 
-			if (PageDirty(page) || PageWriteback(page))
-				goto unlock;
-			if (page_mapped(page))
-				goto unlock;
-			ret += invalidate_complete_page(mapping, page);
-unlock:
+			ret += invalidate_inode_page(page);
+
 			unlock_page(page);
 			if (next > end)
 				break;
-- 
cgit v1.2.3-71-gd317


From 257187362123f15d9d1e09918cf87cebbea4e786 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:13 +0200
Subject: HWPOISON: Define a new error_remove_page address space op for async
 truncation

Truncating metadata pages is not safe right now before
we haven't audited all file systems.

To enable truncation only for data address space define
a new address_space callback error_remove_page.

This is used for memory_failure.c memory error handling.

This can be then set to truncate_inode_page()

This patch just defines the new operation and adds documentation.

Callers and users come in followon patches.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 Documentation/filesystems/vfs.txt |  7 +++++++
 include/linux/fs.h                |  1 +
 include/linux/mm.h                |  1 +
 mm/truncate.c                     | 17 +++++++++++++++++
 4 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f49eecf2e573..623f094c9d8d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -536,6 +536,7 @@ struct address_space_operations {
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
+	int (*error_remove_page) (struct mapping *mapping, struct page *page);
 };
 
   writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +695,12 @@ struct address_space_operations {
   	prevent redirtying the page, it is kept locked during the whole
 	operation.
 
+  error_remove_page: normally set to generic_error_remove_page if truncation
+	is ok for this address space. Used for memory failure handling.
+	Setting this implies you deal with pages going away under you,
+	unless you have them locked or reference counts increased.
+
+
 The File Object
 ===============
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b21cf6b9c80b..4f47afd37647 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -595,6 +595,7 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
+	int (*error_remove_page)(struct address_space *, struct page *);
 };
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b05bbde0296d..a16018f7d61c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -795,6 +795,7 @@ extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 
 int truncate_inode_page(struct address_space *mapping, struct page *page);
+int generic_error_remove_page(struct address_space *mapping, struct page *page);
 
 int invalidate_inode_page(struct page *page);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index ea132f7ea2d2..a17b3977cfdf 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -146,6 +146,23 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
 	return truncate_complete_page(mapping, page);
 }
 
+/*
+ * Used to get rid of pages on hardware memory corruption.
+ */
+int generic_error_remove_page(struct address_space *mapping, struct page *page)
+{
+	if (!mapping)
+		return -EINVAL;
+	/*
+	 * Only punch for normal data pages for now.
+	 * Handling other types like directories would need more auditing.
+	 */
+	if (!S_ISREG(mapping->host->i_mode))
+		return -EIO;
+	return truncate_inode_page(mapping, page);
+}
+EXPORT_SYMBOL(generic_error_remove_page);
+
 /*
  * Safely invalidate one page from its pagecache mapping.
  * It only drops clean, unused pages. The page must be locked.
-- 
cgit v1.2.3-71-gd317


From 4db96cf077aa938b11fe7ac79ecc9b29ec00fbab Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:14 +0200
Subject: HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per
 process

This allows processes to override their early/late kill
behaviour on hardware memory errors.

Typically applications which are memory error aware is
better of with early kill (see the error as soon
as possible), all others with late kill (only
see the error when the error is really impacting execution)

There's a global sysctl, but this way an application
can set its specific policy.

We're using two bits, one to signify that the process
stated its intention and that

I also made the prctl future proof by enforcing
the unused arguments are 0.

The state is inherited to children.

Note this makes us officially run out of process flags
on 32bit, but the next patch can easily add another field.

Manpage patch will be supplied separately.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/prctl.h |  2 ++
 include/linux/sched.h |  2 ++
 kernel/sys.c          | 22 ++++++++++++++++++++++
 3 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index b00df4c79c63..3dc303197e67 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,4 +88,6 @@
 #define PR_TASK_PERF_COUNTERS_DISABLE		31
 #define PR_TASK_PERF_COUNTERS_ENABLE		32
 
+#define PR_MCE_KILL	33
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3d74bd04d18..29eae73c951d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1687,6 +1687,7 @@ extern cputime_t task_gtime(struct task_struct *p);
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+#define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
@@ -1706,6 +1707,7 @@ extern cputime_t task_gtime(struct task_struct *p);
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_THREAD_BOUND	0x04000000	/* Thread bound to specific cpu */
+#define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..41e02eff3398 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1528,6 +1528,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				current->timer_slack_ns = arg2;
 			error = 0;
 			break;
+		case PR_MCE_KILL:
+			if (arg4 | arg5)
+				return -EINVAL;
+			switch (arg2) {
+			case 0:
+				if (arg3 != 0)
+					return -EINVAL;
+				current->flags &= ~PF_MCE_PROCESS;
+				break;
+			case 1:
+				current->flags |= PF_MCE_PROCESS;
+				if (arg3 != 0)
+					current->flags |= PF_MCE_EARLY;
+				else
+					current->flags &= ~PF_MCE_EARLY;
+				break;
+			default:
+				return -EINVAL;
+			}
+			error = 0;
+			break;
+
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3-71-gd317


From 6a46079cf57a7f7758e8b926980a4f852f89b34d Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 16 Sep 2009 11:50:15 +0200
Subject: HWPOISON: The high level memory error handler in the VM v7

Add the high level memory handler that poisons pages
that got corrupted by hardware (typically by a two bit flip in a DIMM
or a cache) on the Linux level. The goal is to prevent everyone
from accessing these pages in the future.

This done at the VM level by marking a page hwpoisoned
and doing the appropriate action based on the type of page
it is.

The code that does this is portable and lives in mm/memory-failure.c

To quote the overview comment:

High level machine check handler. Handles pages reported by the
hardware as being corrupted usually due to a 2bit ECC memory or cache
failure.

This focuses on pages detected as corrupted in the background.
When the current CPU tries to consume corruption the currently
running process can just be killed directly instead. This implies
that if the error cannot be handled for some reason it's safe to
just ignore it because no corruption has been consumed yet. Instead
when that happens another machine check will happen.

Handles page cache pages in various states. The tricky part
here is that we can access any page asynchronous to other VM
users, because memory failures could happen anytime and anywhere,
possibly violating some of their assumptions. This is why this code
has to be extremely careful. Generally it tries to use normal locking
rules, as in get the standard locks, even if that means the
error handling takes potentially a long time.

Some of the operations here are somewhat inefficient and have non
linear algorithmic complexity, because the data structures have not
been optimized for this case. This is in particular the case
for the mapping from a vma to a process. Since this case is expected
to be rare we hope we can get away with this.

There are in principle two strategies to kill processes on poison:
- just unmap the data and wait for an actual reference before
killing
- kill as soon as corruption is detected.
Both have advantages and disadvantages and should be used
in different situations. Right now both are implemented and can
be switched with a new sysctl vm.memory_failure_early_kill
The default is early kill.

The patch does some rmap data structure walking on its own to collect
processes to kill. This is unusual because normally all rmap data structure
knowledge is in rmap.c only. I put it here for now to keep
everything together and rmap knowledge has been seeping out anyways

Includes contributions from Johannes Weiner, Chris Mason, Fengguang Wu,
Nick Piggin (who did a lot of great work) and others.

Cc: npiggin@suse.de
Cc: riel@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
---
 Documentation/sysctl/vm.txt |  41 ++-
 fs/proc/meminfo.c           |   9 +-
 include/linux/mm.h          |   7 +
 include/linux/rmap.h        |   1 +
 kernel/sysctl.c             |  25 ++
 mm/Kconfig                  |  10 +
 mm/Makefile                 |   1 +
 mm/filemap.c                |   4 +
 mm/memory-failure.c         | 832 ++++++++++++++++++++++++++++++++++++++++++++
 mm/rmap.c                   |   7 +-
 10 files changed, 934 insertions(+), 3 deletions(-)
 create mode 100644 mm/memory-failure.c

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index c4de6359d440..faf62740aa2c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- memory_failure_early_kill
+- memory_failure_recovery
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - zone_reclaim_mode
 
-
 ==============================================================
 
 block_dump
@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
 
 The default value is 65536.
 
+=============================================================
+
+memory_failure_early_kill:
+
+Control how to kill processes when uncorrected memory error (typically
+a 2bit error in a memory module) is detected in the background by hardware
+that cannot be handled by the kernel. In some cases (like the page
+still having a valid copy on disk) the kernel will handle the failure
+transparently without affecting any applications. But if there is
+no other uptodate copy of the data it will kill to prevent any data
+corruptions from propagating.
+
+1: Kill all processes that have the corrupted and not reloadable page mapped
+as soon as the corruption is detected.  Note this is not supported
+for a few types of pages, like kernel internally allocated data or
+the swap cache, but works for the majority of user pages.
+
+0: Only unmap the corrupted page from all processes and only kill a process
+who tries to access it.
+
+The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+handle this if they want to.
+
+This is only active on architectures/platforms with advanced machine
+check handling and depends on the hardware capabilities.
+
+Applications can override this setting individually with the PR_MCE_KILL prctl
+
+==============================================================
+
+memory_failure_recovery
+
+Enable memory failure recovery (when supported by the platform)
+
+1: Attempt recovery.
+
+0: Always panic on a memory failure.
+
 ==============================================================
 
 min_free_kbytes:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..78faedcb0a8d 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -95,7 +95,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"Committed_AS:   %8lu kB\n"
 		"VmallocTotal:   %8lu kB\n"
 		"VmallocUsed:    %8lu kB\n"
-		"VmallocChunk:   %8lu kB\n",
+		"VmallocChunk:   %8lu kB\n"
+#ifdef CONFIG_MEMORY_FAILURE
+		"HardwareCorrupted: %8lu kB\n"
+#endif
+		,
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
@@ -140,6 +144,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
 		vmi.largest_chunk >> 10
+#ifdef CONFIG_MEMORY_FAILURE
+		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+#endif
 		);
 
 	hugetlb_report_meminfo(m);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a16018f7d61c..1ffca03f34b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1309,5 +1309,12 @@ void vmemmap_populate_print_last(void);
 extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
 				 size_t size);
 extern void refund_locked_memory(struct mm_struct *mm, size_t size);
+
+extern void memory_failure(unsigned long pfn, int trapno);
+extern int __memory_failure(unsigned long pfn, int trapno, int ref);
+extern int sysctl_memory_failure_early_kill;
+extern int sysctl_memory_failure_recovery;
+extern atomic_long_t mce_bad_pages;
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index ce989f1fc2ed..3c1004e50747 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -129,6 +129,7 @@ int try_to_munlock(struct page *);
  */
 struct anon_vma *page_lock_anon_vma(struct page *page);
 void page_unlock_anon_vma(struct anon_vma *anon_vma);
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 #else	/* !CONFIG_MMU */
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6bb59f707402..eacae77ac9fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1372,6 +1372,31 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &scan_unevictable_handler,
 	},
+#ifdef CONFIG_MEMORY_FAILURE
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "memory_failure_early_kill",
+		.data		= &sysctl_memory_failure_early_kill,
+		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "memory_failure_recovery",
+		.data		= &sysctl_memory_failure_recovery,
+		.maxlen		= sizeof(sysctl_memory_failure_recovery),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
+
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa519f52e18..ea2d8b61c631 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -233,6 +233,16 @@ config DEFAULT_MMAP_MIN_ADDR
 	  /proc/sys/vm/mmap_min_addr tunable.
 
 
+config MEMORY_FAILURE
+	depends on MMU
+	depends on X86_MCE
+	bool "Enable recovery from hardware memory errors"
+	help
+	  Enables code to recover from some memory failures on systems
+	  with MCA recovery. This allows a system to continue running
+	  even when some of its memory has uncorrected errors. This requires
+	  special hardware support and typically ECC memory.
+
 config NOMMU_INITIAL_TRIM_EXCESS
 	int "Turn on mmap() excess space trimming before booting"
 	depends on !MMU
diff --git a/mm/Makefile b/mm/Makefile
index ea4b18bd3960..dc2551e7d006 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -40,5 +40,6 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/filemap.c b/mm/filemap.c
index dd51c68e2b86..75575c392167 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -104,6 +104,10 @@
  *
  *  ->task->proc_lock
  *    ->dcache_lock		(proc_pid_lookup)
+ *
+ *  (code doesn't rely on that order, so you could switch it around)
+ *  ->tasklist_lock             (memory_failure, collect_procs_ao)
+ *    ->i_mmap_lock
  */
 
 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..729d4b15b645
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Authors: Andi Kleen, Fengguang Wu
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ *
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * failure.
+ *
+ * Handles page cache pages in various states.	The tricky part
+ * here is that we can access any page asynchronous to other VM
+ * users, because memory failures could happen anytime and anywhere,
+ * possibly violating some of their assumptions. This is why this code
+ * has to be extremely careful. Generally it tries to use normal locking
+ * rules, as in get the standard locks, even if that means the
+ * error handling takes potentially a long time.
+ *
+ * The operation to map back from RMAP chains to processes has to walk
+ * the complete process list and has non linear complexity with the number
+ * mappings. In short it can be quite slow. But since memory corruptions
+ * are rare we hope to get away with this.
+ */
+
+/*
+ * Notebook:
+ * - hugetlb needs more code
+ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
+ * - pass bad pages to kdump next kernel
+ */
+#define DEBUG 1		/* remove me in 2.6.34 */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/backing-dev.h>
+#include "internal.h"
+
+int sysctl_memory_failure_early_kill __read_mostly = 0;
+
+int sysctl_memory_failure_recovery __read_mostly = 1;
+
+atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+
+/*
+ * Send all the processes who have the page mapped an ``action optional''
+ * signal.
+ */
+static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
+			unsigned long pfn)
+{
+	struct siginfo si;
+	int ret;
+
+	printk(KERN_ERR
+		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+		pfn, t->comm, t->pid);
+	si.si_signo = SIGBUS;
+	si.si_errno = 0;
+	si.si_code = BUS_MCEERR_AO;
+	si.si_addr = (void *)addr;
+#ifdef __ARCH_SI_TRAPNO
+	si.si_trapno = trapno;
+#endif
+	si.si_addr_lsb = PAGE_SHIFT;
+	/*
+	 * Don't use force here, it's convenient if the signal
+	 * can be temporarily blocked.
+	 * This could cause a loop when the user sets SIGBUS
+	 * to SIG_IGN, but hopefully noone will do that?
+	 */
+	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
+	if (ret < 0)
+		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
+		       t->comm, t->pid, ret);
+	return ret;
+}
+
+/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handle it.
+ */
+
+struct to_kill {
+	struct list_head nd;
+	struct task_struct *tsk;
+	unsigned long addr;
+	unsigned addr_valid:1;
+};
+
+/*
+ * Failure handling: if we can't find or can't kill a process there's
+ * not much we can do.	We just print a message and ignore otherwise.
+ */
+
+/*
+ * Schedule a process for later kill.
+ * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ * TBD would GFP_NOIO be enough?
+ */
+static void add_to_kill(struct task_struct *tsk, struct page *p,
+		       struct vm_area_struct *vma,
+		       struct list_head *to_kill,
+		       struct to_kill **tkc)
+{
+	struct to_kill *tk;
+
+	if (*tkc) {
+		tk = *tkc;
+		*tkc = NULL;
+	} else {
+		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+		if (!tk) {
+			printk(KERN_ERR
+		"MCE: Out of memory while machine check handling\n");
+			return;
+		}
+	}
+	tk->addr = page_address_in_vma(p, vma);
+	tk->addr_valid = 1;
+
+	/*
+	 * In theory we don't have to kill when the page was
+	 * munmaped. But it could be also a mremap. Since that's
+	 * likely very rare kill anyways just out of paranoia, but use
+	 * a SIGKILL because the error is not contained anymore.
+	 */
+	if (tk->addr == -EFAULT) {
+		pr_debug("MCE: Unable to find user space address %lx in %s\n",
+			page_to_pfn(p), tsk->comm);
+		tk->addr_valid = 0;
+	}
+	get_task_struct(tsk);
+	tk->tsk = tsk;
+	list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Kill the processes that have been collected earlier.
+ *
+ * Only do anything when DOIT is set, otherwise just free the list
+ * (this is used for clean pages which do not need killing)
+ * Also when FAIL is set do a force kill because something went
+ * wrong earlier.
+ */
+static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
+			  int fail, unsigned long pfn)
+{
+	struct to_kill *tk, *next;
+
+	list_for_each_entry_safe (tk, next, to_kill, nd) {
+		if (doit) {
+			/*
+			 * In case something went wrong with munmaping
+			 * make sure the process doesn't catch the
+			 * signal and then access the memory. Just kill it.
+			 * the signal handlers
+			 */
+			if (fail || tk->addr_valid == 0) {
+				printk(KERN_ERR
+		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+					pfn, tk->tsk->comm, tk->tsk->pid);
+				force_sig(SIGKILL, tk->tsk);
+			}
+
+			/*
+			 * In theory the process could have mapped
+			 * something else on the address in-between. We could
+			 * check for that, but we need to tell the
+			 * process anyways.
+			 */
+			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
+					      pfn) < 0)
+				printk(KERN_ERR
+		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+					pfn, tk->tsk->comm, tk->tsk->pid);
+		}
+		put_task_struct(tk->tsk);
+		kfree(tk);
+	}
+}
+
+static int task_early_kill(struct task_struct *tsk)
+{
+	if (!tsk->mm)
+		return 0;
+	if (tsk->flags & PF_MCE_PROCESS)
+		return !!(tsk->flags & PF_MCE_EARLY);
+	return sysctl_memory_failure_early_kill;
+}
+
+/*
+ * Collect processes when the error hit an anonymous page.
+ */
+static void collect_procs_anon(struct page *page, struct list_head *to_kill,
+			      struct to_kill **tkc)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct anon_vma *av;
+
+	read_lock(&tasklist_lock);
+	av = page_lock_anon_vma(page);
+	if (av == NULL)	/* Not actually mapped anymore */
+		goto out;
+	for_each_process (tsk) {
+		if (!task_early_kill(tsk))
+			continue;
+		list_for_each_entry (vma, &av->head, anon_vma_node) {
+			if (!page_mapped_in_vma(page, vma))
+				continue;
+			if (vma->vm_mm == tsk->mm)
+				add_to_kill(tsk, page, vma, to_kill, tkc);
+		}
+	}
+	page_unlock_anon_vma(av);
+out:
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect processes when the error hit a file mapped page.
+ */
+static void collect_procs_file(struct page *page, struct list_head *to_kill,
+			      struct to_kill **tkc)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct prio_tree_iter iter;
+	struct address_space *mapping = page->mapping;
+
+	/*
+	 * A note on the locking order between the two locks.
+	 * We don't rely on this particular order.
+	 * If you have some other code that needs a different order
+	 * feel free to switch them around. Or add a reverse link
+	 * from mm_struct to task_struct, then this could be all
+	 * done without taking tasklist_lock and looping over all tasks.
+	 */
+
+	read_lock(&tasklist_lock);
+	spin_lock(&mapping->i_mmap_lock);
+	for_each_process(tsk) {
+		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+		if (!task_early_kill(tsk))
+			continue;
+
+		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+				      pgoff) {
+			/*
+			 * Send early kill signal to tasks where a vma covers
+			 * the page but the corrupted page is not necessarily
+			 * mapped it in its pte.
+			 * Assume applications who requested early kill want
+			 * to be informed of all such data corruptions.
+			 */
+			if (vma->vm_mm == tsk->mm)
+				add_to_kill(tsk, page, vma, to_kill, tkc);
+		}
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect the processes who have the corrupted page mapped to kill.
+ * This is done in two steps for locking reasons.
+ * First preallocate one tokill structure outside the spin locks,
+ * so that we can kill at least one process reasonably reliable.
+ */
+static void collect_procs(struct page *page, struct list_head *tokill)
+{
+	struct to_kill *tk;
+
+	if (!page->mapping)
+		return;
+
+	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
+	if (!tk)
+		return;
+	if (PageAnon(page))
+		collect_procs_anon(page, tokill, &tk);
+	else
+		collect_procs_file(page, tokill, &tk);
+	kfree(tk);
+}
+
+/*
+ * Error handlers for various types of pages.
+ */
+
+enum outcome {
+	FAILED,		/* Error handling failed */
+	DELAYED,	/* Will be handled later */
+	IGNORED,	/* Error safely ignored */
+	RECOVERED,	/* Successfully recovered */
+};
+
+static const char *action_name[] = {
+	[FAILED] = "Failed",
+	[DELAYED] = "Delayed",
+	[IGNORED] = "Ignored",
+	[RECOVERED] = "Recovered",
+};
+
+/*
+ * Error hit kernel page.
+ * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * could be more sophisticated.
+ */
+static int me_kernel(struct page *p, unsigned long pfn)
+{
+	return DELAYED;
+}
+
+/*
+ * Already poisoned page.
+ */
+static int me_ignore(struct page *p, unsigned long pfn)
+{
+	return IGNORED;
+}
+
+/*
+ * Page in unknown state. Do nothing.
+ */
+static int me_unknown(struct page *p, unsigned long pfn)
+{
+	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+	return FAILED;
+}
+
+/*
+ * Free memory
+ */
+static int me_free(struct page *p, unsigned long pfn)
+{
+	return DELAYED;
+}
+
+/*
+ * Clean (or cleaned) page cache page.
+ */
+static int me_pagecache_clean(struct page *p, unsigned long pfn)
+{
+	int err;
+	int ret = FAILED;
+	struct address_space *mapping;
+
+	if (!isolate_lru_page(p))
+		page_cache_release(p);
+
+	/*
+	 * For anonymous pages we're done the only reference left
+	 * should be the one m_f() holds.
+	 */
+	if (PageAnon(p))
+		return RECOVERED;
+
+	/*
+	 * Now truncate the page in the page cache. This is really
+	 * more like a "temporary hole punch"
+	 * Don't do this for block devices when someone else
+	 * has a reference, because it could be file system metadata
+	 * and that's not safe to truncate.
+	 */
+	mapping = page_mapping(p);
+	if (!mapping) {
+		/*
+		 * Page has been teared down in the meanwhile
+		 */
+		return FAILED;
+	}
+
+	/*
+	 * Truncation is a bit tricky. Enable it per file system for now.
+	 *
+	 * Open: to take i_mutex or not for this? Right now we don't.
+	 */
+	if (mapping->a_ops->error_remove_page) {
+		err = mapping->a_ops->error_remove_page(mapping, p);
+		if (err != 0) {
+			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
+					pfn, err);
+		} else if (page_has_private(p) &&
+				!try_to_release_page(p, GFP_NOIO)) {
+			pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+		} else {
+			ret = RECOVERED;
+		}
+	} else {
+		/*
+		 * If the file system doesn't support it just invalidate
+		 * This fails on dirty or anything with private pages
+		 */
+		if (invalidate_inode_page(p))
+			ret = RECOVERED;
+		else
+			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
+				pfn);
+	}
+	return ret;
+}
+
+/*
+ * Dirty cache page page
+ * Issues: when the error hit a hole page the error is not properly
+ * propagated.
+ */
+static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+{
+	struct address_space *mapping = page_mapping(p);
+
+	SetPageError(p);
+	/* TBD: print more information about the file. */
+	if (mapping) {
+		/*
+		 * IO error will be reported by write(), fsync(), etc.
+		 * who check the mapping.
+		 * This way the application knows that something went
+		 * wrong with its dirty file data.
+		 *
+		 * There's one open issue:
+		 *
+		 * The EIO will be only reported on the next IO
+		 * operation and then cleared through the IO map.
+		 * Normally Linux has two mechanisms to pass IO error
+		 * first through the AS_EIO flag in the address space
+		 * and then through the PageError flag in the page.
+		 * Since we drop pages on memory failure handling the
+		 * only mechanism open to use is through AS_AIO.
+		 *
+		 * This has the disadvantage that it gets cleared on
+		 * the first operation that returns an error, while
+		 * the PageError bit is more sticky and only cleared
+		 * when the page is reread or dropped.  If an
+		 * application assumes it will always get error on
+		 * fsync, but does other operations on the fd before
+		 * and the page is dropped inbetween then the error
+		 * will not be properly reported.
+		 *
+		 * This can already happen even without hwpoisoned
+		 * pages: first on metadata IO errors (which only
+		 * report through AS_EIO) or when the page is dropped
+		 * at the wrong time.
+		 *
+		 * So right now we assume that the application DTRT on
+		 * the first EIO, but we're not worse than other parts
+		 * of the kernel.
+		 */
+		mapping_set_error(mapping, EIO);
+	}
+
+	return me_pagecache_clean(p, pfn);
+}
+
+/*
+ * Clean and dirty swap cache.
+ *
+ * Dirty swap cache page is tricky to handle. The page could live both in page
+ * cache and swap cache(ie. page is freshly swapped in). So it could be
+ * referenced concurrently by 2 types of PTEs:
+ * normal PTEs and swap PTEs. We try to handle them consistently by calling
+ * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * and then
+ *      - clear dirty bit to prevent IO
+ *      - remove from LRU
+ *      - but keep in the swap cache, so that when we return to it on
+ *        a later page fault, we know the application is accessing
+ *        corrupted data and shall be killed (we installed simple
+ *        interception code in do_swap_page to catch it).
+ *
+ * Clean swap cache pages can be directly isolated. A later page fault will
+ * bring in the known good data from disk.
+ */
+static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+{
+	int ret = FAILED;
+
+	ClearPageDirty(p);
+	/* Trigger EIO in shmem: */
+	ClearPageUptodate(p);
+
+	if (!isolate_lru_page(p)) {
+		page_cache_release(p);
+		ret = DELAYED;
+	}
+
+	return ret;
+}
+
+static int me_swapcache_clean(struct page *p, unsigned long pfn)
+{
+	int ret = FAILED;
+
+	if (!isolate_lru_page(p)) {
+		page_cache_release(p);
+		ret = RECOVERED;
+	}
+	delete_from_swap_cache(p);
+	return ret;
+}
+
+/*
+ * Huge pages. Needs work.
+ * Issues:
+ * No rmap support so we cannot find the original mapper. In theory could walk
+ * all MMs and look for the mappings, but that would be non atomic and racy.
+ * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
+ * like just walking the current process and hoping it has it mapped (that
+ * should be usually true for the common "shared database cache" case)
+ * Should handle free huge pages and dequeue them too, but this needs to
+ * handle huge page accounting correctly.
+ */
+static int me_huge_page(struct page *p, unsigned long pfn)
+{
+	return FAILED;
+}
+
+/*
+ * Various page states we can handle.
+ *
+ * A page state is defined by its current page->flags bits.
+ * The table matches them in order and calls the right handler.
+ *
+ * This is quite tricky because we can access page at any time
+ * in its live cycle, so all accesses have to be extremly careful.
+ *
+ * This is not complete. More states could be added.
+ * For any missing state don't attempt recovery.
+ */
+
+#define dirty		(1UL << PG_dirty)
+#define sc		(1UL << PG_swapcache)
+#define unevict		(1UL << PG_unevictable)
+#define mlock		(1UL << PG_mlocked)
+#define writeback	(1UL << PG_writeback)
+#define lru		(1UL << PG_lru)
+#define swapbacked	(1UL << PG_swapbacked)
+#define head		(1UL << PG_head)
+#define tail		(1UL << PG_tail)
+#define compound	(1UL << PG_compound)
+#define slab		(1UL << PG_slab)
+#define buddy		(1UL << PG_buddy)
+#define reserved	(1UL << PG_reserved)
+
+static struct page_state {
+	unsigned long mask;
+	unsigned long res;
+	char *msg;
+	int (*action)(struct page *p, unsigned long pfn);
+} error_states[] = {
+	{ reserved,	reserved,	"reserved kernel",	me_ignore },
+	{ buddy,	buddy,		"free kernel",	me_free },
+
+	/*
+	 * Could in theory check if slab page is free or if we can drop
+	 * currently unused objects without touching them. But just
+	 * treat it as standard kernel for now.
+	 */
+	{ slab,		slab,		"kernel slab",	me_kernel },
+
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+	{ head,		head,		"huge",		me_huge_page },
+	{ tail,		tail,		"huge",		me_huge_page },
+#else
+	{ compound,	compound,	"huge",		me_huge_page },
+#endif
+
+	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
+	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },
+
+	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
+	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},
+
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
+	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
+	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },
+#endif
+
+	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
+	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
+	{ swapbacked,	swapbacked,	"anonymous",	me_pagecache_clean },
+
+	/*
+	 * Catchall entry: must be at end.
+	 */
+	{ 0,		0,		"unknown page state",	me_unknown },
+};
+
+#undef lru
+
+static void action_result(unsigned long pfn, char *msg, int result)
+{
+	struct page *page = NULL;
+	if (pfn_valid(pfn))
+		page = pfn_to_page(pfn);
+
+	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
+		pfn,
+		page && PageDirty(page) ? "dirty " : "",
+		msg, action_name[result]);
+}
+
+static int page_action(struct page_state *ps, struct page *p,
+			unsigned long pfn, int ref)
+{
+	int result;
+
+	result = ps->action(p, pfn);
+	action_result(pfn, ps->msg, result);
+	if (page_count(p) != 1 + ref)
+		printk(KERN_ERR
+		       "MCE %#lx: %s page still referenced by %d users\n",
+		       pfn, ps->msg, page_count(p) - 1);
+
+	/* Could do more checks here if page looks ok */
+	/*
+	 * Could adjust zone counters here to correct for the missing page.
+	 */
+
+	return result == RECOVERED ? 0 : -EBUSY;
+}
+
+#define N_UNMAP_TRIES 5
+
+/*
+ * Do all that is necessary to remove user space mappings. Unmap
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ */
+static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
+				  int trapno)
+{
+	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+	struct address_space *mapping;
+	LIST_HEAD(tokill);
+	int ret;
+	int i;
+	int kill = 1;
+
+	if (PageReserved(p) || PageCompound(p) || PageSlab(p))
+		return;
+
+	if (!PageLRU(p))
+		lru_add_drain_all();
+
+	/*
+	 * This check implies we don't kill processes if their pages
+	 * are in the swap cache early. Those are always late kills.
+	 */
+	if (!page_mapped(p))
+		return;
+
+	if (PageSwapCache(p)) {
+		printk(KERN_ERR
+		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+		ttu |= TTU_IGNORE_HWPOISON;
+	}
+
+	/*
+	 * Propagate the dirty bit from PTEs to struct page first, because we
+	 * need this to decide if we should kill or just drop the page.
+	 */
+	mapping = page_mapping(p);
+	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
+		if (page_mkclean(p)) {
+			SetPageDirty(p);
+		} else {
+			kill = 0;
+			ttu |= TTU_IGNORE_HWPOISON;
+			printk(KERN_INFO
+	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
+				pfn);
+		}
+	}
+
+	/*
+	 * First collect all the processes that have the page
+	 * mapped in dirty form.  This has to be done before try_to_unmap,
+	 * because ttu takes the rmap data structures down.
+	 *
+	 * Error handling: We ignore errors here because
+	 * there's nothing that can be done.
+	 */
+	if (kill)
+		collect_procs(p, &tokill);
+
+	/*
+	 * try_to_unmap can fail temporarily due to races.
+	 * Try a few times (RED-PEN better strategy?)
+	 */
+	for (i = 0; i < N_UNMAP_TRIES; i++) {
+		ret = try_to_unmap(p, ttu);
+		if (ret == SWAP_SUCCESS)
+			break;
+		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
+	}
+
+	if (ret != SWAP_SUCCESS)
+		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
+				pfn, page_mapcount(p));
+
+	/*
+	 * Now that the dirty bit has been propagated to the
+	 * struct page and all unmaps done we can decide if
+	 * killing is needed or not.  Only kill when the page
+	 * was dirty, otherwise the tokill list is merely
+	 * freed.  When there was a problem unmapping earlier
+	 * use a more force-full uncatchable kill to prevent
+	 * any accesses to the poisoned memory.
+	 */
+	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+		      ret != SWAP_SUCCESS, pfn);
+}
+
+int __memory_failure(unsigned long pfn, int trapno, int ref)
+{
+	struct page_state *ps;
+	struct page *p;
+	int res;
+
+	if (!sysctl_memory_failure_recovery)
+		panic("Memory failure from trap %d on page %lx", trapno, pfn);
+
+	if (!pfn_valid(pfn)) {
+		action_result(pfn, "memory outside kernel control", IGNORED);
+		return -EIO;
+	}
+
+	p = pfn_to_page(pfn);
+	if (TestSetPageHWPoison(p)) {
+		action_result(pfn, "already hardware poisoned", IGNORED);
+		return 0;
+	}
+
+	atomic_long_add(1, &mce_bad_pages);
+
+	/*
+	 * We need/can do nothing about count=0 pages.
+	 * 1) it's a free page, and therefore in safe hand:
+	 *    prep_new_page() will be the gate keeper.
+	 * 2) it's part of a non-compound high order page.
+	 *    Implies some kernel user: cannot stop them from
+	 *    R/W the page; let's pray that the page has been
+	 *    used and will be freed some time later.
+	 * In fact it's dangerous to directly bump up page count from 0,
+	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+	 */
+	if (!get_page_unless_zero(compound_head(p))) {
+		action_result(pfn, "free or high order kernel", IGNORED);
+		return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
+	}
+
+	/*
+	 * Lock the page and wait for writeback to finish.
+	 * It's very difficult to mess with pages currently under IO
+	 * and in many cases impossible, so we just avoid it here.
+	 */
+	lock_page_nosync(p);
+	wait_on_page_writeback(p);
+
+	/*
+	 * Now take care of user space mappings.
+	 */
+	hwpoison_user_mappings(p, pfn, trapno);
+
+	/*
+	 * Torn down by someone else?
+	 */
+	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+		action_result(pfn, "already truncated LRU", IGNORED);
+		res = 0;
+		goto out;
+	}
+
+	res = -EBUSY;
+	for (ps = error_states;; ps++) {
+		if ((p->flags & ps->mask) == ps->res) {
+			res = page_action(ps, p, pfn, ref);
+			break;
+		}
+	}
+out:
+	unlock_page(p);
+	return res;
+}
+EXPORT_SYMBOL_GPL(__memory_failure);
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+void memory_failure(unsigned long pfn, int trapno)
+{
+	__memory_failure(pfn, trapno, 0);
+}
diff --git a/mm/rmap.c b/mm/rmap.c
index 7e72ca19d68b..09c3d0b96116 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
  *                 mapping->tree_lock (widely used, in set_page_dirty,
  *                           in arch-dependent flush_dcache_mmap_lock,
  *                           within inode_lock in __sync_single_inode)
+ *
+ * (code doesn't rely on that order so it could be switched around)
+ * ->tasklist_lock
+ *   anon_vma->lock      (memory_failure, collect_procs_anon)
+ *     pte map lock
  */
 
 #include <linux/mm.h>
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  * if the page is not mapped into the page tables of this VMA.  Only
  * valid for normal file or anonymous VMAs.
  */
-static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 {
 	unsigned long address;
 	pte_t *pte;
-- 
cgit v1.2.3-71-gd317


From 2c96ce9f2084c1e04d02883e622f74a537a63aea Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 09:43:56 +0200
Subject: fs: remove bdev->bd_inode_backing_dev_info

It has been unused since it was introduced in:

commit 520808bf20e90fdbdb320264ba7dd5cf9d47dcac
Author: Andrew Morton <akpm@osdl.org>
Date:   Fri May 21 00:46:17 2004 -0700

    [PATCH] block device layer: separate backing_dev_info infrastructure

So lets just kill it.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        | 1 -
 fs/inode.c            | 4 +---
 fs/nilfs2/the_nilfs.c | 4 +---
 include/linux/fs.h    | 1 -
 4 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3581a4e53942..71e7e03ac343 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
 {
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	bdi->bdev.bd_inode_backing_dev_info = NULL;
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..b2ba83d2c4e1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	if (sb->s_bdev) {
 		struct backing_dev_info *bdi;
 
-		bdi = sb->s_bdev->bd_inode_backing_dev_info;
-		if (!bdi)
-			bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+		bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		mapping->backing_dev_info = bdi;
 	}
 	inode->i_private = NULL;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d4168e269c5d..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -591,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
 
-	bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
-	if (!bdi)
-		bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
+	bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
 	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
 
 	/* Finding last segment */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b21cf6b9c80b..db29588874ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -655,7 +655,6 @@ struct block_device {
 	int			bd_invalidated;
 	struct gendisk *	bd_disk;
 	struct list_head	bd_list;
-	struct backing_dev_info *bd_inode_backing_dev_info;
 	/*
 	 * Private data.  You must have bd_claim'ed the block_device
 	 * to use this.  NOTE:  bd_claim allows an owner to claim
-- 
cgit v1.2.3-71-gd317


From 1fe06ad89255c211fe100d7f690d10b161398df8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 15:10:20 +0200
Subject: writeback: get rid of wbc->for_writepages

It's only set, it's never checked. Kill it.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/afs/write.c              | 1 -
 fs/btrfs/ordered-data.c     | 1 -
 fs/jbd2/commit.c            | 1 -
 fs/nfs/write.c              | 1 -
 include/linux/writeback.h   | 1 -
 include/trace/events/ext4.h | 6 ++----
 mm/page-writeback.c         | 2 --
 7 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 		.bdi		= mapping->backing_dev_info,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_to_write	= LONG_MAX,
-		.for_writepages = 1,
 		.range_cyclic	= 1,
 	};
 	int ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = start,
 		.range_end = end,
-		.for_writepages = 1,
 	};
 	return btrfs_writepages(mapping, &wbc);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..0df600e9162d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -220,7 +220,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = 0,
 		.range_end = i_size_read(mapping->host),
-		.for_writepages = 1,
 	};
 
 	ret = generic_writepages(mapping, &wbc);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 120acadc6a84..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1490,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
 		.nr_to_write = LONG_MAX,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
-		.for_writepages = 1,
 	};
 
 	return __nfs_write_mapping(mapping, &wbc, how);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d347632f1861..48a054e2b716 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -50,7 +50,6 @@ struct writeback_control {
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
-	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
 	/*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 7d8b5bc74185..8d433c4e3709 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -227,7 +227,6 @@ TRACE_EVENT(ext4_da_writepages,
 		__field(	char,	nonblocking		)
 		__field(	char,	for_kupdate		)
 		__field(	char,	for_reclaim		)
-		__field(	char,	for_writepages		)
 		__field(	char,	range_cyclic		)
 	),
 
@@ -241,16 +240,15 @@ TRACE_EVENT(ext4_da_writepages,
 		__entry->nonblocking	= wbc->nonblocking;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_reclaim	= wbc->for_reclaim;
-		__entry->for_writepages	= wbc->for_writepages;
 		__entry->range_cyclic	= wbc->range_cyclic;
 	),
 
-	TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d",
+	TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d",
 		  jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write,
 		  __entry->pages_skipped, __entry->range_start,
 		  __entry->range_end, __entry->nonblocking,
 		  __entry->for_kupdate, __entry->for_reclaim,
-		  __entry->for_writepages, __entry->range_cyclic)
+		  __entry->range_cyclic)
 );
 
 TRACE_EVENT(ext4_da_writepages_result,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index dd73d29c15a8..abc648f5de00 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1020,12 +1020,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 
 	if (wbc->nr_to_write <= 0)
 		return 0;
-	wbc->for_writepages = 1;
 	if (mapping->a_ops->writepages)
 		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
-	wbc->for_writepages = 0;
 	return ret;
 }
 
-- 
cgit v1.2.3-71-gd317


From 32a88aa1b6dfb901cec64e1898cac78d0f25028a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:02:33 +0200
Subject: fs: Assign bdi in super_block

We do this automatically in get_sb_bdev() from the set_bdev_super()
callback. Filesystems that have their own private backing_dev_info
must assign that in ->fill_super().

Note that ->s_bdi assignment is required for proper writeback!

Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 fs/fuse/inode.c    | 2 ++
 fs/nfs/super.c     | 2 ++
 fs/super.c         | 6 ++++++
 fs/sync.c          | 9 ++++++++-
 fs/ubifs/super.c   | 1 +
 include/linux/fs.h | 1 +
 7 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15831d5c7367..8b8192790011 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
 
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4567db6f9430..e5dbecd87b0f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -894,6 +894,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto err_put_conn;
 
+	sb->s_bdi = &fc->bdi;
+
 	/* Handle umasking inside the fuse code */
 	if (sb->s_flags & MS_POSIXACL)
 		fc->dont_mask = 1;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 867f70504531..de935692d40d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1918,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
 	if (server->flags & NFS_MOUNT_NOAC)
 		sb->s_flags |= MS_SYNCHRONOUS;
 
+	sb->s_bdi = &server->backing_dev_info;
+
 	nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
diff --git a/fs/super.c b/fs/super.c
index 9cda337ddae2..b03fea8fbfb6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/fs/sync.c b/fs/sync.c
index 192340930bb4..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,6 +27,13 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
+	/*
+	 * This should be safe, as we require bdi backing to actually
+	 * write out data in the first place
+	 */
+	if (!sb->s_bdi)
+		return 0;
+
 	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait) {
 		writeout_quota_sb(sb, -1);
@@ -101,7 +108,7 @@ restart:
 		spin_unlock(&sb_lock);
 
 		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
 			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 51763aa8f4de..c4af069df1ad 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1980,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto out_bdi;
 
+	sb->s_bdi = &c->bdi;
 	sb->s_fs_info = c;
 	sb->s_magic = UBIFS_SUPER_MAGIC;
 	sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index db29588874ac..90162fb3bf04 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1342,6 +1342,7 @@ struct super_block {
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
 
 	struct block_device	*s_bdev;
+	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
 	struct list_head	s_instances;
 	struct quota_info	s_dquot;	/* Diskquota specific options */
-- 
cgit v1.2.3-71-gd317


From cfc4ba5365449cb6b5c9f68d755a142f17da1e47 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 14 Sep 2009 13:12:40 +0200
Subject: writeback: use RCU to protect bdi_list

Now that bdi_writeback_all() no longer handles integrity writeback,
it doesn't have to block anymore. This means that we can switch
bdi_list reader side protection to RCU.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           |  6 ++--
 include/linux/backing-dev.h |  1 +
 mm/backing-dev.c            | 76 ++++++++++++++++++++++++++++++++-------------
 mm/page-writeback.c         |  8 ++---
 4 files changed, 63 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 14f06b459197..f8cd7a97f5b7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -868,16 +868,16 @@ static void bdi_writeback_all(struct writeback_control *wbc)
 
 	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
 
-	spin_lock(&bdi_lock);
+	rcu_read_lock();
 
-	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
 		bdi_alloc_queue_work(bdi, wbc);
 	}
 
-	spin_unlock(&bdi_lock);
+	rcu_read_unlock();
 }
 
 /*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f169bcb90b58..859e797f4576 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -59,6 +59,7 @@ struct bdi_writeback {
 
 struct backing_dev_info {
 	struct list_head bdi_list;
+	struct rcu_head rcu_head;
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca0dac1111..fd93566345b6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -26,6 +26,12 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
+
+/*
+ * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
+ * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * locking.
+ */
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
 LIST_HEAD(bdi_pending_list);
@@ -284,9 +290,9 @@ static int bdi_start_fn(void *ptr)
 	/*
 	 * Add us to the active bdi_list
 	 */
-	spin_lock(&bdi_lock);
-	list_add(&bdi->bdi_list, &bdi_list);
-	spin_unlock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
+	list_add_rcu(&bdi->bdi_list, &bdi_list);
+	spin_unlock_bh(&bdi_lock);
 
 	bdi_task_init(bdi, wb);
 
@@ -389,7 +395,7 @@ static int bdi_forker_task(void *ptr)
 		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
 			wb_do_writeback(me, 0);
 
-		spin_lock(&bdi_lock);
+		spin_lock_bh(&bdi_lock);
 
 		/*
 		 * Check if any existing bdi's have dirty data without
@@ -410,7 +416,7 @@ static int bdi_forker_task(void *ptr)
 		if (list_empty(&bdi_pending_list)) {
 			unsigned long wait;
 
-			spin_unlock(&bdi_lock);
+			spin_unlock_bh(&bdi_lock);
 			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
 			schedule_timeout(wait);
 			try_to_freeze();
@@ -426,7 +432,7 @@ static int bdi_forker_task(void *ptr)
 		bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
 				 bdi_list);
 		list_del_init(&bdi->bdi_list);
-		spin_unlock(&bdi_lock);
+		spin_unlock_bh(&bdi_lock);
 
 		wb = &bdi->wb;
 		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
@@ -445,9 +451,9 @@ static int bdi_forker_task(void *ptr)
 			 * a chance to flush other bdi's to free
 			 * memory.
 			 */
-			spin_lock(&bdi_lock);
+			spin_lock_bh(&bdi_lock);
 			list_add_tail(&bdi->bdi_list, &bdi_pending_list);
-			spin_unlock(&bdi_lock);
+			spin_unlock_bh(&bdi_lock);
 
 			bdi_flush_io(bdi);
 		}
@@ -456,6 +462,24 @@ static int bdi_forker_task(void *ptr)
 	return 0;
 }
 
+static void bdi_add_to_pending(struct rcu_head *head)
+{
+	struct backing_dev_info *bdi;
+
+	bdi = container_of(head, struct backing_dev_info, rcu_head);
+	INIT_LIST_HEAD(&bdi->bdi_list);
+
+	spin_lock(&bdi_lock);
+	list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+	spin_unlock(&bdi_lock);
+
+	/*
+	 * We are now on the pending list, wake up bdi_forker_task()
+	 * to finish the job and add us back to the active bdi_list
+	 */
+	wake_up_process(default_backing_dev_info.wb.task);
+}
+
 /*
  * Add the default flusher task that gets created for any bdi
  * that has dirty data pending writeout
@@ -478,16 +502,29 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
 	 * waiting for previous additions to finish.
 	 */
 	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
-		list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+		list_del_rcu(&bdi->bdi_list);
 
 		/*
-		 * We are now on the pending list, wake up bdi_forker_task()
-		 * to finish the job and add us back to the active bdi_list
+		 * We must wait for the current RCU period to end before
+		 * moving to the pending list. So schedule that operation
+		 * from an RCU callback.
 		 */
-		wake_up_process(default_backing_dev_info.wb.task);
+		call_rcu(&bdi->rcu_head, bdi_add_to_pending);
 	}
 }
 
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+	spin_lock_bh(&bdi_lock);
+	list_del_rcu(&bdi->bdi_list);
+	spin_unlock_bh(&bdi_lock);
+
+	synchronize_rcu();
+}
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...)
 {
@@ -506,9 +543,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		goto exit;
 	}
 
-	spin_lock(&bdi_lock);
-	list_add_tail(&bdi->bdi_list, &bdi_list);
-	spin_unlock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
+	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+	spin_unlock_bh(&bdi_lock);
 
 	bdi->dev = dev;
 
@@ -526,9 +563,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 			wb->task = NULL;
 			ret = -ENOMEM;
 
-			spin_lock(&bdi_lock);
-			list_del(&bdi->bdi_list);
-			spin_unlock(&bdi_lock);
+			bdi_remove_from_list(bdi);
 			goto exit;
 		}
 	}
@@ -565,9 +600,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	/*
 	 * Make sure nobody finds us on the bdi_list anymore
 	 */
-	spin_lock(&bdi_lock);
-	list_del(&bdi->bdi_list);
-	spin_unlock(&bdi_lock);
+	bdi_remove_from_list(bdi);
 
 	/*
 	 * Finally, kill the kernel threads. We don't need to be RCU
@@ -599,6 +632,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
 	spin_lock_init(&bdi->wb_lock);
+	INIT_RCU_HEAD(&bdi->rcu_head);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
 	INIT_LIST_HEAD(&bdi->work_list);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index abc648f5de00..12c3d843ce93 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -315,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
 	int ret = 0;
 
-	spin_lock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
 	if (min_ratio > bdi->max_ratio) {
 		ret = -EINVAL;
 	} else {
@@ -327,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 			ret = -EINVAL;
 		}
 	}
-	spin_unlock(&bdi_lock);
+	spin_unlock_bh(&bdi_lock);
 
 	return ret;
 }
@@ -339,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 	if (max_ratio > 100)
 		return -EINVAL;
 
-	spin_lock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
 	if (bdi->min_ratio > max_ratio) {
 		ret = -EINVAL;
 	} else {
 		bdi->max_ratio = max_ratio;
 		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
 	}
-	spin_unlock(&bdi_lock);
+	spin_unlock_bh(&bdi_lock);
 
 	return ret;
 }
-- 
cgit v1.2.3-71-gd317


From b6e51316daede0633e9274e1e30391cfa4747877 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:13:54 +0200
Subject: writeback: separate starting of sync vs opportunistic writeback

bdi_start_writeback() is currently split into two paths, one for
WB_SYNC_NONE and one for WB_SYNC_ALL. Add bdi_sync_writeback()
for WB_SYNC_ALL writeback and let bdi_start_writeback() handle
only WB_SYNC_NONE.

Push down the writeback_control allocation and only accept the
parameters that make sense for each function. This cleans up
the API considerably.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           | 132 ++++++++++++++++++++++----------------------
 fs/ubifs/budget.c           |  20 +------
 include/linux/backing-dev.h |   2 +-
 include/linux/writeback.h   |   4 +-
 mm/page-writeback.c         |  12 +---
 5 files changed, 75 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59b3ee63b624..5887328b5a06 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -74,14 +74,10 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
 }
 
 static inline void bdi_work_init(struct bdi_work *work,
-				 struct writeback_control *wbc)
+				 struct wb_writeback_args *args)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
-	work->args.sb = wbc->sb;
-	work->args.nr_pages = wbc->nr_to_write;
-	work->args.sync_mode = wbc->sync_mode;
-	work->args.range_cyclic = wbc->range_cyclic;
-	work->args.for_kupdate = 0;
+	work->args = *args;
 	work->state = WS_USED;
 }
 
@@ -194,7 +190,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct writeback_control *wbc)
+				 struct wb_writeback_args *args)
 {
 	struct bdi_work *work;
 
@@ -204,7 +200,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	 */
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	if (work) {
-		bdi_work_init(work, wbc);
+		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
@@ -214,24 +210,54 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	}
 }
 
-void bdi_start_writeback(struct writeback_control *wbc)
+/**
+ * bdi_sync_writeback - start and wait for writeback
+ * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
+ *
+ * Description:
+ *   This does WB_SYNC_ALL data integrity writeback and waits for the
+ *   IO to complete. Callers must hold the sb s_umount semaphore for
+ *   reading, to avoid having the super disappear before we are done.
+ */
+static void bdi_sync_writeback(struct backing_dev_info *bdi,
+			       struct super_block *sb)
 {
-	/*
-	 * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
-	 * bdi_queue_work() will wake up the thread and flush old data. This
-	 * should ensure some amount of progress in freeing memory.
-	 */
-	if (wbc->sync_mode != WB_SYNC_ALL)
-		bdi_alloc_queue_work(wbc->bdi, wbc);
-	else {
-		struct bdi_work work;
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+	};
+	struct bdi_work work;
 
-		bdi_work_init(&work, wbc);
-		work.state |= WS_ONSTACK;
+	bdi_work_init(&work, &args);
+	work.state |= WS_ONSTACK;
 
-		bdi_queue_work(wbc->bdi, &work);
-		bdi_wait_on_work_clear(&work);
-	}
+	bdi_queue_work(bdi, &work);
+	bdi_wait_on_work_clear(&work);
+}
+
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ *
+ * Description:
+ *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   started when this function returns, we make no guarentees on
+ *   completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+{
+	struct wb_writeback_args args = {
+		.sync_mode	= WB_SYNC_NONE,
+		.nr_pages	= nr_pages,
+		.range_cyclic	= 1,
+	};
+
+	bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -863,23 +889,25 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. Can only be used for
- * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
- * and pass in the superblock.
+ * Schedule writeback for all backing devices. This does WB_SYNC_NONE
+ * writeback, for integrity writeback see bdi_sync_writeback().
  */
-static void bdi_writeback_all(struct writeback_control *wbc)
+static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 {
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.nr_pages	= nr_pages,
+		.sync_mode	= WB_SYNC_NONE,
+	};
 	struct backing_dev_info *bdi;
 
-	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
-
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_alloc_queue_work(bdi, wbc);
+		bdi_alloc_queue_work(bdi, &args);
 	}
 
 	rcu_read_unlock();
@@ -891,17 +919,10 @@ static void bdi_writeback_all(struct writeback_control *wbc)
  */
 void wakeup_flusher_threads(long nr_pages)
 {
-	struct writeback_control wbc = {
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.range_cyclic	= 1,
-	};
-
 	if (nr_pages == 0)
 		nr_pages = global_page_state(NR_FILE_DIRTY) +
 				global_page_state(NR_UNSTABLE_NFS);
-	wbc.nr_to_write = nr_pages;
-	bdi_writeback_all(&wbc);
+	bdi_writeback_all(NULL, nr_pages);
 }
 
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1048,7 +1069,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-static void wait_sb_inodes(struct writeback_control *wbc)
+static void wait_sb_inodes(struct super_block *sb)
 {
 	struct inode *inode, *old_inode = NULL;
 
@@ -1056,7 +1077,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
 	 * We need to be protected against the filesystem going from
 	 * r/o to r/w or vice versa.
 	 */
-	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	spin_lock(&inode_lock);
 
@@ -1067,7 +1088,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
 	 * In which case, the inode may not be on the dirty list, but
 	 * we still have to wait for that writeout.
 	 */
-	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
@@ -1107,14 +1128,8 @@ static void wait_sb_inodes(struct writeback_control *wbc)
  * for IO completion of submitted IO. The number of pages submitted is
  * returned.
  */
-long writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_NONE,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-	};
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 	long nr_to_write;
@@ -1122,9 +1137,7 @@ long writeback_inodes_sb(struct super_block *sb)
 	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	wbc.nr_to_write = nr_to_write;
-	bdi_writeback_all(&wbc);
-	return nr_to_write - wbc.nr_to_write;
+	bdi_writeback_all(sb, nr_to_write);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1135,21 +1148,10 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block. The number of pages synced is returned.
  */
-long sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sb		= sb,
-		.bdi		= sb->s_bdi,
-		.sync_mode	= WB_SYNC_ALL,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-	};
-	long nr_to_write = LONG_MAX; /* doesn't actually matter */
-
-	wbc.nr_to_write = nr_to_write;
-	bdi_start_writeback(&wbc);
-	wait_sb_inodes(&wbc);
-	return nr_to_write - wbc.nr_to_write;
+	bdi_sync_writeback(sb->s_bdi, sb);
+	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
 
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1c8991b0db13..ee1ce68fd98b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,29 +54,15 @@
  * @nr_to_write: how many dirty pages to write-back
  *
  * This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
  *
  * Note, this function synchronizes even VFS inodes which are locked
  * (@i_mutex) by the caller of the budgeting function, because write-back does
  * not touch @i_mutex.
  */
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
-	int nr_written;
-
-	nr_written = writeback_inodes_sb(c->vfs_sb);
-	if (!nr_written) {
-		/*
-		 * Re-try again but wait on pages/inodes which are being
-		 * written-back concurrently (e.g., by pdflush).
-		 */
-		nr_written = sync_inodes_sb(c->vfs_sb);
-	}
-
-	dbg_budg("%d pages were written back", nr_written);
-	return nr_written;
+	writeback_inodes_sb(c->vfs_sb);
 }
 
 /**
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 859e797f4576..0ee33c2e6129 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -101,7 +101,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
-void bdi_start_writeback(struct writeback_control *wbc);
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 48a054e2b716..75cf58666ff9 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -68,8 +68,8 @@ struct writeback_control {
  */	
 struct bdi_writeback;
 int inode_wait(void *);
-long writeback_inodes_sb(struct super_block *);
-long sync_inodes_sb(struct super_block *);
+void writeback_inodes_sb(struct super_block *);
+void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
 void wakeup_flusher_threads(long nr_pages);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 12c3d843ce93..1eea4fa0d410 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -582,16 +582,8 @@ static void balance_dirty_pages(struct address_space *mapping)
 	if ((laptop_mode && pages_written) ||
 	    (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
 					  + global_page_state(NR_UNSTABLE_NFS))
-					  > background_thresh))) {
-		struct writeback_control wbc = {
-			.bdi		= bdi,
-			.sync_mode	= WB_SYNC_NONE,
-			.nr_to_write	= nr_writeback,
-		};
-
-
-		bdi_start_writeback(&wbc);
-	}
+					  > background_thresh)))
+		bdi_start_writeback(bdi, nr_writeback);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
-- 
cgit v1.2.3-71-gd317


From 182a85f8a119c789610a9d464f4129ded9f3c107 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Sep 2009 13:24:49 +0200
Subject: sched: Disable wakeup balancing

Sysbench thinks SD_BALANCE_WAKE is too agressive and kbuild doesn't
really mind too much, SD_BALANCE_NEWIDLE picks up most of the
slack.

On a dual socket, quad core, dual thread nehalem system:

sysbench (--num_threads=16):

 SD_BALANCE_WAKE-: 13982 tx/s
 SD_BALANCE_WAKE+: 15688 tx/s

kbuild (-j16):

 SD_BALANCE_WAKE-: 47.648295846  seconds time elapsed   ( +-   0.312% )
 SD_BALANCE_WAKE+: 47.608607360  seconds time elapsed   ( +-   0.026% )

(same within noise)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/include/asm/topology.h           | 2 --
 arch/mips/include/asm/mach-ip27/topology.h | 1 -
 arch/powerpc/include/asm/topology.h        | 1 -
 arch/sh/include/asm/topology.h             | 1 -
 arch/sparc/include/asm/topology_64.h       | 1 -
 arch/x86/include/asm/topology.h            | 2 +-
 include/linux/topology.h                   | 6 +++---
 7 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 569b9dafc78c..d0141fbf51d0 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -68,7 +68,6 @@ void build_cpu_to_node_map(void);
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
-				| SD_BALANCE_WAKE	\
 				| SD_WAKE_AFFINE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -94,7 +93,6 @@ void build_cpu_to_node_map(void);
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
-				| SD_BALANCE_WAKE	\
 				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 64,			\
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index d8332398f5be..230591707005 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -48,7 +48,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 	.cache_nice_tries	= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_WAKE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 1a2c9eb42a03..394edcbcce71 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -63,7 +63,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_NEWIDLE	\
-				| SD_BALANCE_WAKE	\
 				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index a8cc564b703d..f8c40cc65054 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -21,7 +21,6 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_WAKE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 10b979d1de20..26cd25c08399 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_WAKE	\
 				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 589f12383d78..6f0695d744bf 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -141,7 +141,7 @@ extern unsigned long node_remap_size[];
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 1*SD_BALANCE_WAKE			\
+				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index a6614b0242a9..809b26c07090 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -95,7 +95,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 1*SD_BALANCE_WAKE			\
+				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 1*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
@@ -127,7 +127,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 1*SD_BALANCE_WAKE			\
+				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 1*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
@@ -160,7 +160,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
-				| 1*SD_BALANCE_WAKE			\
+				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 1*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
-- 
cgit v1.2.3-71-gd317


From 9c28cbccec66a5ca292c6659bf5a0fe0c8459fa7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 3 Aug 2009 19:21:00 +0200
Subject: jbd: Journal block numbers can ever be only 32-bit use unsigned int
 for them

It does not make sense to store block number for journal as unsigned long
since they can be only 32-bit (because of on-disk format limitation). So
change in-memory structures and variables to use unsigned int instead.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/checkpoint.c |  6 +++---
 fs/jbd/commit.c     |  2 +-
 fs/jbd/journal.c    | 30 +++++++++++++++---------------
 fs/jbd/recovery.c   | 18 +++++++++---------
 fs/jbd/revoke.c     | 16 ++++++++--------
 include/linux/jbd.h | 26 +++++++++++++-------------
 6 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
 {
 	transaction_t * transaction;
 	tid_t		first_tid;
-	unsigned long	blocknr, freed;
+	unsigned int	blocknr, freed;
 
 	if (is_journal_aborted(journal))
 		return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
 		freed = freed + journal->j_last - journal->j_first;
 
 	jbd_debug(1,
-		  "Cleaning journal tail from %d to %d (offset %lu), "
-		  "freeing %lu\n",
+		  "Cleaning journal tail from %d to %d (offset %u), "
+		  "freeing %u\n",
 		  journal->j_tail_sequence, first_tid, blocknr, freed);
 
 	journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
 	int bufs;
 	int flags;
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	ktime_t start_time;
 	u64 commit_time;
 	char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
 int journal_write_metadata_buffer(transaction_t *transaction,
 				  struct journal_head  *jh_in,
 				  struct journal_head **jh_out,
-				  unsigned long blocknr)
+				  unsigned int blocknr)
 {
 	int need_copy_out = 0;
 	int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
  * Log buffer allocation routines:
  */
 
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	spin_lock(&journal->j_state_lock);
 	J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
  * this is a no-op.  If needed, we can use j_blk_offset - everything is
  * ready.
  */
-int journal_bmap(journal_t *journal, unsigned long blocknr,
-		 unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+		 unsigned int *retp)
 {
 	int err = 0;
-	unsigned long ret;
+	unsigned int ret;
 
 	if (journal->j_inode) {
 		ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 			char b[BDEVNAME_SIZE];
 
 			printk(KERN_ALERT "%s: journal block not found "
-					"at offset %lu on %s\n",
+					"at offset %u on %s\n",
 				__func__,
 				blocknr,
 				bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
 {
 	struct buffer_head *bh;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	int err;
 
 	err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	journal_t *journal = journal_init_common();
 	int err;
 	int n;
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	if (!journal)
 		return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
 static int journal_reset(journal_t *journal)
 {
 	journal_superblock_t *sb = journal->j_superblock;
-	unsigned long first, last;
+	unsigned int first, last;
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
 	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
-		printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+		printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
 		       first, last);
 		journal_fail_superblock(journal);
 		return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
  **/
 int journal_create(journal_t *journal)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 	journal_superblock_t *sb;
 	int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
 	if (sb->s_start == 0 && journal->j_tail_sequence ==
 				journal->j_transaction_sequence) {
 		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
-			"(start %ld, seq %d, errno %d)\n",
+			"(start %u, seq %d, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
 		goto out;
 	}
 
 	spin_lock(&journal->j_state_lock);
-	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+	jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
 {
 	int err = 0;
 	transaction_t *transaction = NULL;
-	unsigned long old_tail;
+	unsigned int old_tail;
 
 	spin_lock(&journal->j_state_lock);
 
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
 {
 	int err;
 	unsigned int max, nbufs, next;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
 		 unsigned int offset)
 {
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
 	unsigned int		first_commit_ID, next_commit_ID;
-	unsigned long		next_log_block;
+	unsigned int		next_log_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
 			if (tid_geq(next_commit_ID, info->end_transaction))
 				break;
 
-		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+		jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
 			  next_commit_ID, next_log_block, journal->j_last);
 
 		/* Skip over each chunk of the transaction looking
 		 * either the next descriptor block or the final commit
 		 * record. */
 
-		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+		jbd_debug(3, "JBD: checking block %u\n", next_log_block);
 		err = jread(&bh, journal, next_log_block);
 		if (err)
 			goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
 			tagp = &bh->b_data[sizeof(journal_header_t)];
 			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
 			       <= journal->j_blocksize) {
-				unsigned long io_block;
+				unsigned int io_block;
 
 				tag = (journal_block_tag_t *) tagp;
 				flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
 					success = err;
 					printk (KERN_ERR
 						"JBD: IO error %d recovering "
-						"block %ld in log\n",
+						"block %u in log\n",
 						err, io_block);
 				} else {
-					unsigned long blocknr;
+					unsigned int blocknr;
 
 					J_ASSERT(obh != NULL);
 					blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 	max = be32_to_cpu(header->r_count);
 
 	while (offset < max) {
-		unsigned long blocknr;
+		unsigned int blocknr;
 		int err;
 
 		blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
 {
 	struct list_head  hash;
 	tid_t		  sequence;	/* Used for recovery only */
-	unsigned long	  blocknr;
+	unsigned int	  blocknr;
 };
 
 
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 /* Utility functions to maintain the revoke table */
 
 /* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
 {
 	struct jbd_revoke_table_s *table = journal->j_revoke;
 	int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
 		(block << (hash_shift - 12))) & (table->hash_size - 1);
 }
 
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
 			      tid_t seq)
 {
 	struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
 /* Find a revoke record in the journal's hash table. */
 
 static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
-						      unsigned long blocknr)
+						      unsigned int blocknr)
 {
 	struct list_head *hash_list;
 	struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
  * by one.
  */
 
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
 		   struct buffer_head *bh_in)
 {
 	struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 		}
 	}
 
-	jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+	jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
 	err = insert_revoke_hash(journal, blocknr,
 				handle->h_transaction->t_tid);
 	BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
  */
 
 int journal_set_revoke(journal_t *journal,
-		       unsigned long blocknr,
+		       unsigned int blocknr,
 		       tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
  */
 
 int journal_test_revoke(journal_t *journal,
-			unsigned long blocknr,
+			unsigned int blocknr,
 			tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index c2049a04fa0b..a1187a0c99b4 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -446,7 +446,7 @@ struct transaction_s
 	/*
 	 * Where in the log does this transaction's commit start? [no locking]
 	 */
-	unsigned long		t_log_start;
+	unsigned int		t_log_start;
 
 	/* Number of buffers on the t_buffers list [j_list_lock] */
 	int			t_nr_buffers;
@@ -701,26 +701,26 @@ struct journal_s
 	 * Journal head: identifies the first unused block in the journal.
 	 * [j_state_lock]
 	 */
-	unsigned long		j_head;
+	unsigned int		j_head;
 
 	/*
 	 * Journal tail: identifies the oldest still-used block in the journal.
 	 * [j_state_lock]
 	 */
-	unsigned long		j_tail;
+	unsigned int		j_tail;
 
 	/*
 	 * Journal free: how many free blocks are there in the journal?
 	 * [j_state_lock]
 	 */
-	unsigned long		j_free;
+	unsigned int		j_free;
 
 	/*
 	 * Journal start and end: the block numbers of the first usable block
 	 * and one beyond the last usable block in the journal. [j_state_lock]
 	 */
-	unsigned long		j_first;
-	unsigned long		j_last;
+	unsigned int		j_first;
+	unsigned int		j_last;
 
 	/*
 	 * Device, blocksize and starting block offset for the location where we
@@ -728,7 +728,7 @@ struct journal_s
 	 */
 	struct block_device	*j_dev;
 	int			j_blocksize;
-	unsigned long		j_blk_offset;
+	unsigned int		j_blk_offset;
 
 	/*
 	 * Device which holds the client fs.  For internal journal this will be
@@ -859,7 +859,7 @@ extern void __journal_clean_data_list(transaction_t *transaction);
 
 /* Log buffer allocation */
 extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
-int journal_next_log_block(journal_t *, unsigned long *);
+int journal_next_log_block(journal_t *, unsigned int *);
 
 /* Commit management */
 extern void journal_commit_transaction(journal_t *);
@@ -874,7 +874,7 @@ extern int
 journal_write_metadata_buffer(transaction_t	  *transaction,
 			      struct journal_head  *jh_in,
 			      struct journal_head **jh_out,
-			      unsigned long	   blocknr);
+			      unsigned int blocknr);
 
 /* Transaction locking */
 extern void		__wait_on_journal (journal_t *);
@@ -942,7 +942,7 @@ extern void	   journal_abort      (journal_t *, int);
 extern int	   journal_errno      (journal_t *);
 extern void	   journal_ack_err    (journal_t *);
 extern int	   journal_clear_err  (journal_t *);
-extern int	   journal_bmap(journal_t *, unsigned long, unsigned long *);
+extern int	   journal_bmap(journal_t *, unsigned int, unsigned int *);
 extern int	   journal_force_commit(journal_t *);
 
 /*
@@ -976,14 +976,14 @@ extern int	   journal_init_revoke_caches(void);
 
 extern void	   journal_destroy_revoke(journal_t *);
 extern int	   journal_revoke (handle_t *,
-				unsigned long, struct buffer_head *);
+				unsigned int, struct buffer_head *);
 extern int	   journal_cancel_revoke(handle_t *, struct journal_head *);
 extern void	   journal_write_revoke_records(journal_t *,
 						transaction_t *, int);
 
 /* Recovery revoke support */
-extern int	journal_set_revoke(journal_t *, unsigned long, tid_t);
-extern int	journal_test_revoke(journal_t *, unsigned long, tid_t);
+extern int	journal_set_revoke(journal_t *, unsigned int, tid_t);
+extern int	journal_test_revoke(journal_t *, unsigned int, tid_t);
 extern void	journal_clear_revoke(journal_t *);
 extern void	journal_switch_revoke_table(journal_t *journal);
 
-- 
cgit v1.2.3-71-gd317


From 9f7b07d6cc3ed14783c9427a5b2a69794eb2de64 Mon Sep 17 00:00:00 2001
From: Daniel Ribeiro <drwyrm@gmail.com>
Date: Tue, 23 Jun 2009 12:32:11 -0300
Subject: mfd: Introduce irq_to_pcap()

Export an irq_to_pcap function to get pcap irq number, for the keypad driver.

Signed-off-by: Daniel Ribeiro <drwyrm@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/ezx-pcap.c       | 9 +++++----
 include/linux/mfd/ezx-pcap.h | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c
index c1de4afa89a6..de7e63706abb 100644
--- a/drivers/mfd/ezx-pcap.c
+++ b/drivers/mfd/ezx-pcap.c
@@ -107,10 +107,11 @@ int ezx_pcap_read(struct pcap_chip *pcap, u8 reg_num, u32 *value)
 EXPORT_SYMBOL_GPL(ezx_pcap_read);
 
 /* IRQ */
-static inline unsigned int irq2pcap(struct pcap_chip *pcap, int irq)
+int irq_to_pcap(struct pcap_chip *pcap, int irq)
 {
-	return 1 << (irq - pcap->irq_base);
+	return irq - pcap->irq_base;
 }
+EXPORT_SYMBOL_GPL(irq_to_pcap);
 
 int pcap_to_irq(struct pcap_chip *pcap, int irq)
 {
@@ -122,7 +123,7 @@ static void pcap_mask_irq(unsigned int irq)
 {
 	struct pcap_chip *pcap = get_irq_chip_data(irq);
 
-	pcap->msr |= irq2pcap(pcap, irq);
+	pcap->msr |= 1 << irq_to_pcap(pcap, irq);
 	queue_work(pcap->workqueue, &pcap->msr_work);
 }
 
@@ -130,7 +131,7 @@ static void pcap_unmask_irq(unsigned int irq)
 {
 	struct pcap_chip *pcap = get_irq_chip_data(irq);
 
-	pcap->msr &= ~irq2pcap(pcap, irq);
+	pcap->msr &= ~(1 << irq_to_pcap(pcap, irq));
 	queue_work(pcap->workqueue, &pcap->msr_work);
 }
 
diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h
index c12c3c0932bf..6296b4935a1e 100644
--- a/include/linux/mfd/ezx-pcap.h
+++ b/include/linux/mfd/ezx-pcap.h
@@ -26,6 +26,7 @@ struct pcap_chip;
 int ezx_pcap_write(struct pcap_chip *, u8, u32);
 int ezx_pcap_read(struct pcap_chip *, u8, u32 *);
 int pcap_to_irq(struct pcap_chip *, int);
+int irq_to_pcap(struct pcap_chip *, int);
 int pcap_adc_async(struct pcap_chip *, u8, u32, u8[], void *, void *);
 int pcap_adc_sync(struct pcap_chip *, u8, u32, u8[], u16[]);
 
-- 
cgit v1.2.3-71-gd317


From ecd78cbdb989fd593bf4fd69cdb572200e70a553 Mon Sep 17 00:00:00 2001
From: Daniel Ribeiro <drwyrm@gmail.com>
Date: Tue, 23 Jun 2009 12:33:10 -0300
Subject: mfd: add set_ts_bits for pcap

Some TS controller bits are on the same register as the ADC control, save
TS specific bits and export a set_ts_bits function so the TS driver can set
it with the adc_mutex lock held.

Signed-off-by: Daniel Ribeiro <drwyrm@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/ezx-pcap.c       | 22 ++++++++++++++++++----
 include/linux/mfd/ezx-pcap.h |  1 +
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c
index de7e63706abb..c5122024f05a 100644
--- a/drivers/mfd/ezx-pcap.c
+++ b/drivers/mfd/ezx-pcap.c
@@ -195,6 +195,19 @@ static void pcap_irq_handler(unsigned int irq, struct irq_desc *desc)
 }
 
 /* ADC */
+void pcap_set_ts_bits(struct pcap_chip *pcap, u32 bits)
+{
+	u32 tmp;
+
+	mutex_lock(&pcap->adc_mutex);
+	ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp);
+	tmp &= ~(PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR);
+	tmp |= bits & (PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR);
+	ezx_pcap_write(pcap, PCAP_REG_ADC, tmp);
+	mutex_unlock(&pcap->adc_mutex);
+}
+EXPORT_SYMBOL_GPL(pcap_set_ts_bits);
+
 static void pcap_disable_adc(struct pcap_chip *pcap)
 {
 	u32 tmp;
@@ -217,15 +230,16 @@ static void pcap_adc_trigger(struct pcap_chip *pcap)
 		mutex_unlock(&pcap->adc_mutex);
 		return;
 	}
-	mutex_unlock(&pcap->adc_mutex);
-
-	/* start conversion on requested bank */
-	tmp = pcap->adc_queue[head]->flags | PCAP_ADC_ADEN;
+	/* start conversion on requested bank, save TS_M bits */
+	ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp);
+	tmp &= (PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR);
+	tmp |= pcap->adc_queue[head]->flags | PCAP_ADC_ADEN;
 
 	if (pcap->adc_queue[head]->bank == PCAP_ADC_BANK_1)
 		tmp |= PCAP_ADC_AD_SEL1;
 
 	ezx_pcap_write(pcap, PCAP_REG_ADC, tmp);
+	mutex_unlock(&pcap->adc_mutex);
 	ezx_pcap_write(pcap, PCAP_REG_ADR, PCAP_ADR_ASC);
 }
 
diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h
index 6296b4935a1e..b15caacf0720 100644
--- a/include/linux/mfd/ezx-pcap.h
+++ b/include/linux/mfd/ezx-pcap.h
@@ -29,6 +29,7 @@ int pcap_to_irq(struct pcap_chip *, int);
 int irq_to_pcap(struct pcap_chip *, int);
 int pcap_adc_async(struct pcap_chip *, u8, u32, u8[], void *, void *);
 int pcap_adc_sync(struct pcap_chip *, u8, u32, u8[], u16[]);
+void pcap_set_ts_bits(struct pcap_chip *, u32);
 
 #define PCAP_SECOND_PORT	1
 #define PCAP_CS_AH		2
-- 
cgit v1.2.3-71-gd317


From e9a22635b0d794d0cb242ffb0249f7b2a410bca2 Mon Sep 17 00:00:00 2001
From: Daniel Ribeiro <drwyrm@gmail.com>
Date: Sat, 27 Jun 2009 00:17:20 -0300
Subject: mfd: add ezx_pcap_setbits

Provides an atomic set_bits functions, as needed by the pcap-regulator
driver.

Signed-off-by: Daniel Ribeiro <drwyrm@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/ezx-pcap.c       | 23 +++++++++++++++++++++++
 include/linux/mfd/ezx-pcap.h |  1 +
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c
index 732664f238fe..86d394894d81 100644
--- a/drivers/mfd/ezx-pcap.c
+++ b/drivers/mfd/ezx-pcap.c
@@ -107,6 +107,29 @@ int ezx_pcap_read(struct pcap_chip *pcap, u8 reg_num, u32 *value)
 }
 EXPORT_SYMBOL_GPL(ezx_pcap_read);
 
+int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val)
+{
+	int ret;
+	u32 tmp = PCAP_REGISTER_READ_OP_BIT |
+		(reg_num << PCAP_REGISTER_ADDRESS_SHIFT);
+
+	mutex_lock(&pcap->io_mutex);
+	ret = ezx_pcap_putget(pcap, &tmp);
+	if (ret)
+		goto out_unlock;
+
+	tmp &= (PCAP_REGISTER_VALUE_MASK & ~mask);
+	tmp |= (val & mask) | PCAP_REGISTER_WRITE_OP_BIT |
+		(reg_num << PCAP_REGISTER_ADDRESS_SHIFT);
+
+	ret = ezx_pcap_putget(pcap, &tmp);
+out_unlock:
+	mutex_unlock(&pcap->io_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ezx_pcap_set_bits);
+
 /* IRQ */
 int irq_to_pcap(struct pcap_chip *pcap, int irq)
 {
diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h
index b15caacf0720..dec82b0b05f9 100644
--- a/include/linux/mfd/ezx-pcap.h
+++ b/include/linux/mfd/ezx-pcap.h
@@ -25,6 +25,7 @@ struct pcap_chip;
 
 int ezx_pcap_write(struct pcap_chip *, u8, u32);
 int ezx_pcap_read(struct pcap_chip *, u8, u32 *);
+int ezx_pcap_set_bits(struct pcap_chip *, u8, u32, u32);
 int pcap_to_irq(struct pcap_chip *, int);
 int irq_to_pcap(struct pcap_chip *, int);
 int pcap_adc_async(struct pcap_chip *, u8, u32, u8[], void *, void *);
-- 
cgit v1.2.3-71-gd317


From fb6c023a2b845df1ec383b74644ac35a4bbb76b6 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 20 Jul 2009 12:43:45 +0100
Subject: hwmon: Add WM835x PMIC hardware monitoring driver

This driver provides reporting of the status supply voltage rails
of the WM835x series of PMICs via the hwmon API.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 Documentation/hwmon/wm8350      |  26 +++++++
 drivers/hwmon/Kconfig           |  10 +++
 drivers/hwmon/Makefile          |   1 +
 drivers/hwmon/wm8350-hwmon.c    | 151 ++++++++++++++++++++++++++++++++++++++++
 drivers/mfd/wm8350-core.c       |   3 +
 include/linux/mfd/wm8350/core.h |   6 ++
 6 files changed, 197 insertions(+)
 create mode 100644 Documentation/hwmon/wm8350
 create mode 100644 drivers/hwmon/wm8350-hwmon.c

(limited to 'include/linux')

diff --git a/Documentation/hwmon/wm8350 b/Documentation/hwmon/wm8350
new file mode 100644
index 000000000000..98f923bd2e92
--- /dev/null
+++ b/Documentation/hwmon/wm8350
@@ -0,0 +1,26 @@
+Kernel driver wm8350-hwmon
+==========================
+
+Supported chips:
+  * Wolfson Microelectronics WM835x PMICs
+    Prefix: 'wm8350'
+    Datasheet:
+	http://www.wolfsonmicro.com/products/WM8350
+	http://www.wolfsonmicro.com/products/WM8351
+	http://www.wolfsonmicro.com/products/WM8352
+
+Authors: Mark Brown <broonie@opensource.wolfsonmicro.com>
+
+Description
+-----------
+
+The WM835x series of PMICs include an AUXADC which can be used to
+monitor a range of system operating parameters, including the voltages
+of the major supplies within the system.  Currently the driver provides
+simple access to these major supplies.
+
+Voltage Monitoring
+------------------
+
+Voltages are sampled by a 12 bit ADC.  For the internal supplies the ADC
+is referenced to the system VRTC.
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 2e25b7a827d3..120085ce651a 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -937,6 +937,16 @@ config SENSORS_W83627EHF
 	  This driver can also be built as a module.  If so, the module
 	  will be called w83627ehf.
 
+config SENSORS_WM8350
+	tristate "Wolfson Microelectronics WM835x"
+	depends on MFD_WM8350
+	help
+	  If you say yes here you get support for the hardware
+	  monitoring features of the WM835x series of PMICs.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called wm8350-hwmon.
+
 config SENSORS_ULTRA45
 	tristate "Sun Ultra45 PIC16F747"
 	depends on SPARC64
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 7f239a247c33..ed5f1781b8c4 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_SENSORS_VT8231)	+= vt8231.o
 obj-$(CONFIG_SENSORS_W83627EHF)	+= w83627ehf.o
 obj-$(CONFIG_SENSORS_W83L785TS)	+= w83l785ts.o
 obj-$(CONFIG_SENSORS_W83L786NG)	+= w83l786ng.o
+obj-$(CONFIG_SENSORS_WM8350)	+= wm8350-hwmon.o
 
 ifeq ($(CONFIG_HWMON_DEBUG_CHIP),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/hwmon/wm8350-hwmon.c b/drivers/hwmon/wm8350-hwmon.c
new file mode 100644
index 000000000000..13290595ca86
--- /dev/null
+++ b/drivers/hwmon/wm8350-hwmon.c
@@ -0,0 +1,151 @@
+/*
+ * drivers/hwmon/wm8350-hwmon.c - Wolfson Microelectronics WM8350 PMIC
+ *                                  hardware monitoring features.
+ *
+ * Copyright (C) 2009 Wolfson Microelectronics plc
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License v2 as published by the
+ * Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+
+#include <linux/mfd/wm8350/core.h>
+#include <linux/mfd/wm8350/comparator.h>
+
+static ssize_t show_name(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "wm8350\n");
+}
+
+static const char *input_names[] = {
+	[WM8350_AUXADC_USB]  = "USB",
+	[WM8350_AUXADC_LINE] = "Line",
+	[WM8350_AUXADC_BATT] = "Battery",
+};
+
+
+static ssize_t show_voltage(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct wm8350 *wm8350 = dev_get_drvdata(dev);
+	int channel = to_sensor_dev_attr(attr)->index;
+	int val;
+
+	val = wm8350_read_auxadc(wm8350, channel, 0, 0) * WM8350_AUX_COEFF;
+	val = DIV_ROUND_CLOSEST(val, 1000);
+
+	return sprintf(buf, "%d\n", val);
+}
+
+static ssize_t show_label(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	int channel = to_sensor_dev_attr(attr)->index;
+
+	return sprintf(buf, "%s\n", input_names[channel]);
+}
+
+#define WM8350_NAMED_VOLTAGE(id, name) \
+	static SENSOR_DEVICE_ATTR(in##id##_input, S_IRUGO, show_voltage,\
+				  NULL, name);		\
+	static SENSOR_DEVICE_ATTR(in##id##_label, S_IRUGO, show_label,	\
+				  NULL, name)
+
+static DEVICE_ATTR(name, S_IRUGO, show_name, NULL);
+
+WM8350_NAMED_VOLTAGE(0, WM8350_AUXADC_USB);
+WM8350_NAMED_VOLTAGE(1, WM8350_AUXADC_BATT);
+WM8350_NAMED_VOLTAGE(2, WM8350_AUXADC_LINE);
+
+static struct attribute *wm8350_attributes[] = {
+	&dev_attr_name.attr,
+
+	&sensor_dev_attr_in0_input.dev_attr.attr,
+	&sensor_dev_attr_in0_label.dev_attr.attr,
+	&sensor_dev_attr_in1_input.dev_attr.attr,
+	&sensor_dev_attr_in1_label.dev_attr.attr,
+	&sensor_dev_attr_in2_input.dev_attr.attr,
+	&sensor_dev_attr_in2_label.dev_attr.attr,
+
+	NULL,
+};
+
+static const struct attribute_group wm8350_attr_group = {
+	.attrs	= wm8350_attributes,
+};
+
+static int __devinit wm8350_hwmon_probe(struct platform_device *pdev)
+{
+	struct wm8350 *wm8350 = platform_get_drvdata(pdev);
+	int ret;
+
+	ret = sysfs_create_group(&pdev->dev.kobj, &wm8350_attr_group);
+	if (ret)
+		goto err;
+
+	wm8350->hwmon.classdev = hwmon_device_register(&pdev->dev);
+	if (IS_ERR(wm8350->hwmon.classdev)) {
+		ret = PTR_ERR(wm8350->hwmon.classdev);
+		goto err_group;
+	}
+
+	return 0;
+
+err_group:
+	sysfs_remove_group(&pdev->dev.kobj, &wm8350_attr_group);
+err:
+	return ret;
+}
+
+static int __devexit wm8350_hwmon_remove(struct platform_device *pdev)
+{
+	struct wm8350 *wm8350 = platform_get_drvdata(pdev);
+
+	hwmon_device_unregister(wm8350->hwmon.classdev);
+	sysfs_remove_group(&pdev->dev.kobj, &wm8350_attr_group);
+
+	return 0;
+}
+
+static struct platform_driver wm8350_hwmon_driver = {
+	.probe = wm8350_hwmon_probe,
+	.remove = __devexit_p(wm8350_hwmon_remove),
+	.driver = {
+		.name = "wm8350-hwmon",
+		.owner = THIS_MODULE,
+	},
+};
+
+static int __init wm8350_hwmon_init(void)
+{
+	return platform_driver_register(&wm8350_hwmon_driver);
+}
+module_init(wm8350_hwmon_init);
+
+static void __exit wm8350_hwmon_exit(void)
+{
+	platform_driver_unregister(&wm8350_hwmon_driver);
+}
+module_exit(wm8350_hwmon_exit);
+
+MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com>");
+MODULE_DESCRIPTION("WM8350 Hardware Monitoring");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:wm8350-hwmon");
diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c
index fe24079387c5..9d662a576a41 100644
--- a/drivers/mfd/wm8350-core.c
+++ b/drivers/mfd/wm8350-core.c
@@ -1472,6 +1472,8 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq,
 				   &(wm8350->codec.pdev));
 	wm8350_client_dev_register(wm8350, "wm8350-gpio",
 				   &(wm8350->gpio.pdev));
+	wm8350_client_dev_register(wm8350, "wm8350-hwmon",
+				   &(wm8350->hwmon.pdev));
 	wm8350_client_dev_register(wm8350, "wm8350-power",
 				   &(wm8350->power.pdev));
 	wm8350_client_dev_register(wm8350, "wm8350-rtc", &(wm8350->rtc.pdev));
@@ -1498,6 +1500,7 @@ void wm8350_device_exit(struct wm8350 *wm8350)
 	platform_device_unregister(wm8350->wdt.pdev);
 	platform_device_unregister(wm8350->rtc.pdev);
 	platform_device_unregister(wm8350->power.pdev);
+	platform_device_unregister(wm8350->hwmon.pdev);
 	platform_device_unregister(wm8350->gpio.pdev);
 	platform_device_unregister(wm8350->codec.pdev);
 
diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h
index 42cca672f340..969b0b55615c 100644
--- a/include/linux/mfd/wm8350/core.h
+++ b/include/linux/mfd/wm8350/core.h
@@ -605,6 +605,11 @@ struct wm8350_irq {
 	void *data;
 };
 
+struct wm8350_hwmon {
+	struct platform_device *pdev;
+	struct device *classdev;
+};
+
 struct wm8350 {
 	struct device *dev;
 
@@ -629,6 +634,7 @@ struct wm8350 {
 	/* Client devices */
 	struct wm8350_codec codec;
 	struct wm8350_gpio gpio;
+	struct wm8350_hwmon hwmon;
 	struct wm8350_pmic pmic;
 	struct wm8350_power power;
 	struct wm8350_rtc rtc;
-- 
cgit v1.2.3-71-gd317


From ed52e62ebec9e703eb0b69704feaf1b6e847d882 Mon Sep 17 00:00:00 2001
From: Paul Fertser <fercerpav@gmail.com>
Date: Tue, 28 Jul 2009 00:41:15 +0400
Subject: mfd: use a dedicated workqueue for pcf50633 irq processing

Using the default kernel "events" workqueue causes problems with
synchronous adc readings if initiated from some task on the same
workqueue.

I had a deadlock trying to use pcf50633_adc_sync_read from a
power_supply class driver because the reading was initiated from the
workqueue and it waited for the irq processing to complete (to get the
result) and that was put on the same workqueue.

Signed-off-by: Paul Fertser <fercerpav@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/pcf50633-core.c       | 5 ++++-
 include/linux/mfd/pcf50633/core.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c
index 8d3c38bf9714..d26d7747175e 100644
--- a/drivers/mfd/pcf50633-core.c
+++ b/drivers/mfd/pcf50633-core.c
@@ -444,7 +444,7 @@ static irqreturn_t pcf50633_irq(int irq, void *data)
 
 	get_device(pcf->dev);
 	disable_irq_nosync(pcf->irq);
-	schedule_work(&pcf->irq_work);
+	queue_work(pcf->work_queue, &pcf->irq_work);
 
 	return IRQ_HANDLED;
 }
@@ -575,6 +575,7 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 	pcf->dev = &client->dev;
 	pcf->i2c_client = client;
 	pcf->irq = client->irq;
+	pcf->work_queue = create_singlethread_workqueue("pcf50633");
 
 	INIT_WORK(&pcf->irq_work, pcf50633_irq_worker);
 
@@ -651,6 +652,7 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 	return 0;
 
 err:
+	destroy_workqueue(pcf->work_queue);
 	kfree(pcf);
 	return ret;
 }
@@ -661,6 +663,7 @@ static int __devexit pcf50633_remove(struct i2c_client *client)
 	int i;
 
 	free_irq(pcf->irq, pcf);
+	destroy_workqueue(pcf->work_queue);
 
 	platform_device_unregister(pcf->input_pdev);
 	platform_device_unregister(pcf->rtc_pdev);
diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h
index c8f51c3c0a72..9aba7b779fbc 100644
--- a/include/linux/mfd/pcf50633/core.h
+++ b/include/linux/mfd/pcf50633/core.h
@@ -136,6 +136,7 @@ struct pcf50633 {
 	int irq;
 	struct pcf50633_irq irq_handler[PCF50633_NUM_IRQ];
 	struct work_struct irq_work;
+	struct workqueue_struct *work_queue;
 	struct mutex lock;
 
 	u8 mask_regs[5];
-- 
cgit v1.2.3-71-gd317


From 8d360d8c03e1e8514bbaf606b1cd3b818dfc445d Mon Sep 17 00:00:00 2001
From: Paul Fertser <fercerpav@gmail.com>
Date: Tue, 28 Jul 2009 01:09:04 +0400
Subject: mfd: fix wrong define for 10bit pcf50633 ADC mode

The 10 bits definition was the 8 bits one.

Signed-off-by: Paul Fertser <fercerpav@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/linux/mfd/pcf50633/adc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/pcf50633/adc.h b/include/linux/mfd/pcf50633/adc.h
index 56669b4183ad..b35e62801ffa 100644
--- a/include/linux/mfd/pcf50633/adc.h
+++ b/include/linux/mfd/pcf50633/adc.h
@@ -25,7 +25,8 @@
 #define PCF50633_REG_ADCS3		0x57
 
 #define PCF50633_ADCC1_ADCSTART		0x01
-#define PCF50633_ADCC1_RES_10BIT	0x02
+#define PCF50633_ADCC1_RES_8BIT		0x02
+#define PCF50633_ADCC1_RES_10BIT	0x00
 #define PCF50633_ADCC1_AVERAGE_NO	0x00
 #define PCF50633_ADCC1_AVERAGE_4	0x04
 #define PCF50633_ADCC1_AVERAGE_8	0x08
-- 
cgit v1.2.3-71-gd317


From 327bc3a3efa408fb285948bfef112a6c58dfb375 Mon Sep 17 00:00:00 2001
From: Daniel Ribeiro <drwyrm@gmail.com>
Date: Fri, 31 Jul 2009 12:38:02 +0200
Subject: mfd: Remove VIB defines from pcap header file

Vibrator will be accessed via the pcap-regulator driver, no need to expose its
bits in the header file.

Signed-off-by: Daniel Ribeiro <drwyrm@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/linux/mfd/ezx-pcap.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h
index dec82b0b05f9..e5124ceea769 100644
--- a/include/linux/mfd/ezx-pcap.h
+++ b/include/linux/mfd/ezx-pcap.h
@@ -227,7 +227,6 @@ void pcap_set_ts_bits(struct pcap_chip *, u32);
 #define PCAP_LED1		1
 #define PCAP_BL0		2
 #define PCAP_BL1		3
-#define PCAP_VIB		4
 #define PCAP_LED_3MA		0
 #define PCAP_LED_4MA		1
 #define PCAP_LED_5MA		2
@@ -246,9 +245,6 @@ void pcap_set_ts_bits(struct pcap_chip *, u32);
 #define PCAP_LED0_C_SHIFT	15
 #define PCAP_LED1_C_SHIFT	17
 #define PCAP_BL1_SHIFT		20
-#define PCAP_VIB_MASK		0x3
-#define PCAP_VIB_SHIFT		20
-#define PCAP_VIB_EN		(1 << 19)
 
 /* RTC */
 #define PCAP_RTC_DAY_MASK	0x3fff
-- 
cgit v1.2.3-71-gd317


From 3bed6e415fc2cbf8d706848a62a48aebe84435e5 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:45:51 +0100
Subject: mfd: Allow multiple MFD cells with the same name

Provide basic support for MFDs having multiple cells of a given
type with different IDs by adding an id to the mfd_cell structure
and then adding that to the id passed in to mfd_add_devices().

As it stands this approach requires that MFDs using this feature
deal with ensuring that there aren't any ID collisions resulting
from multiple MFDs of the same type being instantiated. This needs
to happen with the existing code too, but with this approach there
is a knock on effect on the IDs for non-duplicated devices.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/mfd-core.c   | 2 +-
 include/linux/mfd/core.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c
index 54ddf3772e0c..ae15e495e20e 100644
--- a/drivers/mfd/mfd-core.c
+++ b/drivers/mfd/mfd-core.c
@@ -25,7 +25,7 @@ static int mfd_add_device(struct device *parent, int id,
 	int ret = -ENOMEM;
 	int r;
 
-	pdev = platform_device_alloc(cell->name, id);
+	pdev = platform_device_alloc(cell->name, id + cell->id);
 	if (!pdev)
 		goto fail_alloc;
 
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 49ef857cdb2d..11d740b8831d 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -23,6 +23,7 @@
  */
 struct mfd_cell {
 	const char		*name;
+	int			id;
 
 	int			(*enable)(struct platform_device *dev);
 	int			(*disable)(struct platform_device *dev);
-- 
cgit v1.2.3-71-gd317


From d2bedfe7a8b2f34beee2cad9cae74a088ee8ed07 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:45:52 +0100
Subject: mfd: Initial core support for WM831x series devices

The WM831x series of devices are register compatible processor power
management subsystems, providing regulator and power path management
facilities along with other services like watchdog, RTC and touch
panel controllers.

This patch adds very basic support, providing basic single register
I2C access, handling of the security key and registration of the
devices.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c        | 1357 ++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/core.h  |  247 +++++++
 include/linux/mfd/wm831x/pdata.h |  107 +++
 3 files changed, 1711 insertions(+)
 create mode 100644 drivers/mfd/wm831x-core.c
 create mode 100644 include/linux/mfd/wm831x/core.h
 create mode 100644 include/linux/mfd/wm831x/pdata.h

(limited to 'include/linux')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
new file mode 100644
index 000000000000..cc1040c9d46c
--- /dev/null
+++ b/drivers/mfd/wm831x-core.c
@@ -0,0 +1,1357 @@
+/*
+ * wm831x-core.c  --  Device access for Wolfson WM831x PMICs
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/i2c.h>
+#include <linux/mfd/core.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/pdata.h>
+
+enum wm831x_parent {
+	WM8310 = 0,
+	WM8311 = 1,
+	WM8312 = 2,
+};
+
+static int wm831x_reg_locked(struct wm831x *wm831x, unsigned short reg)
+{
+	if (!wm831x->locked)
+		return 0;
+
+	switch (reg) {
+	case WM831X_WATCHDOG:
+	case WM831X_DC4_CONTROL:
+	case WM831X_ON_PIN_CONTROL:
+	case WM831X_BACKUP_CHARGER_CONTROL:
+	case WM831X_CHARGER_CONTROL_1:
+	case WM831X_CHARGER_CONTROL_2:
+		return 1;
+
+	default:
+		return 0;
+	}
+}
+
+/**
+ * wm831x_reg_unlock: Unlock user keyed registers
+ *
+ * The WM831x has a user key preventing writes to particularly
+ * critical registers.  This function locks those registers,
+ * allowing writes to them.
+ */
+void wm831x_reg_lock(struct wm831x *wm831x)
+{
+	int ret;
+
+	ret = wm831x_reg_write(wm831x, WM831X_SECURITY_KEY, 0);
+	if (ret == 0) {
+		dev_vdbg(wm831x->dev, "Registers locked\n");
+
+		mutex_lock(&wm831x->io_lock);
+		WARN_ON(wm831x->locked);
+		wm831x->locked = 1;
+		mutex_unlock(&wm831x->io_lock);
+	} else {
+		dev_err(wm831x->dev, "Failed to lock registers: %d\n", ret);
+	}
+
+}
+EXPORT_SYMBOL_GPL(wm831x_reg_lock);
+
+/**
+ * wm831x_reg_unlock: Unlock user keyed registers
+ *
+ * The WM831x has a user key preventing writes to particularly
+ * critical registers.  This function locks those registers,
+ * preventing spurious writes.
+ */
+int wm831x_reg_unlock(struct wm831x *wm831x)
+{
+	int ret;
+
+	/* 0x9716 is the value required to unlock the registers */
+	ret = wm831x_reg_write(wm831x, WM831X_SECURITY_KEY, 0x9716);
+	if (ret == 0) {
+		dev_vdbg(wm831x->dev, "Registers unlocked\n");
+
+		mutex_lock(&wm831x->io_lock);
+		WARN_ON(!wm831x->locked);
+		wm831x->locked = 0;
+		mutex_unlock(&wm831x->io_lock);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(wm831x_reg_unlock);
+
+static int wm831x_read(struct wm831x *wm831x, unsigned short reg,
+		       int bytes, void *dest)
+{
+	int ret, i;
+	u16 *buf = dest;
+
+	BUG_ON(bytes % 2);
+	BUG_ON(bytes <= 0);
+
+	ret = wm831x->read_dev(wm831x, reg, bytes, dest);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < bytes / 2; i++) {
+		buf[i] = be16_to_cpu(buf[i]);
+
+		dev_vdbg(wm831x->dev, "Read %04x from R%d(0x%x)\n",
+			 buf[i], reg + i, reg + i);
+	}
+
+	return 0;
+}
+
+/**
+ * wm831x_reg_read: Read a single WM831x register.
+ *
+ * @wm831x: Device to read from.
+ * @reg: Register to read.
+ */
+int wm831x_reg_read(struct wm831x *wm831x, unsigned short reg)
+{
+	unsigned short val;
+	int ret;
+
+	mutex_lock(&wm831x->io_lock);
+
+	ret = wm831x_read(wm831x, reg, 2, &val);
+
+	mutex_unlock(&wm831x->io_lock);
+
+	if (ret < 0)
+		return ret;
+	else
+		return val;
+}
+EXPORT_SYMBOL_GPL(wm831x_reg_read);
+
+/**
+ * wm831x_bulk_read: Read multiple WM831x registers
+ *
+ * @wm831x: Device to read from
+ * @reg: First register
+ * @count: Number of registers
+ * @buf: Buffer to fill.
+ */
+int wm831x_bulk_read(struct wm831x *wm831x, unsigned short reg,
+		     int count, u16 *buf)
+{
+	int ret;
+
+	mutex_lock(&wm831x->io_lock);
+
+	ret = wm831x_read(wm831x, reg, count * 2, buf);
+
+	mutex_unlock(&wm831x->io_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(wm831x_bulk_read);
+
+static int wm831x_write(struct wm831x *wm831x, unsigned short reg,
+			int bytes, void *src)
+{
+	u16 *buf = src;
+	int i;
+
+	BUG_ON(bytes % 2);
+	BUG_ON(bytes <= 0);
+
+	for (i = 0; i < bytes / 2; i++) {
+		if (wm831x_reg_locked(wm831x, reg))
+			return -EPERM;
+
+		dev_vdbg(wm831x->dev, "Write %04x to R%d(0x%x)\n",
+			 buf[i], reg + i, reg + i);
+
+		buf[i] = cpu_to_be16(buf[i]);
+	}
+
+	return wm831x->write_dev(wm831x, reg, bytes, src);
+}
+
+/**
+ * wm831x_reg_write: Write a single WM831x register.
+ *
+ * @wm831x: Device to write to.
+ * @reg: Register to write to.
+ * @val: Value to write.
+ */
+int wm831x_reg_write(struct wm831x *wm831x, unsigned short reg,
+		     unsigned short val)
+{
+	int ret;
+
+	mutex_lock(&wm831x->io_lock);
+
+	ret = wm831x_write(wm831x, reg, 2, &val);
+
+	mutex_unlock(&wm831x->io_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(wm831x_reg_write);
+
+/**
+ * wm831x_set_bits: Set the value of a bitfield in a WM831x register
+ *
+ * @wm831x: Device to write to.
+ * @reg: Register to write to.
+ * @mask: Mask of bits to set.
+ * @val: Value to set (unshifted)
+ */
+int wm831x_set_bits(struct wm831x *wm831x, unsigned short reg,
+		    unsigned short mask, unsigned short val)
+{
+	int ret;
+	u16 r;
+
+	mutex_lock(&wm831x->io_lock);
+
+	ret = wm831x_read(wm831x, reg, 2, &r);
+	if (ret < 0)
+		goto out;
+
+	r &= ~mask;
+	r |= val;
+
+	ret = wm831x_write(wm831x, reg, 2, &r);
+
+out:
+	mutex_unlock(&wm831x->io_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(wm831x_set_bits);
+
+static struct resource wm831x_dcdc1_resources[] = {
+	{
+		.start = WM831X_DC1_CONTROL_1,
+		.end   = WM831X_DC1_DVS_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_DC1,
+		.end   = WM831X_IRQ_UV_DC1,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "HC",
+		.start = WM831X_IRQ_HC_DC1,
+		.end   = WM831X_IRQ_HC_DC1,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+
+static struct resource wm831x_dcdc2_resources[] = {
+	{
+		.start = WM831X_DC2_CONTROL_1,
+		.end   = WM831X_DC2_DVS_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_DC2,
+		.end   = WM831X_IRQ_UV_DC2,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "HC",
+		.start = WM831X_IRQ_HC_DC2,
+		.end   = WM831X_IRQ_HC_DC2,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_dcdc3_resources[] = {
+	{
+		.start = WM831X_DC3_CONTROL_1,
+		.end   = WM831X_DC3_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_DC3,
+		.end   = WM831X_IRQ_UV_DC3,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_dcdc4_resources[] = {
+	{
+		.start = WM831X_DC4_CONTROL,
+		.end   = WM831X_DC4_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_DC4,
+		.end   = WM831X_IRQ_UV_DC4,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_gpio_resources[] = {
+	{
+		.start = WM831X_IRQ_GPIO_1,
+		.end   = WM831X_IRQ_GPIO_16,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_isink1_resources[] = {
+	{
+		.start = WM831X_CURRENT_SINK_1,
+		.end   = WM831X_CURRENT_SINK_1,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.start = WM831X_IRQ_CS1,
+		.end   = WM831X_IRQ_CS1,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_isink2_resources[] = {
+	{
+		.start = WM831X_CURRENT_SINK_2,
+		.end   = WM831X_CURRENT_SINK_2,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.start = WM831X_IRQ_CS2,
+		.end   = WM831X_IRQ_CS2,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo1_resources[] = {
+	{
+		.start = WM831X_LDO1_CONTROL,
+		.end   = WM831X_LDO1_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO1,
+		.end   = WM831X_IRQ_UV_LDO1,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo2_resources[] = {
+	{
+		.start = WM831X_LDO2_CONTROL,
+		.end   = WM831X_LDO2_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO2,
+		.end   = WM831X_IRQ_UV_LDO2,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo3_resources[] = {
+	{
+		.start = WM831X_LDO3_CONTROL,
+		.end   = WM831X_LDO3_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO3,
+		.end   = WM831X_IRQ_UV_LDO3,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo4_resources[] = {
+	{
+		.start = WM831X_LDO4_CONTROL,
+		.end   = WM831X_LDO4_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO4,
+		.end   = WM831X_IRQ_UV_LDO4,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo5_resources[] = {
+	{
+		.start = WM831X_LDO5_CONTROL,
+		.end   = WM831X_LDO5_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO5,
+		.end   = WM831X_IRQ_UV_LDO5,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo6_resources[] = {
+	{
+		.start = WM831X_LDO6_CONTROL,
+		.end   = WM831X_LDO6_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO6,
+		.end   = WM831X_IRQ_UV_LDO6,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo7_resources[] = {
+	{
+		.start = WM831X_LDO7_CONTROL,
+		.end   = WM831X_LDO7_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO7,
+		.end   = WM831X_IRQ_UV_LDO7,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo8_resources[] = {
+	{
+		.start = WM831X_LDO8_CONTROL,
+		.end   = WM831X_LDO8_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO8,
+		.end   = WM831X_IRQ_UV_LDO8,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo9_resources[] = {
+	{
+		.start = WM831X_LDO9_CONTROL,
+		.end   = WM831X_LDO9_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO9,
+		.end   = WM831X_IRQ_UV_LDO9,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo10_resources[] = {
+	{
+		.start = WM831X_LDO10_CONTROL,
+		.end   = WM831X_LDO10_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+	{
+		.name  = "UV",
+		.start = WM831X_IRQ_UV_LDO10,
+		.end   = WM831X_IRQ_UV_LDO10,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_ldo11_resources[] = {
+	{
+		.start = WM831X_LDO11_ON_CONTROL,
+		.end   = WM831X_LDO11_SLEEP_CONTROL,
+		.flags = IORESOURCE_IO,
+	},
+};
+
+static struct resource wm831x_on_resources[] = {
+	{
+		.start = WM831X_IRQ_ON,
+		.end   = WM831X_IRQ_ON,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+
+static struct resource wm831x_power_resources[] = {
+	{
+		.name = "SYSLO",
+		.start = WM831X_IRQ_PPM_SYSLO,
+		.end   = WM831X_IRQ_PPM_SYSLO,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "PWR SRC",
+		.start = WM831X_IRQ_PPM_PWR_SRC,
+		.end   = WM831X_IRQ_PPM_PWR_SRC,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "USB CURR",
+		.start = WM831X_IRQ_PPM_USB_CURR,
+		.end   = WM831X_IRQ_PPM_USB_CURR,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "BATT HOT",
+		.start = WM831X_IRQ_CHG_BATT_HOT,
+		.end   = WM831X_IRQ_CHG_BATT_HOT,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "BATT COLD",
+		.start = WM831X_IRQ_CHG_BATT_COLD,
+		.end   = WM831X_IRQ_CHG_BATT_COLD,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "BATT FAIL",
+		.start = WM831X_IRQ_CHG_BATT_FAIL,
+		.end   = WM831X_IRQ_CHG_BATT_FAIL,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "OV",
+		.start = WM831X_IRQ_CHG_OV,
+		.end   = WM831X_IRQ_CHG_OV,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "END",
+		.start = WM831X_IRQ_CHG_END,
+		.end   = WM831X_IRQ_CHG_END,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "TO",
+		.start = WM831X_IRQ_CHG_TO,
+		.end   = WM831X_IRQ_CHG_TO,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "MODE",
+		.start = WM831X_IRQ_CHG_MODE,
+		.end   = WM831X_IRQ_CHG_MODE,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "START",
+		.start = WM831X_IRQ_CHG_START,
+		.end   = WM831X_IRQ_CHG_START,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_rtc_resources[] = {
+	{
+		.name = "PER",
+		.start = WM831X_IRQ_RTC_PER,
+		.end   = WM831X_IRQ_RTC_PER,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "ALM",
+		.start = WM831X_IRQ_RTC_ALM,
+		.end   = WM831X_IRQ_RTC_ALM,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_status1_resources[] = {
+	{
+		.start = WM831X_STATUS_LED_1,
+		.end   = WM831X_STATUS_LED_1,
+		.flags = IORESOURCE_IO,
+	},
+};
+
+static struct resource wm831x_status2_resources[] = {
+	{
+		.start = WM831X_STATUS_LED_2,
+		.end   = WM831X_STATUS_LED_2,
+		.flags = IORESOURCE_IO,
+	},
+};
+
+static struct resource wm831x_touch_resources[] = {
+	{
+		.name = "TCHPD",
+		.start = WM831X_IRQ_TCHPD,
+		.end   = WM831X_IRQ_TCHPD,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name = "TCHDATA",
+		.start = WM831X_IRQ_TCHDATA,
+		.end   = WM831X_IRQ_TCHDATA,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct resource wm831x_wdt_resources[] = {
+	{
+		.start = WM831X_IRQ_WDOG_TO,
+		.end   = WM831X_IRQ_WDOG_TO,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct mfd_cell wm8310_devs[] = {
+	{
+		.name = "wm831x-buckv",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc1_resources),
+		.resources = wm831x_dcdc1_resources,
+	},
+	{
+		.name = "wm831x-buckv",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc2_resources),
+		.resources = wm831x_dcdc2_resources,
+	},
+	{
+		.name = "wm831x-buckp",
+		.id = 3,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc3_resources),
+		.resources = wm831x_dcdc3_resources,
+	},
+	{
+		.name = "wm831x-boostp",
+		.id = 4,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc4_resources),
+		.resources = wm831x_dcdc4_resources,
+	},
+	{
+		.name = "wm831x-epe",
+		.id = 1,
+	},
+	{
+		.name = "wm831x-epe",
+		.id = 2,
+	},
+	{
+		.name = "wm831x-gpio",
+		.num_resources = ARRAY_SIZE(wm831x_gpio_resources),
+		.resources = wm831x_gpio_resources,
+	},
+	{
+		.name = "wm831x-hwmon",
+	},
+	{
+		.name = "wm831x-isink",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_isink1_resources),
+		.resources = wm831x_isink1_resources,
+	},
+	{
+		.name = "wm831x-isink",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_isink2_resources),
+		.resources = wm831x_isink2_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_ldo1_resources),
+		.resources = wm831x_ldo1_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_ldo2_resources),
+		.resources = wm831x_ldo2_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 3,
+		.num_resources = ARRAY_SIZE(wm831x_ldo3_resources),
+		.resources = wm831x_ldo3_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 4,
+		.num_resources = ARRAY_SIZE(wm831x_ldo4_resources),
+		.resources = wm831x_ldo4_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 5,
+		.num_resources = ARRAY_SIZE(wm831x_ldo5_resources),
+		.resources = wm831x_ldo5_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 6,
+		.num_resources = ARRAY_SIZE(wm831x_ldo6_resources),
+		.resources = wm831x_ldo6_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 7,
+		.num_resources = ARRAY_SIZE(wm831x_ldo7_resources),
+		.resources = wm831x_ldo7_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 8,
+		.num_resources = ARRAY_SIZE(wm831x_ldo8_resources),
+		.resources = wm831x_ldo8_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 9,
+		.num_resources = ARRAY_SIZE(wm831x_ldo9_resources),
+		.resources = wm831x_ldo9_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 10,
+		.num_resources = ARRAY_SIZE(wm831x_ldo10_resources),
+		.resources = wm831x_ldo10_resources,
+	},
+	{
+		.name = "wm831x-alive-ldo",
+		.id = 11,
+		.num_resources = ARRAY_SIZE(wm831x_ldo11_resources),
+		.resources = wm831x_ldo11_resources,
+	},
+	{
+		.name = "wm831x-on",
+		.num_resources = ARRAY_SIZE(wm831x_on_resources),
+		.resources = wm831x_on_resources,
+	},
+	{
+		.name = "wm831x-power",
+		.num_resources = ARRAY_SIZE(wm831x_power_resources),
+		.resources = wm831x_power_resources,
+	},
+	{
+		.name = "wm831x-rtc",
+		.num_resources = ARRAY_SIZE(wm831x_rtc_resources),
+		.resources = wm831x_rtc_resources,
+	},
+	{
+		.name = "wm831x-status",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_status1_resources),
+		.resources = wm831x_status1_resources,
+	},
+	{
+		.name = "wm831x-status",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_status2_resources),
+		.resources = wm831x_status2_resources,
+	},
+	{
+		.name = "wm831x-watchdog",
+		.num_resources = ARRAY_SIZE(wm831x_wdt_resources),
+		.resources = wm831x_wdt_resources,
+	},
+};
+
+static struct mfd_cell wm8311_devs[] = {
+	{
+		.name = "wm831x-buckv",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc1_resources),
+		.resources = wm831x_dcdc1_resources,
+	},
+	{
+		.name = "wm831x-buckv",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc2_resources),
+		.resources = wm831x_dcdc2_resources,
+	},
+	{
+		.name = "wm831x-buckp",
+		.id = 3,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc3_resources),
+		.resources = wm831x_dcdc3_resources,
+	},
+	{
+		.name = "wm831x-boostp",
+		.id = 4,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc4_resources),
+		.resources = wm831x_dcdc4_resources,
+	},
+	{
+		.name = "wm831x-epe",
+		.id = 1,
+	},
+	{
+		.name = "wm831x-epe",
+		.id = 2,
+	},
+	{
+		.name = "wm831x-gpio",
+		.num_resources = ARRAY_SIZE(wm831x_gpio_resources),
+		.resources = wm831x_gpio_resources,
+	},
+	{
+		.name = "wm831x-hwmon",
+	},
+	{
+		.name = "wm831x-isink",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_isink1_resources),
+		.resources = wm831x_isink1_resources,
+	},
+	{
+		.name = "wm831x-isink",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_isink2_resources),
+		.resources = wm831x_isink2_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_ldo1_resources),
+		.resources = wm831x_ldo1_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_ldo2_resources),
+		.resources = wm831x_ldo2_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 3,
+		.num_resources = ARRAY_SIZE(wm831x_ldo3_resources),
+		.resources = wm831x_ldo3_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 4,
+		.num_resources = ARRAY_SIZE(wm831x_ldo4_resources),
+		.resources = wm831x_ldo4_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 5,
+		.num_resources = ARRAY_SIZE(wm831x_ldo5_resources),
+		.resources = wm831x_ldo5_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 7,
+		.num_resources = ARRAY_SIZE(wm831x_ldo7_resources),
+		.resources = wm831x_ldo7_resources,
+	},
+	{
+		.name = "wm831x-alive-ldo",
+		.id = 11,
+		.num_resources = ARRAY_SIZE(wm831x_ldo11_resources),
+		.resources = wm831x_ldo11_resources,
+	},
+	{
+		.name = "wm831x-on",
+		.num_resources = ARRAY_SIZE(wm831x_on_resources),
+		.resources = wm831x_on_resources,
+	},
+	{
+		.name = "wm831x-power",
+		.num_resources = ARRAY_SIZE(wm831x_power_resources),
+		.resources = wm831x_power_resources,
+	},
+	{
+		.name = "wm831x-rtc",
+		.num_resources = ARRAY_SIZE(wm831x_rtc_resources),
+		.resources = wm831x_rtc_resources,
+	},
+	{
+		.name = "wm831x-status",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_status1_resources),
+		.resources = wm831x_status1_resources,
+	},
+	{
+		.name = "wm831x-status",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_status2_resources),
+		.resources = wm831x_status2_resources,
+	},
+	{
+		.name = "wm831x-touch",
+		.num_resources = ARRAY_SIZE(wm831x_touch_resources),
+		.resources = wm831x_touch_resources,
+	},
+	{
+		.name = "wm831x-watchdog",
+		.num_resources = ARRAY_SIZE(wm831x_wdt_resources),
+		.resources = wm831x_wdt_resources,
+	},
+};
+
+static struct mfd_cell wm8312_devs[] = {
+	{
+		.name = "wm831x-buckv",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc1_resources),
+		.resources = wm831x_dcdc1_resources,
+	},
+	{
+		.name = "wm831x-buckv",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc2_resources),
+		.resources = wm831x_dcdc2_resources,
+	},
+	{
+		.name = "wm831x-buckp",
+		.id = 3,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc3_resources),
+		.resources = wm831x_dcdc3_resources,
+	},
+	{
+		.name = "wm831x-boostp",
+		.id = 4,
+		.num_resources = ARRAY_SIZE(wm831x_dcdc4_resources),
+		.resources = wm831x_dcdc4_resources,
+	},
+	{
+		.name = "wm831x-epe",
+		.id = 1,
+	},
+	{
+		.name = "wm831x-epe",
+		.id = 2,
+	},
+	{
+		.name = "wm831x-gpio",
+		.num_resources = ARRAY_SIZE(wm831x_gpio_resources),
+		.resources = wm831x_gpio_resources,
+	},
+	{
+		.name = "wm831x-hwmon",
+	},
+	{
+		.name = "wm831x-isink",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_isink1_resources),
+		.resources = wm831x_isink1_resources,
+	},
+	{
+		.name = "wm831x-isink",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_isink2_resources),
+		.resources = wm831x_isink2_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_ldo1_resources),
+		.resources = wm831x_ldo1_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_ldo2_resources),
+		.resources = wm831x_ldo2_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 3,
+		.num_resources = ARRAY_SIZE(wm831x_ldo3_resources),
+		.resources = wm831x_ldo3_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 4,
+		.num_resources = ARRAY_SIZE(wm831x_ldo4_resources),
+		.resources = wm831x_ldo4_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 5,
+		.num_resources = ARRAY_SIZE(wm831x_ldo5_resources),
+		.resources = wm831x_ldo5_resources,
+	},
+	{
+		.name = "wm831x-ldo",
+		.id = 6,
+		.num_resources = ARRAY_SIZE(wm831x_ldo6_resources),
+		.resources = wm831x_ldo6_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 7,
+		.num_resources = ARRAY_SIZE(wm831x_ldo7_resources),
+		.resources = wm831x_ldo7_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 8,
+		.num_resources = ARRAY_SIZE(wm831x_ldo8_resources),
+		.resources = wm831x_ldo8_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 9,
+		.num_resources = ARRAY_SIZE(wm831x_ldo9_resources),
+		.resources = wm831x_ldo9_resources,
+	},
+	{
+		.name = "wm831x-aldo",
+		.id = 10,
+		.num_resources = ARRAY_SIZE(wm831x_ldo10_resources),
+		.resources = wm831x_ldo10_resources,
+	},
+	{
+		.name = "wm831x-alive-ldo",
+		.id = 11,
+		.num_resources = ARRAY_SIZE(wm831x_ldo11_resources),
+		.resources = wm831x_ldo11_resources,
+	},
+	{
+		.name = "wm831x-on",
+		.num_resources = ARRAY_SIZE(wm831x_on_resources),
+		.resources = wm831x_on_resources,
+	},
+	{
+		.name = "wm831x-power",
+		.num_resources = ARRAY_SIZE(wm831x_power_resources),
+		.resources = wm831x_power_resources,
+	},
+	{
+		.name = "wm831x-rtc",
+		.num_resources = ARRAY_SIZE(wm831x_rtc_resources),
+		.resources = wm831x_rtc_resources,
+	},
+	{
+		.name = "wm831x-status",
+		.id = 1,
+		.num_resources = ARRAY_SIZE(wm831x_status1_resources),
+		.resources = wm831x_status1_resources,
+	},
+	{
+		.name = "wm831x-status",
+		.id = 2,
+		.num_resources = ARRAY_SIZE(wm831x_status2_resources),
+		.resources = wm831x_status2_resources,
+	},
+	{
+		.name = "wm831x-touch",
+		.num_resources = ARRAY_SIZE(wm831x_touch_resources),
+		.resources = wm831x_touch_resources,
+	},
+	{
+		.name = "wm831x-watchdog",
+		.num_resources = ARRAY_SIZE(wm831x_wdt_resources),
+		.resources = wm831x_wdt_resources,
+	},
+};
+
+/*
+ * Instantiate the generic non-control parts of the device.
+ */
+static int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
+{
+	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
+	int rev;
+	enum wm831x_parent parent;
+	int ret;
+
+	mutex_init(&wm831x->io_lock);
+	mutex_init(&wm831x->key_lock);
+	dev_set_drvdata(wm831x->dev, wm831x);
+
+	ret = wm831x_reg_read(wm831x, WM831X_PARENT_ID);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to read parent ID: %d\n", ret);
+		goto err;
+	}
+	if (ret != 0x6204) {
+		dev_err(wm831x->dev, "Device is not a WM831x: ID %x\n", ret);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = wm831x_reg_read(wm831x, WM831X_REVISION);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to read revision: %d\n", ret);
+		goto err;
+	}
+	rev = (ret & WM831X_PARENT_REV_MASK) >> WM831X_PARENT_REV_SHIFT;
+
+	ret = wm831x_reg_read(wm831x, WM831X_RESET_ID);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to read device ID: %d\n", ret);
+		goto err;
+	}
+
+	switch (ret) {
+	case 0x8310:
+		parent = WM8310;
+		switch (rev) {
+		case 0:
+			dev_info(wm831x->dev, "WM8310 revision %c\n",
+				 'A' + rev);
+			break;
+		}
+		break;
+
+	case 0x8311:
+		parent = WM8311;
+		switch (rev) {
+		case 0:
+			dev_info(wm831x->dev, "WM8311 revision %c\n",
+				 'A' + rev);
+			break;
+		}
+		break;
+
+	case 0x8312:
+		parent = WM8312;
+		switch (rev) {
+		case 0:
+			dev_info(wm831x->dev, "WM8312 revision %c\n",
+				 'A' + rev);
+			break;
+		}
+		break;
+
+	case 0:
+		/* Some engineering samples do not have the ID set,
+		 * rely on the device being registered correctly.
+		 * This will need revisiting for future devices with
+		 * multiple dies.
+		 */
+		parent = id;
+		switch (rev) {
+		case 0:
+			dev_info(wm831x->dev, "WM831%d ES revision %c\n",
+				 parent, 'A' + rev);
+			break;
+		}
+		break;
+
+	default:
+		dev_err(wm831x->dev, "Unknown WM831x device %04x\n", ret);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* This will need revisiting in future but is OK for all
+	 * current parts.
+	 */
+	if (parent != id)
+		dev_warn(wm831x->dev, "Device was registered as a WM831%lu\n",
+			 id);
+
+	/* Bootstrap the user key */
+	ret = wm831x_reg_read(wm831x, WM831X_SECURITY_KEY);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to read security key: %d\n", ret);
+		goto err;
+	}
+	if (ret != 0) {
+		dev_warn(wm831x->dev, "Security key had non-zero value %x\n",
+			 ret);
+		wm831x_reg_write(wm831x, WM831X_SECURITY_KEY, 0);
+	}
+	wm831x->locked = 1;
+
+	if (pdata && pdata->pre_init) {
+		ret = pdata->pre_init(wm831x);
+		if (ret != 0) {
+			dev_err(wm831x->dev, "pre_init() failed: %d\n", ret);
+			goto err;
+		}
+	}
+
+	/* The core device is up, instantiate the subdevices. */
+	switch (parent) {
+	case WM8310:
+		ret = mfd_add_devices(wm831x->dev, -1,
+				      wm8310_devs, ARRAY_SIZE(wm8310_devs),
+				      NULL, 0);
+		break;
+
+	case WM8311:
+		ret = mfd_add_devices(wm831x->dev, -1,
+				      wm8311_devs, ARRAY_SIZE(wm8311_devs),
+				      NULL, 0);
+		break;
+
+	case WM8312:
+		ret = mfd_add_devices(wm831x->dev, -1,
+				      wm8312_devs, ARRAY_SIZE(wm8312_devs),
+				      NULL, 0);
+		break;
+
+	default:
+		/* If this happens the bus probe function is buggy */
+		BUG();
+	}
+
+	if (ret != 0) {
+		dev_err(wm831x->dev, "Failed to add children\n");
+		goto err;
+	}
+
+	if (pdata && pdata->post_init) {
+		ret = pdata->post_init(wm831x);
+		if (ret != 0) {
+			dev_err(wm831x->dev, "post_init() failed: %d\n", ret);
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	mfd_remove_devices(wm831x->dev);
+	kfree(wm831x);
+	return ret;
+}
+
+static void wm831x_device_exit(struct wm831x *wm831x)
+{
+	mfd_remove_devices(wm831x->dev);
+	kfree(wm831x);
+}
+
+static int wm831x_i2c_read_device(struct wm831x *wm831x, unsigned short reg,
+				  int bytes, void *dest)
+{
+	struct i2c_client *i2c = wm831x->control_data;
+	int ret;
+	u16 r = cpu_to_be16(reg);
+
+	ret = i2c_master_send(i2c, (unsigned char *)&r, 2);
+	if (ret < 0)
+		return ret;
+	if (ret != 2)
+		return -EIO;
+
+	ret = i2c_master_recv(i2c, dest, bytes);
+	if (ret < 0)
+		return ret;
+	if (ret != bytes)
+		return -EIO;
+	return 0;
+}
+
+/* Currently we allocate the write buffer on the stack; this is OK for
+ * small writes - if we need to do large writes this will need to be
+ * revised.
+ */
+static int wm831x_i2c_write_device(struct wm831x *wm831x, unsigned short reg,
+				   int bytes, void *src)
+{
+	struct i2c_client *i2c = wm831x->control_data;
+	unsigned char msg[bytes + 2];
+	int ret;
+
+	reg = cpu_to_be16(reg);
+	memcpy(&msg[0], &reg, 2);
+	memcpy(&msg[2], src, bytes);
+
+	ret = i2c_master_send(i2c, msg, bytes + 2);
+	if (ret < 0)
+		return ret;
+	if (ret < bytes + 2)
+		return -EIO;
+
+	return 0;
+}
+
+static int wm831x_i2c_probe(struct i2c_client *i2c,
+			    const struct i2c_device_id *id)
+{
+	struct wm831x *wm831x;
+
+	wm831x = kzalloc(sizeof(struct wm831x), GFP_KERNEL);
+	if (wm831x == NULL) {
+		kfree(i2c);
+		return -ENOMEM;
+	}
+
+	i2c_set_clientdata(i2c, wm831x);
+	wm831x->dev = &i2c->dev;
+	wm831x->control_data = i2c;
+	wm831x->read_dev = wm831x_i2c_read_device;
+	wm831x->write_dev = wm831x_i2c_write_device;
+
+	return wm831x_device_init(wm831x, id->driver_data, i2c->irq);
+}
+
+static int wm831x_i2c_remove(struct i2c_client *i2c)
+{
+	struct wm831x *wm831x = i2c_get_clientdata(i2c);
+
+	wm831x_device_exit(wm831x);
+
+	return 0;
+}
+
+static const struct i2c_device_id wm831x_i2c_id[] = {
+	{ "wm8310", WM8310 },
+	{ "wm8311", WM8311 },
+	{ "wm8312", WM8312 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, wm831x_i2c_id);
+
+
+static struct i2c_driver wm831x_i2c_driver = {
+	.driver = {
+		   .name = "wm831x",
+		   .owner = THIS_MODULE,
+	},
+	.probe = wm831x_i2c_probe,
+	.remove = wm831x_i2c_remove,
+	.id_table = wm831x_i2c_id,
+};
+
+static int __init wm831x_i2c_init(void)
+{
+	int ret;
+
+	ret = i2c_add_driver(&wm831x_i2c_driver);
+	if (ret != 0)
+		pr_err("Failed to register wm831x I2C driver: %d\n", ret);
+
+	return ret;
+}
+subsys_initcall(wm831x_i2c_init);
+
+static void __exit wm831x_i2c_exit(void)
+{
+	i2c_del_driver(&wm831x_i2c_driver);
+}
+module_exit(wm831x_i2c_exit);
+
+MODULE_DESCRIPTION("I2C support for the WM831X AudioPlus PMIC");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mark Brown");
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
new file mode 100644
index 000000000000..d90e693053ba
--- /dev/null
+++ b/include/linux/mfd/wm831x/core.h
@@ -0,0 +1,247 @@
+/*
+ * include/linux/mfd/wm831x/core.h -- Core interface for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_CORE_H__
+#define __MFD_WM831X_CORE_H__
+
+/*
+ * Register values.
+ */
+#define WM831X_RESET_ID                         0x00
+#define WM831X_REVISION                         0x01
+#define WM831X_PARENT_ID                        0x4000
+#define WM831X_SYSVDD_CONTROL                   0x4001
+#define WM831X_THERMAL_MONITORING               0x4002
+#define WM831X_POWER_STATE                      0x4003
+#define WM831X_WATCHDOG                         0x4004
+#define WM831X_ON_PIN_CONTROL                   0x4005
+#define WM831X_RESET_CONTROL                    0x4006
+#define WM831X_CONTROL_INTERFACE                0x4007
+#define WM831X_SECURITY_KEY                     0x4008
+#define WM831X_SOFTWARE_SCRATCH                 0x4009
+#define WM831X_OTP_CONTROL                      0x400A
+#define WM831X_GPIO_LEVEL                       0x400C
+#define WM831X_SYSTEM_STATUS                    0x400D
+#define WM831X_ON_SOURCE                        0x400E
+#define WM831X_OFF_SOURCE                       0x400F
+#define WM831X_SYSTEM_INTERRUPTS                0x4010
+#define WM831X_INTERRUPT_STATUS_1               0x4011
+#define WM831X_INTERRUPT_STATUS_2               0x4012
+#define WM831X_INTERRUPT_STATUS_3               0x4013
+#define WM831X_INTERRUPT_STATUS_4               0x4014
+#define WM831X_INTERRUPT_STATUS_5               0x4015
+#define WM831X_IRQ_CONFIG                       0x4017
+#define WM831X_SYSTEM_INTERRUPTS_MASK           0x4018
+#define WM831X_INTERRUPT_STATUS_1_MASK          0x4019
+#define WM831X_INTERRUPT_STATUS_2_MASK          0x401A
+#define WM831X_INTERRUPT_STATUS_3_MASK          0x401B
+#define WM831X_INTERRUPT_STATUS_4_MASK          0x401C
+#define WM831X_INTERRUPT_STATUS_5_MASK          0x401D
+#define WM831X_RTC_WRITE_COUNTER                0x4020
+#define WM831X_RTC_TIME_1                       0x4021
+#define WM831X_RTC_TIME_2                       0x4022
+#define WM831X_RTC_ALARM_1                      0x4023
+#define WM831X_RTC_ALARM_2                      0x4024
+#define WM831X_RTC_CONTROL                      0x4025
+#define WM831X_RTC_TRIM                         0x4026
+#define WM831X_TOUCH_CONTROL_1                  0x4028
+#define WM831X_TOUCH_CONTROL_2                  0x4029
+#define WM831X_TOUCH_DATA_X                     0x402A
+#define WM831X_TOUCH_DATA_Y                     0x402B
+#define WM831X_TOUCH_DATA_Z                     0x402C
+#define WM831X_AUXADC_DATA                      0x402D
+#define WM831X_AUXADC_CONTROL                   0x402E
+#define WM831X_AUXADC_SOURCE                    0x402F
+#define WM831X_COMPARATOR_CONTROL               0x4030
+#define WM831X_COMPARATOR_1                     0x4031
+#define WM831X_COMPARATOR_2                     0x4032
+#define WM831X_COMPARATOR_3                     0x4033
+#define WM831X_COMPARATOR_4                     0x4034
+#define WM831X_GPIO1_CONTROL                    0x4038
+#define WM831X_GPIO2_CONTROL                    0x4039
+#define WM831X_GPIO3_CONTROL                    0x403A
+#define WM831X_GPIO4_CONTROL                    0x403B
+#define WM831X_GPIO5_CONTROL                    0x403C
+#define WM831X_GPIO6_CONTROL                    0x403D
+#define WM831X_GPIO7_CONTROL                    0x403E
+#define WM831X_GPIO8_CONTROL                    0x403F
+#define WM831X_GPIO9_CONTROL                    0x4040
+#define WM831X_GPIO10_CONTROL                   0x4041
+#define WM831X_GPIO11_CONTROL                   0x4042
+#define WM831X_GPIO12_CONTROL                   0x4043
+#define WM831X_GPIO13_CONTROL                   0x4044
+#define WM831X_GPIO14_CONTROL                   0x4045
+#define WM831X_GPIO15_CONTROL                   0x4046
+#define WM831X_GPIO16_CONTROL                   0x4047
+#define WM831X_CHARGER_CONTROL_1                0x4048
+#define WM831X_CHARGER_CONTROL_2                0x4049
+#define WM831X_CHARGER_STATUS                   0x404A
+#define WM831X_BACKUP_CHARGER_CONTROL           0x404B
+#define WM831X_STATUS_LED_1                     0x404C
+#define WM831X_STATUS_LED_2                     0x404D
+#define WM831X_CURRENT_SINK_1                   0x404E
+#define WM831X_CURRENT_SINK_2                   0x404F
+#define WM831X_DCDC_ENABLE                      0x4050
+#define WM831X_LDO_ENABLE                       0x4051
+#define WM831X_DCDC_STATUS                      0x4052
+#define WM831X_LDO_STATUS                       0x4053
+#define WM831X_DCDC_UV_STATUS                   0x4054
+#define WM831X_LDO_UV_STATUS                    0x4055
+#define WM831X_DC1_CONTROL_1                    0x4056
+#define WM831X_DC1_CONTROL_2                    0x4057
+#define WM831X_DC1_ON_CONFIG                    0x4058
+#define WM831X_DC1_SLEEP_CONTROL                0x4059
+#define WM831X_DC1_DVS_CONTROL                  0x405A
+#define WM831X_DC2_CONTROL_1                    0x405B
+#define WM831X_DC2_CONTROL_2                    0x405C
+#define WM831X_DC2_ON_CONFIG                    0x405D
+#define WM831X_DC2_SLEEP_CONTROL                0x405E
+#define WM831X_DC2_DVS_CONTROL                  0x405F
+#define WM831X_DC3_CONTROL_1                    0x4060
+#define WM831X_DC3_CONTROL_2                    0x4061
+#define WM831X_DC3_ON_CONFIG                    0x4062
+#define WM831X_DC3_SLEEP_CONTROL                0x4063
+#define WM831X_DC4_CONTROL                      0x4064
+#define WM831X_DC4_SLEEP_CONTROL                0x4065
+#define WM831X_EPE1_CONTROL                     0x4066
+#define WM831X_EPE2_CONTROL                     0x4067
+#define WM831X_LDO1_CONTROL                     0x4068
+#define WM831X_LDO1_ON_CONTROL                  0x4069
+#define WM831X_LDO1_SLEEP_CONTROL               0x406A
+#define WM831X_LDO2_CONTROL                     0x406B
+#define WM831X_LDO2_ON_CONTROL                  0x406C
+#define WM831X_LDO2_SLEEP_CONTROL               0x406D
+#define WM831X_LDO3_CONTROL                     0x406E
+#define WM831X_LDO3_ON_CONTROL                  0x406F
+#define WM831X_LDO3_SLEEP_CONTROL               0x4070
+#define WM831X_LDO4_CONTROL                     0x4071
+#define WM831X_LDO4_ON_CONTROL                  0x4072
+#define WM831X_LDO4_SLEEP_CONTROL               0x4073
+#define WM831X_LDO5_CONTROL                     0x4074
+#define WM831X_LDO5_ON_CONTROL                  0x4075
+#define WM831X_LDO5_SLEEP_CONTROL               0x4076
+#define WM831X_LDO6_CONTROL                     0x4077
+#define WM831X_LDO6_ON_CONTROL                  0x4078
+#define WM831X_LDO6_SLEEP_CONTROL               0x4079
+#define WM831X_LDO7_CONTROL                     0x407A
+#define WM831X_LDO7_ON_CONTROL                  0x407B
+#define WM831X_LDO7_SLEEP_CONTROL               0x407C
+#define WM831X_LDO8_CONTROL                     0x407D
+#define WM831X_LDO8_ON_CONTROL                  0x407E
+#define WM831X_LDO8_SLEEP_CONTROL               0x407F
+#define WM831X_LDO9_CONTROL                     0x4080
+#define WM831X_LDO9_ON_CONTROL                  0x4081
+#define WM831X_LDO9_SLEEP_CONTROL               0x4082
+#define WM831X_LDO10_CONTROL                    0x4083
+#define WM831X_LDO10_ON_CONTROL                 0x4084
+#define WM831X_LDO10_SLEEP_CONTROL              0x4085
+#define WM831X_LDO11_ON_CONTROL                 0x4087
+#define WM831X_LDO11_SLEEP_CONTROL              0x4088
+#define WM831X_POWER_GOOD_SOURCE_1              0x408E
+#define WM831X_POWER_GOOD_SOURCE_2              0x408F
+#define WM831X_CLOCK_CONTROL_1                  0x4090
+#define WM831X_CLOCK_CONTROL_2                  0x4091
+#define WM831X_FLL_CONTROL_1                    0x4092
+#define WM831X_FLL_CONTROL_2                    0x4093
+#define WM831X_FLL_CONTROL_3                    0x4094
+#define WM831X_FLL_CONTROL_4                    0x4095
+#define WM831X_FLL_CONTROL_5                    0x4096
+#define WM831X_UNIQUE_ID_1                      0x7800
+#define WM831X_UNIQUE_ID_2                      0x7801
+#define WM831X_UNIQUE_ID_3                      0x7802
+#define WM831X_UNIQUE_ID_4                      0x7803
+#define WM831X_UNIQUE_ID_5                      0x7804
+#define WM831X_UNIQUE_ID_6                      0x7805
+#define WM831X_UNIQUE_ID_7                      0x7806
+#define WM831X_UNIQUE_ID_8                      0x7807
+#define WM831X_FACTORY_OTP_ID                   0x7808
+#define WM831X_FACTORY_OTP_1                    0x7809
+#define WM831X_FACTORY_OTP_2                    0x780A
+#define WM831X_FACTORY_OTP_3                    0x780B
+#define WM831X_FACTORY_OTP_4                    0x780C
+#define WM831X_FACTORY_OTP_5                    0x780D
+#define WM831X_CUSTOMER_OTP_ID                  0x7810
+#define WM831X_DC1_OTP_CONTROL                  0x7811
+#define WM831X_DC2_OTP_CONTROL                  0x7812
+#define WM831X_DC3_OTP_CONTROL                  0x7813
+#define WM831X_LDO1_2_OTP_CONTROL               0x7814
+#define WM831X_LDO3_4_OTP_CONTROL               0x7815
+#define WM831X_LDO5_6_OTP_CONTROL               0x7816
+#define WM831X_LDO7_8_OTP_CONTROL               0x7817
+#define WM831X_LDO9_10_OTP_CONTROL              0x7818
+#define WM831X_LDO11_EPE_CONTROL                0x7819
+#define WM831X_GPIO1_OTP_CONTROL                0x781A
+#define WM831X_GPIO2_OTP_CONTROL                0x781B
+#define WM831X_GPIO3_OTP_CONTROL                0x781C
+#define WM831X_GPIO4_OTP_CONTROL                0x781D
+#define WM831X_GPIO5_OTP_CONTROL                0x781E
+#define WM831X_GPIO6_OTP_CONTROL                0x781F
+#define WM831X_DBE_CHECK_DATA                   0x7827
+
+/*
+ * R0 (0x00) - Reset ID
+ */
+#define WM831X_CHIP_ID_MASK                     0xFFFF  /* CHIP_ID - [15:0] */
+#define WM831X_CHIP_ID_SHIFT                         0  /* CHIP_ID - [15:0] */
+#define WM831X_CHIP_ID_WIDTH                        16  /* CHIP_ID - [15:0] */
+
+/*
+ * R1 (0x01) - Revision
+ */
+#define WM831X_PARENT_REV_MASK                  0xFF00  /* PARENT_REV - [15:8] */
+#define WM831X_PARENT_REV_SHIFT                      8  /* PARENT_REV - [15:8] */
+#define WM831X_PARENT_REV_WIDTH                      8  /* PARENT_REV - [15:8] */
+#define WM831X_CHILD_REV_MASK                   0x00FF  /* CHILD_REV - [7:0] */
+#define WM831X_CHILD_REV_SHIFT                       0  /* CHILD_REV - [7:0] */
+#define WM831X_CHILD_REV_WIDTH                       8  /* CHILD_REV - [7:0] */
+
+/*
+ * R16384 (0x4000) - Parent ID
+ */
+#define WM831X_PARENT_ID_MASK                   0xFFFF  /* PARENT_ID - [15:0] */
+#define WM831X_PARENT_ID_SHIFT                       0  /* PARENT_ID - [15:0] */
+#define WM831X_PARENT_ID_WIDTH                      16  /* PARENT_ID - [15:0] */
+
+struct wm831x {
+	struct mutex io_lock;
+
+	struct device *dev;
+	int (*read_dev)(struct wm831x *wm831x, unsigned short reg,
+			int bytes, void *dest);
+	int (*write_dev)(struct wm831x *wm831x, unsigned short reg,
+			 int bytes, void *src);
+
+	void *control_data;
+
+	/* The WM831x has a security key blocking access to certain
+	 * registers.  The mutex is taken by the accessors for locking
+	 * and unlocking the security key, locked is used to fail
+	 * writes if the lock is held.
+	 */
+	struct mutex key_lock;
+	unsigned int locked:1;
+};
+
+/* Device I/O API */
+int wm831x_reg_read(struct wm831x *wm831x, unsigned short reg);
+int wm831x_reg_write(struct wm831x *wm831x, unsigned short reg,
+		 unsigned short val);
+void wm831x_reg_lock(struct wm831x *wm831x);
+int wm831x_reg_unlock(struct wm831x *wm831x);
+int wm831x_set_bits(struct wm831x *wm831x, unsigned short reg,
+		    unsigned short mask, unsigned short val);
+int wm831x_bulk_read(struct wm831x *wm831x, unsigned short reg,
+		     int count, u16 *buf);
+
+#endif
diff --git a/include/linux/mfd/wm831x/pdata.h b/include/linux/mfd/wm831x/pdata.h
new file mode 100644
index 000000000000..571e60136264
--- /dev/null
+++ b/include/linux/mfd/wm831x/pdata.h
@@ -0,0 +1,107 @@
+/*
+ * include/linux/mfd/wm831x/pdata.h -- Platform data for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_PDATA_H__
+#define __MFD_WM831X_PDATA_H__
+
+struct wm831x;
+struct regulator_init_data;
+
+struct wm831x_backup_pdata {
+	int charger_enable;
+	int no_constant_voltage;  /** Disable constant voltage charging */
+	int vlim;   /** Voltage limit in milivolts */
+	int ilim;   /** Current limit in microamps */
+};
+
+struct wm831x_battery_pdata {
+	int enable;         /** Enable charging */
+	int fast_enable;    /** Enable fast charging */
+	int off_mask;       /** Mask OFF while charging */
+	int trickle_ilim;   /** Trickle charge current limit, in mA */
+	int vsel;           /** Target voltage, in mV */
+	int eoc_iterm;      /** End of trickle charge current, in mA */
+	int fast_ilim;      /** Fast charge current limit, in mA */
+	int timeout;        /** Charge cycle timeout, in minutes */
+};
+
+/* Sources for status LED configuration.  Values are register values
+ * plus 1 to allow for a zero default for preserve.
+ */
+enum wm831x_status_src {
+	WM831X_STATUS_PRESERVE = 0,  /* Keep the current hardware setting */
+	WM831X_STATUS_OTP = 1,
+	WM831X_STATUS_POWER = 2,
+	WM831X_STATUS_CHARGER = 3,
+	WM831X_STATUS_MANUAL = 4,
+};
+
+struct wm831x_status_pdata {
+	enum wm831x_status_src default_src;
+	const char *name;
+	const char *default_trigger;
+};
+
+struct wm831x_touch_pdata {
+	int fivewire;          /** 1 for five wire mode, 0 for 4 wire */
+	int isel;              /** Current for pen down (uA) */
+	int rpu;               /** Pen down sensitivity resistor divider */
+	int pressure;          /** Report pressure (boolean) */
+	int data_irq;          /** Touch data ready IRQ */
+};
+
+enum wm831x_watchdog_action {
+	WM831X_WDOG_NONE = 0,
+	WM831X_WDOG_INTERRUPT = 1,
+	WM831X_WDOG_RESET = 2,
+	WM831X_WDOG_WAKE = 3,
+};
+
+struct wm831x_watchdog_pdata {
+	enum wm831x_watchdog_action primary, secondary;
+	int update_gpio;
+	unsigned int software:1;
+};
+
+#define WM831X_MAX_STATUS 2
+#define WM831X_MAX_DCDC   4
+#define WM831X_MAX_EPE    2
+#define WM831X_MAX_LDO    11
+#define WM831X_MAX_ISINK  2
+
+struct wm831x_pdata {
+	/** Called before subdevices are set up */
+	int (*pre_init)(struct wm831x *wm831x);
+	/** Called after subdevices are set up */
+	int (*post_init)(struct wm831x *wm831x);
+
+	int gpio_base;
+	struct wm831x_backup_pdata *backup;
+	struct wm831x_battery_pdata *battery;
+	struct wm831x_touch_pdata *touch;
+	struct wm831x_watchdog_pdata *watchdog;
+
+	/** LED1 = 0 and so on */
+	struct wm831x_status_pdata *status[WM831X_MAX_STATUS];
+	/** DCDC1 = 0 and so on */
+	struct regulator_init_data *dcdc[WM831X_MAX_DCDC];
+	/** EPE1 = 0 and so on */
+	struct regulator_init_data *epe[WM831X_MAX_EPE];
+	/** LDO1 = 0 and so on */
+	struct regulator_init_data *ldo[WM831X_MAX_LDO];
+	/** ISINK1 = 0 and so on*/
+	struct regulator_init_data *isink[WM831X_MAX_ISINK];
+};
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 7d4d0a3e7343e3190afaa17253073db58e3d9bff Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:45:53 +0100
Subject: mfd: Add WM831x interrupt support

The WM831x includes an interrupt controller managing interrupts for
the various functions on the chip. This patch adds support for the
core interrupt block on the device.

Ideally this would be supported by genirq, particularly for the
GPIOs, but currently genirq is unable to cope with controllers on
interrupt driven buses so we cut'n'paste the generic interface.
Once genirq is able to cope chips like this it should be a case
of filing the prefixes off the code and redoing wm831x-irq.c to
move over.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c       |  12 +-
 drivers/mfd/wm831x-irq.c        | 559 +++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/core.h |  21 ++
 include/linux/mfd/wm831x/irq.h  | 764 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1354 insertions(+), 2 deletions(-)
 create mode 100644 drivers/mfd/wm831x-irq.c
 create mode 100644 include/linux/mfd/wm831x/irq.h

(limited to 'include/linux')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index cc1040c9d46c..eb63d22160d1 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -19,6 +19,7 @@
 
 #include <linux/mfd/wm831x/core.h>
 #include <linux/mfd/wm831x/pdata.h>
+#include <linux/mfd/wm831x/irq.h>
 
 enum wm831x_parent {
 	WM8310 = 0,
@@ -1189,6 +1190,10 @@ static int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 		}
 	}
 
+	ret = wm831x_irq_init(wm831x, irq);
+	if (ret != 0)
+		goto err;
+
 	/* The core device is up, instantiate the subdevices. */
 	switch (parent) {
 	case WM8310:
@@ -1216,19 +1221,21 @@ static int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 
 	if (ret != 0) {
 		dev_err(wm831x->dev, "Failed to add children\n");
-		goto err;
+		goto err_irq;
 	}
 
 	if (pdata && pdata->post_init) {
 		ret = pdata->post_init(wm831x);
 		if (ret != 0) {
 			dev_err(wm831x->dev, "post_init() failed: %d\n", ret);
-			goto err;
+			goto err_irq;
 		}
 	}
 
 	return 0;
 
+err_irq:
+	wm831x_irq_exit(wm831x);
 err:
 	mfd_remove_devices(wm831x->dev);
 	kfree(wm831x);
@@ -1238,6 +1245,7 @@ err:
 static void wm831x_device_exit(struct wm831x *wm831x)
 {
 	mfd_remove_devices(wm831x->dev);
+	wm831x_irq_exit(wm831x);
 	kfree(wm831x);
 }
 
diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c
new file mode 100644
index 000000000000..d3015dfb9134
--- /dev/null
+++ b/drivers/mfd/wm831x-irq.c
@@ -0,0 +1,559 @@
+/*
+ * wm831x-irq.c  --  Interrupt controller support for Wolfson WM831x PMICs
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/i2c.h>
+#include <linux/mfd/core.h>
+#include <linux/interrupt.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/pdata.h>
+#include <linux/mfd/wm831x/irq.h>
+
+#include <linux/delay.h>
+
+/*
+ * Since generic IRQs don't currently support interrupt controllers on
+ * interrupt driven buses we don't use genirq but instead provide an
+ * interface that looks very much like the standard ones.  This leads
+ * to some bodges, including storing interrupt handler information in
+ * the static irq_data table we use to look up the data for individual
+ * interrupts, but hopefully won't last too long.
+ */
+
+struct wm831x_irq_data {
+	int primary;
+	int reg;
+	int mask;
+	irq_handler_t handler;
+	void *handler_data;
+};
+
+static struct wm831x_irq_data wm831x_irqs[] = {
+	[WM831X_IRQ_TEMP_THW] = {
+		.primary = WM831X_TEMP_INT,
+		.reg = 1,
+		.mask = WM831X_TEMP_THW_EINT,
+	},
+	[WM831X_IRQ_GPIO_1] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP1_EINT,
+	},
+	[WM831X_IRQ_GPIO_2] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP2_EINT,
+	},
+	[WM831X_IRQ_GPIO_3] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP3_EINT,
+	},
+	[WM831X_IRQ_GPIO_4] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP4_EINT,
+	},
+	[WM831X_IRQ_GPIO_5] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP5_EINT,
+	},
+	[WM831X_IRQ_GPIO_6] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP6_EINT,
+	},
+	[WM831X_IRQ_GPIO_7] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP7_EINT,
+	},
+	[WM831X_IRQ_GPIO_8] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP8_EINT,
+	},
+	[WM831X_IRQ_GPIO_9] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP9_EINT,
+	},
+	[WM831X_IRQ_GPIO_10] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP10_EINT,
+	},
+	[WM831X_IRQ_GPIO_11] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP11_EINT,
+	},
+	[WM831X_IRQ_GPIO_12] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP12_EINT,
+	},
+	[WM831X_IRQ_GPIO_13] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP13_EINT,
+	},
+	[WM831X_IRQ_GPIO_14] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP14_EINT,
+	},
+	[WM831X_IRQ_GPIO_15] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP15_EINT,
+	},
+	[WM831X_IRQ_GPIO_16] = {
+		.primary = WM831X_GP_INT,
+		.reg = 5,
+		.mask = WM831X_GP16_EINT,
+	},
+	[WM831X_IRQ_ON] = {
+		.primary = WM831X_ON_PIN_INT,
+		.reg = 1,
+		.mask = WM831X_ON_PIN_EINT,
+	},
+	[WM831X_IRQ_PPM_SYSLO] = {
+		.primary = WM831X_PPM_INT,
+		.reg = 1,
+		.mask = WM831X_PPM_SYSLO_EINT,
+	},
+	[WM831X_IRQ_PPM_PWR_SRC] = {
+		.primary = WM831X_PPM_INT,
+		.reg = 1,
+		.mask = WM831X_PPM_PWR_SRC_EINT,
+	},
+	[WM831X_IRQ_PPM_USB_CURR] = {
+		.primary = WM831X_PPM_INT,
+		.reg = 1,
+		.mask = WM831X_PPM_USB_CURR_EINT,
+	},
+	[WM831X_IRQ_WDOG_TO] = {
+		.primary = WM831X_WDOG_INT,
+		.reg = 1,
+		.mask = WM831X_WDOG_TO_EINT,
+	},
+	[WM831X_IRQ_RTC_PER] = {
+		.primary = WM831X_RTC_INT,
+		.reg = 1,
+		.mask = WM831X_RTC_PER_EINT,
+	},
+	[WM831X_IRQ_RTC_ALM] = {
+		.primary = WM831X_RTC_INT,
+		.reg = 1,
+		.mask = WM831X_RTC_ALM_EINT,
+	},
+	[WM831X_IRQ_CHG_BATT_HOT] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_BATT_HOT_EINT,
+	},
+	[WM831X_IRQ_CHG_BATT_COLD] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_BATT_COLD_EINT,
+	},
+	[WM831X_IRQ_CHG_BATT_FAIL] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_BATT_FAIL_EINT,
+	},
+	[WM831X_IRQ_CHG_OV] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_OV_EINT,
+	},
+	[WM831X_IRQ_CHG_END] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_END_EINT,
+	},
+	[WM831X_IRQ_CHG_TO] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_TO_EINT,
+	},
+	[WM831X_IRQ_CHG_MODE] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_MODE_EINT,
+	},
+	[WM831X_IRQ_CHG_START] = {
+		.primary = WM831X_CHG_INT,
+		.reg = 2,
+		.mask = WM831X_CHG_START_EINT,
+	},
+	[WM831X_IRQ_TCHDATA] = {
+		.primary = WM831X_TCHDATA_INT,
+		.reg = 1,
+		.mask = WM831X_TCHDATA_EINT,
+	},
+	[WM831X_IRQ_TCHPD] = {
+		.primary = WM831X_TCHPD_INT,
+		.reg = 1,
+		.mask = WM831X_TCHPD_EINT,
+	},
+	[WM831X_IRQ_AUXADC_DATA] = {
+		.primary = WM831X_AUXADC_INT,
+		.reg = 1,
+		.mask = WM831X_AUXADC_DATA_EINT,
+	},
+	[WM831X_IRQ_AUXADC_DCOMP1] = {
+		.primary = WM831X_AUXADC_INT,
+		.reg = 1,
+		.mask = WM831X_AUXADC_DCOMP1_EINT,
+	},
+	[WM831X_IRQ_AUXADC_DCOMP2] = {
+		.primary = WM831X_AUXADC_INT,
+		.reg = 1,
+		.mask = WM831X_AUXADC_DCOMP2_EINT,
+	},
+	[WM831X_IRQ_AUXADC_DCOMP3] = {
+		.primary = WM831X_AUXADC_INT,
+		.reg = 1,
+		.mask = WM831X_AUXADC_DCOMP3_EINT,
+	},
+	[WM831X_IRQ_AUXADC_DCOMP4] = {
+		.primary = WM831X_AUXADC_INT,
+		.reg = 1,
+		.mask = WM831X_AUXADC_DCOMP4_EINT,
+	},
+	[WM831X_IRQ_CS1] = {
+		.primary = WM831X_CS_INT,
+		.reg = 2,
+		.mask = WM831X_CS1_EINT,
+	},
+	[WM831X_IRQ_CS2] = {
+		.primary = WM831X_CS_INT,
+		.reg = 2,
+		.mask = WM831X_CS2_EINT,
+	},
+	[WM831X_IRQ_HC_DC1] = {
+		.primary = WM831X_HC_INT,
+		.reg = 4,
+		.mask = WM831X_HC_DC1_EINT,
+	},
+	[WM831X_IRQ_HC_DC2] = {
+		.primary = WM831X_HC_INT,
+		.reg = 4,
+		.mask = WM831X_HC_DC2_EINT,
+	},
+	[WM831X_IRQ_UV_LDO1] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO1_EINT,
+	},
+	[WM831X_IRQ_UV_LDO2] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO2_EINT,
+	},
+	[WM831X_IRQ_UV_LDO3] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO3_EINT,
+	},
+	[WM831X_IRQ_UV_LDO4] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO4_EINT,
+	},
+	[WM831X_IRQ_UV_LDO5] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO5_EINT,
+	},
+	[WM831X_IRQ_UV_LDO6] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO6_EINT,
+	},
+	[WM831X_IRQ_UV_LDO7] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO7_EINT,
+	},
+	[WM831X_IRQ_UV_LDO8] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO8_EINT,
+	},
+	[WM831X_IRQ_UV_LDO9] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO9_EINT,
+	},
+	[WM831X_IRQ_UV_LDO10] = {
+		.primary = WM831X_UV_INT,
+		.reg = 3,
+		.mask = WM831X_UV_LDO10_EINT,
+	},
+	[WM831X_IRQ_UV_DC1] = {
+		.primary = WM831X_UV_INT,
+		.reg = 4,
+		.mask = WM831X_UV_DC1_EINT,
+	},
+	[WM831X_IRQ_UV_DC2] = {
+		.primary = WM831X_UV_INT,
+		.reg = 4,
+		.mask = WM831X_UV_DC2_EINT,
+	},
+	[WM831X_IRQ_UV_DC3] = {
+		.primary = WM831X_UV_INT,
+		.reg = 4,
+		.mask = WM831X_UV_DC3_EINT,
+	},
+	[WM831X_IRQ_UV_DC4] = {
+		.primary = WM831X_UV_INT,
+		.reg = 4,
+		.mask = WM831X_UV_DC4_EINT,
+	},
+};
+
+static inline int irq_data_to_status_reg(struct wm831x_irq_data *irq_data)
+{
+	return WM831X_INTERRUPT_STATUS_1 - 1 + irq_data->reg;
+}
+
+static inline int irq_data_to_mask_reg(struct wm831x_irq_data *irq_data)
+{
+	return WM831X_INTERRUPT_STATUS_1_MASK - 1 + irq_data->reg;
+}
+
+static void __wm831x_enable_irq(struct wm831x *wm831x, int irq)
+{
+	struct wm831x_irq_data *irq_data = &wm831x_irqs[irq];
+
+	wm831x->irq_masks[irq_data->reg - 1] &= ~irq_data->mask;
+	wm831x_reg_write(wm831x, irq_data_to_mask_reg(irq_data),
+			 wm831x->irq_masks[irq_data->reg - 1]);
+}
+
+void wm831x_enable_irq(struct wm831x *wm831x, int irq)
+{
+	mutex_lock(&wm831x->irq_lock);
+	__wm831x_enable_irq(wm831x, irq);
+	mutex_unlock(&wm831x->irq_lock);
+}
+EXPORT_SYMBOL_GPL(wm831x_enable_irq);
+
+static void __wm831x_disable_irq(struct wm831x *wm831x, int irq)
+{
+	struct wm831x_irq_data *irq_data = &wm831x_irqs[irq];
+
+	wm831x->irq_masks[irq_data->reg - 1] |= irq_data->mask;
+	wm831x_reg_write(wm831x, irq_data_to_mask_reg(irq_data),
+			 wm831x->irq_masks[irq_data->reg - 1]);
+}
+
+void wm831x_disable_irq(struct wm831x *wm831x, int irq)
+{
+	mutex_lock(&wm831x->irq_lock);
+	__wm831x_disable_irq(wm831x, irq);
+	mutex_unlock(&wm831x->irq_lock);
+}
+EXPORT_SYMBOL_GPL(wm831x_disable_irq);
+
+int wm831x_request_irq(struct wm831x *wm831x,
+		       unsigned int irq, irq_handler_t handler,
+		       unsigned long flags, const char *name,
+		       void *dev)
+{
+	int ret = 0;
+
+	if (irq < 0 || irq >= WM831X_NUM_IRQS)
+		return -EINVAL;
+
+	mutex_lock(&wm831x->irq_lock);
+
+	if (wm831x_irqs[irq].handler) {
+		dev_err(wm831x->dev, "Already have handler for IRQ %d\n", irq);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	wm831x_irqs[irq].handler = handler;
+	wm831x_irqs[irq].handler_data = dev;
+
+	__wm831x_enable_irq(wm831x, irq);
+
+out:
+	mutex_unlock(&wm831x->irq_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(wm831x_request_irq);
+
+void wm831x_free_irq(struct wm831x *wm831x, unsigned int irq, void *data)
+{
+	if (irq < 0 || irq >= WM831X_NUM_IRQS)
+		return;
+
+	mutex_lock(&wm831x->irq_lock);
+
+	wm831x_irqs[irq].handler = NULL;
+	wm831x_irqs[irq].handler_data = NULL;
+
+	__wm831x_disable_irq(wm831x, irq);
+
+	mutex_unlock(&wm831x->irq_lock);
+}
+EXPORT_SYMBOL_GPL(wm831x_free_irq);
+
+
+static void wm831x_handle_irq(struct wm831x *wm831x, int irq, int status)
+{
+	struct wm831x_irq_data *irq_data = &wm831x_irqs[irq];
+
+	if (irq_data->handler) {
+		irq_data->handler(irq, irq_data->handler_data);
+		wm831x_reg_write(wm831x, irq_data_to_status_reg(irq_data),
+				 irq_data->mask);
+	} else {
+		dev_err(wm831x->dev, "Unhandled IRQ %d, masking\n", irq);
+		__wm831x_disable_irq(wm831x, irq);
+	}
+}
+
+/* Main interrupt handling occurs in a workqueue since we need
+ * interrupts enabled to interact with the chip. */
+static void wm831x_irq_worker(struct work_struct *work)
+{
+	struct wm831x *wm831x = container_of(work, struct wm831x, irq_work);
+	unsigned int i;
+	int primary;
+	int status_regs[5];
+	int read[5] = { 0 };
+	int *status;
+
+	primary = wm831x_reg_read(wm831x, WM831X_SYSTEM_INTERRUPTS);
+	if (primary < 0) {
+		dev_err(wm831x->dev, "Failed to read system interrupt: %d\n",
+			primary);
+		goto out;
+	}
+
+	mutex_lock(&wm831x->irq_lock);
+
+	for (i = 0; i < ARRAY_SIZE(wm831x_irqs); i++) {
+		int offset = wm831x_irqs[i].reg - 1;
+
+		if (!(primary & wm831x_irqs[i].primary))
+			continue;
+
+		status = &status_regs[offset];
+
+		/* Hopefully there should only be one register to read
+		 * each time otherwise we ought to do a block read. */
+		if (!read[offset]) {
+			*status = wm831x_reg_read(wm831x,
+				     irq_data_to_status_reg(&wm831x_irqs[i]));
+			if (*status < 0) {
+				dev_err(wm831x->dev,
+					"Failed to read IRQ status: %d\n",
+					*status);
+				goto out_lock;
+			}
+
+			/* Mask out the disabled IRQs */
+			*status &= ~wm831x->irq_masks[offset];
+			read[offset] = 1;
+		}
+
+		if (*status & wm831x_irqs[i].mask)
+			wm831x_handle_irq(wm831x, i, *status);
+	}
+
+out_lock:
+	mutex_unlock(&wm831x->irq_lock);
+out:
+	enable_irq(wm831x->irq);
+}
+
+
+static irqreturn_t wm831x_cpu_irq(int irq, void *data)
+{
+	struct wm831x *wm831x = data;
+
+	/* Shut the interrupt to the CPU up and schedule the actual
+	 * handler; we can't check that the IRQ is asserted. */
+	disable_irq_nosync(irq);
+
+	queue_work(wm831x->irq_wq, &wm831x->irq_work);
+
+	return IRQ_HANDLED;
+}
+
+int wm831x_irq_init(struct wm831x *wm831x, int irq)
+{
+	int i, ret;
+
+	if (!irq) {
+		dev_warn(wm831x->dev,
+			 "No interrupt specified - functionality limited\n");
+		return 0;
+	}
+
+
+	wm831x->irq_wq = create_singlethread_workqueue("wm831x-irq");
+	if (!wm831x->irq_wq) {
+		dev_err(wm831x->dev, "Failed to allocate IRQ worker\n");
+		return -ESRCH;
+	}
+
+	wm831x->irq = irq;
+	mutex_init(&wm831x->irq_lock);
+	INIT_WORK(&wm831x->irq_work, wm831x_irq_worker);
+
+	/* Mask the individual interrupt sources */
+	for (i = 0; i < ARRAY_SIZE(wm831x->irq_masks); i++) {
+		wm831x->irq_masks[i] = 0xffff;
+		wm831x_reg_write(wm831x, WM831X_INTERRUPT_STATUS_1_MASK + i,
+				 0xffff);
+	}
+
+	/* Enable top level interrupts, we mask at secondary level */
+	wm831x_reg_write(wm831x, WM831X_SYSTEM_INTERRUPTS_MASK, 0);
+
+	/* We're good to go.  We set IRQF_SHARED since there's a
+	 * chance the driver will interoperate with another driver but
+	 * the need to disable the IRQ while handing via I2C/SPI means
+	 * that this may break and performance will be impacted.  If
+	 * this does happen it's a hardware design issue and the only
+	 * other alternative would be polling.
+	 */
+	ret = request_irq(irq, wm831x_cpu_irq, IRQF_TRIGGER_LOW | IRQF_SHARED,
+			  "wm831x", wm831x);
+	if (ret != 0) {
+		dev_err(wm831x->dev, "Failed to request IRQ %d: %d\n",
+			irq, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+void wm831x_irq_exit(struct wm831x *wm831x)
+{
+	if (wm831x->irq)
+		free_irq(wm831x->irq, wm831x);
+}
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
index d90e693053ba..b96c9355b16e 100644
--- a/include/linux/mfd/wm831x/core.h
+++ b/include/linux/mfd/wm831x/core.h
@@ -15,6 +15,9 @@
 #ifndef __MFD_WM831X_CORE_H__
 #define __MFD_WM831X_CORE_H__
 
+#include <linux/interrupt.h>
+#include <linux/workqueue.h>
+
 /*
  * Register values.
  */
@@ -224,6 +227,13 @@ struct wm831x {
 
 	void *control_data;
 
+	int irq;  /* Our chip IRQ */
+	struct mutex irq_lock;
+	struct workqueue_struct *irq_wq;
+	struct work_struct irq_work;
+	unsigned int irq_base;
+	int irq_masks[5];
+
 	/* The WM831x has a security key blocking access to certain
 	 * registers.  The mutex is taken by the accessors for locking
 	 * and unlocking the security key, locked is used to fail
@@ -244,4 +254,15 @@ int wm831x_set_bits(struct wm831x *wm831x, unsigned short reg,
 int wm831x_bulk_read(struct wm831x *wm831x, unsigned short reg,
 		     int count, u16 *buf);
 
+int wm831x_irq_init(struct wm831x *wm831x, int irq);
+void wm831x_irq_exit(struct wm831x *wm831x);
+
+int __must_check wm831x_request_irq(struct wm831x *wm831x,
+				    unsigned int irq, irq_handler_t handler,
+				    unsigned long flags, const char *name,
+				    void *dev);
+void wm831x_free_irq(struct wm831x *wm831x, unsigned int, void *);
+void wm831x_disable_irq(struct wm831x *wm831x, int irq);
+void wm831x_enable_irq(struct wm831x *wm831x, int irq);
+
 #endif
diff --git a/include/linux/mfd/wm831x/irq.h b/include/linux/mfd/wm831x/irq.h
new file mode 100644
index 000000000000..3a8c97656fda
--- /dev/null
+++ b/include/linux/mfd/wm831x/irq.h
@@ -0,0 +1,764 @@
+/*
+ * include/linux/mfd/wm831x/irq.h -- Interrupt controller for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_IRQ_H__
+#define __MFD_WM831X_IRQ_H__
+
+/* Interrupt number assignments within Linux */
+#define WM831X_IRQ_TEMP_THW 0
+#define WM831X_IRQ_GPIO_1   1
+#define WM831X_IRQ_GPIO_2   2
+#define WM831X_IRQ_GPIO_3   3
+#define WM831X_IRQ_GPIO_4   4
+#define WM831X_IRQ_GPIO_5   5
+#define WM831X_IRQ_GPIO_6   6
+#define WM831X_IRQ_GPIO_7   7
+#define WM831X_IRQ_GPIO_8   8
+#define WM831X_IRQ_GPIO_9   9
+#define WM831X_IRQ_GPIO_10  10
+#define WM831X_IRQ_GPIO_11  11
+#define WM831X_IRQ_GPIO_12  12
+#define WM831X_IRQ_GPIO_13  13
+#define WM831X_IRQ_GPIO_14  14
+#define WM831X_IRQ_GPIO_15  15
+#define WM831X_IRQ_GPIO_16  16
+#define WM831X_IRQ_ON           17
+#define WM831X_IRQ_PPM_SYSLO    18
+#define WM831X_IRQ_PPM_PWR_SRC  19
+#define WM831X_IRQ_PPM_USB_CURR 20
+#define WM831X_IRQ_WDOG_TO      21
+#define WM831X_IRQ_RTC_PER      22
+#define WM831X_IRQ_RTC_ALM      23
+#define WM831X_IRQ_CHG_BATT_HOT  24
+#define WM831X_IRQ_CHG_BATT_COLD 25
+#define WM831X_IRQ_CHG_BATT_FAIL 26
+#define WM831X_IRQ_CHG_OV        27
+#define WM831X_IRQ_CHG_END       29
+#define WM831X_IRQ_CHG_TO        30
+#define WM831X_IRQ_CHG_MODE      31
+#define WM831X_IRQ_CHG_START     32
+#define WM831X_IRQ_TCHDATA       33
+#define WM831X_IRQ_TCHPD         34
+#define WM831X_IRQ_AUXADC_DATA   35
+#define WM831X_IRQ_AUXADC_DCOMP1 36
+#define WM831X_IRQ_AUXADC_DCOMP2 37
+#define WM831X_IRQ_AUXADC_DCOMP3 38
+#define WM831X_IRQ_AUXADC_DCOMP4 39
+#define WM831X_IRQ_CS1           40
+#define WM831X_IRQ_CS2           41
+#define WM831X_IRQ_HC_DC1        42
+#define WM831X_IRQ_HC_DC2        43
+#define WM831X_IRQ_UV_LDO1       44
+#define WM831X_IRQ_UV_LDO2       45
+#define WM831X_IRQ_UV_LDO3       46
+#define WM831X_IRQ_UV_LDO4       47
+#define WM831X_IRQ_UV_LDO5       48
+#define WM831X_IRQ_UV_LDO6       49
+#define WM831X_IRQ_UV_LDO7       50
+#define WM831X_IRQ_UV_LDO8       51
+#define WM831X_IRQ_UV_LDO9       52
+#define WM831X_IRQ_UV_LDO10      53
+#define WM831X_IRQ_UV_DC1        54
+#define WM831X_IRQ_UV_DC2        55
+#define WM831X_IRQ_UV_DC3        56
+#define WM831X_IRQ_UV_DC4        57
+
+#define WM831X_NUM_IRQS     58
+
+/*
+ * R16400 (0x4010) - System Interrupts
+ */
+#define WM831X_PS_INT                           0x8000  /* PS_INT */
+#define WM831X_PS_INT_MASK                      0x8000  /* PS_INT */
+#define WM831X_PS_INT_SHIFT                         15  /* PS_INT */
+#define WM831X_PS_INT_WIDTH                          1  /* PS_INT */
+#define WM831X_TEMP_INT                         0x4000  /* TEMP_INT */
+#define WM831X_TEMP_INT_MASK                    0x4000  /* TEMP_INT */
+#define WM831X_TEMP_INT_SHIFT                       14  /* TEMP_INT */
+#define WM831X_TEMP_INT_WIDTH                        1  /* TEMP_INT */
+#define WM831X_GP_INT                           0x2000  /* GP_INT */
+#define WM831X_GP_INT_MASK                      0x2000  /* GP_INT */
+#define WM831X_GP_INT_SHIFT                         13  /* GP_INT */
+#define WM831X_GP_INT_WIDTH                          1  /* GP_INT */
+#define WM831X_ON_PIN_INT                       0x1000  /* ON_PIN_INT */
+#define WM831X_ON_PIN_INT_MASK                  0x1000  /* ON_PIN_INT */
+#define WM831X_ON_PIN_INT_SHIFT                     12  /* ON_PIN_INT */
+#define WM831X_ON_PIN_INT_WIDTH                      1  /* ON_PIN_INT */
+#define WM831X_WDOG_INT                         0x0800  /* WDOG_INT */
+#define WM831X_WDOG_INT_MASK                    0x0800  /* WDOG_INT */
+#define WM831X_WDOG_INT_SHIFT                       11  /* WDOG_INT */
+#define WM831X_WDOG_INT_WIDTH                        1  /* WDOG_INT */
+#define WM831X_TCHDATA_INT                      0x0400  /* TCHDATA_INT */
+#define WM831X_TCHDATA_INT_MASK                 0x0400  /* TCHDATA_INT */
+#define WM831X_TCHDATA_INT_SHIFT                    10  /* TCHDATA_INT */
+#define WM831X_TCHDATA_INT_WIDTH                     1  /* TCHDATA_INT */
+#define WM831X_TCHPD_INT                        0x0200  /* TCHPD_INT */
+#define WM831X_TCHPD_INT_MASK                   0x0200  /* TCHPD_INT */
+#define WM831X_TCHPD_INT_SHIFT                       9  /* TCHPD_INT */
+#define WM831X_TCHPD_INT_WIDTH                       1  /* TCHPD_INT */
+#define WM831X_AUXADC_INT                       0x0100  /* AUXADC_INT */
+#define WM831X_AUXADC_INT_MASK                  0x0100  /* AUXADC_INT */
+#define WM831X_AUXADC_INT_SHIFT                      8  /* AUXADC_INT */
+#define WM831X_AUXADC_INT_WIDTH                      1  /* AUXADC_INT */
+#define WM831X_PPM_INT                          0x0080  /* PPM_INT */
+#define WM831X_PPM_INT_MASK                     0x0080  /* PPM_INT */
+#define WM831X_PPM_INT_SHIFT                         7  /* PPM_INT */
+#define WM831X_PPM_INT_WIDTH                         1  /* PPM_INT */
+#define WM831X_CS_INT                           0x0040  /* CS_INT */
+#define WM831X_CS_INT_MASK                      0x0040  /* CS_INT */
+#define WM831X_CS_INT_SHIFT                          6  /* CS_INT */
+#define WM831X_CS_INT_WIDTH                          1  /* CS_INT */
+#define WM831X_RTC_INT                          0x0020  /* RTC_INT */
+#define WM831X_RTC_INT_MASK                     0x0020  /* RTC_INT */
+#define WM831X_RTC_INT_SHIFT                         5  /* RTC_INT */
+#define WM831X_RTC_INT_WIDTH                         1  /* RTC_INT */
+#define WM831X_OTP_INT                          0x0010  /* OTP_INT */
+#define WM831X_OTP_INT_MASK                     0x0010  /* OTP_INT */
+#define WM831X_OTP_INT_SHIFT                         4  /* OTP_INT */
+#define WM831X_OTP_INT_WIDTH                         1  /* OTP_INT */
+#define WM831X_CHILD_INT                        0x0008  /* CHILD_INT */
+#define WM831X_CHILD_INT_MASK                   0x0008  /* CHILD_INT */
+#define WM831X_CHILD_INT_SHIFT                       3  /* CHILD_INT */
+#define WM831X_CHILD_INT_WIDTH                       1  /* CHILD_INT */
+#define WM831X_CHG_INT                          0x0004  /* CHG_INT */
+#define WM831X_CHG_INT_MASK                     0x0004  /* CHG_INT */
+#define WM831X_CHG_INT_SHIFT                         2  /* CHG_INT */
+#define WM831X_CHG_INT_WIDTH                         1  /* CHG_INT */
+#define WM831X_HC_INT                           0x0002  /* HC_INT */
+#define WM831X_HC_INT_MASK                      0x0002  /* HC_INT */
+#define WM831X_HC_INT_SHIFT                          1  /* HC_INT */
+#define WM831X_HC_INT_WIDTH                          1  /* HC_INT */
+#define WM831X_UV_INT                           0x0001  /* UV_INT */
+#define WM831X_UV_INT_MASK                      0x0001  /* UV_INT */
+#define WM831X_UV_INT_SHIFT                          0  /* UV_INT */
+#define WM831X_UV_INT_WIDTH                          1  /* UV_INT */
+
+/*
+ * R16401 (0x4011) - Interrupt Status 1
+ */
+#define WM831X_PPM_SYSLO_EINT                   0x8000  /* PPM_SYSLO_EINT */
+#define WM831X_PPM_SYSLO_EINT_MASK              0x8000  /* PPM_SYSLO_EINT */
+#define WM831X_PPM_SYSLO_EINT_SHIFT                 15  /* PPM_SYSLO_EINT */
+#define WM831X_PPM_SYSLO_EINT_WIDTH                  1  /* PPM_SYSLO_EINT */
+#define WM831X_PPM_PWR_SRC_EINT                 0x4000  /* PPM_PWR_SRC_EINT */
+#define WM831X_PPM_PWR_SRC_EINT_MASK            0x4000  /* PPM_PWR_SRC_EINT */
+#define WM831X_PPM_PWR_SRC_EINT_SHIFT               14  /* PPM_PWR_SRC_EINT */
+#define WM831X_PPM_PWR_SRC_EINT_WIDTH                1  /* PPM_PWR_SRC_EINT */
+#define WM831X_PPM_USB_CURR_EINT                0x2000  /* PPM_USB_CURR_EINT */
+#define WM831X_PPM_USB_CURR_EINT_MASK           0x2000  /* PPM_USB_CURR_EINT */
+#define WM831X_PPM_USB_CURR_EINT_SHIFT              13  /* PPM_USB_CURR_EINT */
+#define WM831X_PPM_USB_CURR_EINT_WIDTH               1  /* PPM_USB_CURR_EINT */
+#define WM831X_ON_PIN_EINT                      0x1000  /* ON_PIN_EINT */
+#define WM831X_ON_PIN_EINT_MASK                 0x1000  /* ON_PIN_EINT */
+#define WM831X_ON_PIN_EINT_SHIFT                    12  /* ON_PIN_EINT */
+#define WM831X_ON_PIN_EINT_WIDTH                     1  /* ON_PIN_EINT */
+#define WM831X_WDOG_TO_EINT                     0x0800  /* WDOG_TO_EINT */
+#define WM831X_WDOG_TO_EINT_MASK                0x0800  /* WDOG_TO_EINT */
+#define WM831X_WDOG_TO_EINT_SHIFT                   11  /* WDOG_TO_EINT */
+#define WM831X_WDOG_TO_EINT_WIDTH                    1  /* WDOG_TO_EINT */
+#define WM831X_TCHDATA_EINT                     0x0400  /* TCHDATA_EINT */
+#define WM831X_TCHDATA_EINT_MASK                0x0400  /* TCHDATA_EINT */
+#define WM831X_TCHDATA_EINT_SHIFT                   10  /* TCHDATA_EINT */
+#define WM831X_TCHDATA_EINT_WIDTH                    1  /* TCHDATA_EINT */
+#define WM831X_TCHPD_EINT                       0x0200  /* TCHPD_EINT */
+#define WM831X_TCHPD_EINT_MASK                  0x0200  /* TCHPD_EINT */
+#define WM831X_TCHPD_EINT_SHIFT                      9  /* TCHPD_EINT */
+#define WM831X_TCHPD_EINT_WIDTH                      1  /* TCHPD_EINT */
+#define WM831X_AUXADC_DATA_EINT                 0x0100  /* AUXADC_DATA_EINT */
+#define WM831X_AUXADC_DATA_EINT_MASK            0x0100  /* AUXADC_DATA_EINT */
+#define WM831X_AUXADC_DATA_EINT_SHIFT                8  /* AUXADC_DATA_EINT */
+#define WM831X_AUXADC_DATA_EINT_WIDTH                1  /* AUXADC_DATA_EINT */
+#define WM831X_AUXADC_DCOMP4_EINT               0x0080  /* AUXADC_DCOMP4_EINT */
+#define WM831X_AUXADC_DCOMP4_EINT_MASK          0x0080  /* AUXADC_DCOMP4_EINT */
+#define WM831X_AUXADC_DCOMP4_EINT_SHIFT              7  /* AUXADC_DCOMP4_EINT */
+#define WM831X_AUXADC_DCOMP4_EINT_WIDTH              1  /* AUXADC_DCOMP4_EINT */
+#define WM831X_AUXADC_DCOMP3_EINT               0x0040  /* AUXADC_DCOMP3_EINT */
+#define WM831X_AUXADC_DCOMP3_EINT_MASK          0x0040  /* AUXADC_DCOMP3_EINT */
+#define WM831X_AUXADC_DCOMP3_EINT_SHIFT              6  /* AUXADC_DCOMP3_EINT */
+#define WM831X_AUXADC_DCOMP3_EINT_WIDTH              1  /* AUXADC_DCOMP3_EINT */
+#define WM831X_AUXADC_DCOMP2_EINT               0x0020  /* AUXADC_DCOMP2_EINT */
+#define WM831X_AUXADC_DCOMP2_EINT_MASK          0x0020  /* AUXADC_DCOMP2_EINT */
+#define WM831X_AUXADC_DCOMP2_EINT_SHIFT              5  /* AUXADC_DCOMP2_EINT */
+#define WM831X_AUXADC_DCOMP2_EINT_WIDTH              1  /* AUXADC_DCOMP2_EINT */
+#define WM831X_AUXADC_DCOMP1_EINT               0x0010  /* AUXADC_DCOMP1_EINT */
+#define WM831X_AUXADC_DCOMP1_EINT_MASK          0x0010  /* AUXADC_DCOMP1_EINT */
+#define WM831X_AUXADC_DCOMP1_EINT_SHIFT              4  /* AUXADC_DCOMP1_EINT */
+#define WM831X_AUXADC_DCOMP1_EINT_WIDTH              1  /* AUXADC_DCOMP1_EINT */
+#define WM831X_RTC_PER_EINT                     0x0008  /* RTC_PER_EINT */
+#define WM831X_RTC_PER_EINT_MASK                0x0008  /* RTC_PER_EINT */
+#define WM831X_RTC_PER_EINT_SHIFT                    3  /* RTC_PER_EINT */
+#define WM831X_RTC_PER_EINT_WIDTH                    1  /* RTC_PER_EINT */
+#define WM831X_RTC_ALM_EINT                     0x0004  /* RTC_ALM_EINT */
+#define WM831X_RTC_ALM_EINT_MASK                0x0004  /* RTC_ALM_EINT */
+#define WM831X_RTC_ALM_EINT_SHIFT                    2  /* RTC_ALM_EINT */
+#define WM831X_RTC_ALM_EINT_WIDTH                    1  /* RTC_ALM_EINT */
+#define WM831X_TEMP_THW_EINT                    0x0002  /* TEMP_THW_EINT */
+#define WM831X_TEMP_THW_EINT_MASK               0x0002  /* TEMP_THW_EINT */
+#define WM831X_TEMP_THW_EINT_SHIFT                   1  /* TEMP_THW_EINT */
+#define WM831X_TEMP_THW_EINT_WIDTH                   1  /* TEMP_THW_EINT */
+
+/*
+ * R16402 (0x4012) - Interrupt Status 2
+ */
+#define WM831X_CHG_BATT_HOT_EINT                0x8000  /* CHG_BATT_HOT_EINT */
+#define WM831X_CHG_BATT_HOT_EINT_MASK           0x8000  /* CHG_BATT_HOT_EINT */
+#define WM831X_CHG_BATT_HOT_EINT_SHIFT              15  /* CHG_BATT_HOT_EINT */
+#define WM831X_CHG_BATT_HOT_EINT_WIDTH               1  /* CHG_BATT_HOT_EINT */
+#define WM831X_CHG_BATT_COLD_EINT               0x4000  /* CHG_BATT_COLD_EINT */
+#define WM831X_CHG_BATT_COLD_EINT_MASK          0x4000  /* CHG_BATT_COLD_EINT */
+#define WM831X_CHG_BATT_COLD_EINT_SHIFT             14  /* CHG_BATT_COLD_EINT */
+#define WM831X_CHG_BATT_COLD_EINT_WIDTH              1  /* CHG_BATT_COLD_EINT */
+#define WM831X_CHG_BATT_FAIL_EINT               0x2000  /* CHG_BATT_FAIL_EINT */
+#define WM831X_CHG_BATT_FAIL_EINT_MASK          0x2000  /* CHG_BATT_FAIL_EINT */
+#define WM831X_CHG_BATT_FAIL_EINT_SHIFT             13  /* CHG_BATT_FAIL_EINT */
+#define WM831X_CHG_BATT_FAIL_EINT_WIDTH              1  /* CHG_BATT_FAIL_EINT */
+#define WM831X_CHG_OV_EINT                      0x1000  /* CHG_OV_EINT */
+#define WM831X_CHG_OV_EINT_MASK                 0x1000  /* CHG_OV_EINT */
+#define WM831X_CHG_OV_EINT_SHIFT                    12  /* CHG_OV_EINT */
+#define WM831X_CHG_OV_EINT_WIDTH                     1  /* CHG_OV_EINT */
+#define WM831X_CHG_END_EINT                     0x0800  /* CHG_END_EINT */
+#define WM831X_CHG_END_EINT_MASK                0x0800  /* CHG_END_EINT */
+#define WM831X_CHG_END_EINT_SHIFT                   11  /* CHG_END_EINT */
+#define WM831X_CHG_END_EINT_WIDTH                    1  /* CHG_END_EINT */
+#define WM831X_CHG_TO_EINT                      0x0400  /* CHG_TO_EINT */
+#define WM831X_CHG_TO_EINT_MASK                 0x0400  /* CHG_TO_EINT */
+#define WM831X_CHG_TO_EINT_SHIFT                    10  /* CHG_TO_EINT */
+#define WM831X_CHG_TO_EINT_WIDTH                     1  /* CHG_TO_EINT */
+#define WM831X_CHG_MODE_EINT                    0x0200  /* CHG_MODE_EINT */
+#define WM831X_CHG_MODE_EINT_MASK               0x0200  /* CHG_MODE_EINT */
+#define WM831X_CHG_MODE_EINT_SHIFT                   9  /* CHG_MODE_EINT */
+#define WM831X_CHG_MODE_EINT_WIDTH                   1  /* CHG_MODE_EINT */
+#define WM831X_CHG_START_EINT                   0x0100  /* CHG_START_EINT */
+#define WM831X_CHG_START_EINT_MASK              0x0100  /* CHG_START_EINT */
+#define WM831X_CHG_START_EINT_SHIFT                  8  /* CHG_START_EINT */
+#define WM831X_CHG_START_EINT_WIDTH                  1  /* CHG_START_EINT */
+#define WM831X_CS2_EINT                         0x0080  /* CS2_EINT */
+#define WM831X_CS2_EINT_MASK                    0x0080  /* CS2_EINT */
+#define WM831X_CS2_EINT_SHIFT                        7  /* CS2_EINT */
+#define WM831X_CS2_EINT_WIDTH                        1  /* CS2_EINT */
+#define WM831X_CS1_EINT                         0x0040  /* CS1_EINT */
+#define WM831X_CS1_EINT_MASK                    0x0040  /* CS1_EINT */
+#define WM831X_CS1_EINT_SHIFT                        6  /* CS1_EINT */
+#define WM831X_CS1_EINT_WIDTH                        1  /* CS1_EINT */
+#define WM831X_OTP_CMD_END_EINT                 0x0020  /* OTP_CMD_END_EINT */
+#define WM831X_OTP_CMD_END_EINT_MASK            0x0020  /* OTP_CMD_END_EINT */
+#define WM831X_OTP_CMD_END_EINT_SHIFT                5  /* OTP_CMD_END_EINT */
+#define WM831X_OTP_CMD_END_EINT_WIDTH                1  /* OTP_CMD_END_EINT */
+#define WM831X_OTP_ERR_EINT                     0x0010  /* OTP_ERR_EINT */
+#define WM831X_OTP_ERR_EINT_MASK                0x0010  /* OTP_ERR_EINT */
+#define WM831X_OTP_ERR_EINT_SHIFT                    4  /* OTP_ERR_EINT */
+#define WM831X_OTP_ERR_EINT_WIDTH                    1  /* OTP_ERR_EINT */
+#define WM831X_PS_POR_EINT                      0x0004  /* PS_POR_EINT */
+#define WM831X_PS_POR_EINT_MASK                 0x0004  /* PS_POR_EINT */
+#define WM831X_PS_POR_EINT_SHIFT                     2  /* PS_POR_EINT */
+#define WM831X_PS_POR_EINT_WIDTH                     1  /* PS_POR_EINT */
+#define WM831X_PS_SLEEP_OFF_EINT                0x0002  /* PS_SLEEP_OFF_EINT */
+#define WM831X_PS_SLEEP_OFF_EINT_MASK           0x0002  /* PS_SLEEP_OFF_EINT */
+#define WM831X_PS_SLEEP_OFF_EINT_SHIFT               1  /* PS_SLEEP_OFF_EINT */
+#define WM831X_PS_SLEEP_OFF_EINT_WIDTH               1  /* PS_SLEEP_OFF_EINT */
+#define WM831X_PS_ON_WAKE_EINT                  0x0001  /* PS_ON_WAKE_EINT */
+#define WM831X_PS_ON_WAKE_EINT_MASK             0x0001  /* PS_ON_WAKE_EINT */
+#define WM831X_PS_ON_WAKE_EINT_SHIFT                 0  /* PS_ON_WAKE_EINT */
+#define WM831X_PS_ON_WAKE_EINT_WIDTH                 1  /* PS_ON_WAKE_EINT */
+
+/*
+ * R16403 (0x4013) - Interrupt Status 3
+ */
+#define WM831X_UV_LDO10_EINT                    0x0200  /* UV_LDO10_EINT */
+#define WM831X_UV_LDO10_EINT_MASK               0x0200  /* UV_LDO10_EINT */
+#define WM831X_UV_LDO10_EINT_SHIFT                   9  /* UV_LDO10_EINT */
+#define WM831X_UV_LDO10_EINT_WIDTH                   1  /* UV_LDO10_EINT */
+#define WM831X_UV_LDO9_EINT                     0x0100  /* UV_LDO9_EINT */
+#define WM831X_UV_LDO9_EINT_MASK                0x0100  /* UV_LDO9_EINT */
+#define WM831X_UV_LDO9_EINT_SHIFT                    8  /* UV_LDO9_EINT */
+#define WM831X_UV_LDO9_EINT_WIDTH                    1  /* UV_LDO9_EINT */
+#define WM831X_UV_LDO8_EINT                     0x0080  /* UV_LDO8_EINT */
+#define WM831X_UV_LDO8_EINT_MASK                0x0080  /* UV_LDO8_EINT */
+#define WM831X_UV_LDO8_EINT_SHIFT                    7  /* UV_LDO8_EINT */
+#define WM831X_UV_LDO8_EINT_WIDTH                    1  /* UV_LDO8_EINT */
+#define WM831X_UV_LDO7_EINT                     0x0040  /* UV_LDO7_EINT */
+#define WM831X_UV_LDO7_EINT_MASK                0x0040  /* UV_LDO7_EINT */
+#define WM831X_UV_LDO7_EINT_SHIFT                    6  /* UV_LDO7_EINT */
+#define WM831X_UV_LDO7_EINT_WIDTH                    1  /* UV_LDO7_EINT */
+#define WM831X_UV_LDO6_EINT                     0x0020  /* UV_LDO6_EINT */
+#define WM831X_UV_LDO6_EINT_MASK                0x0020  /* UV_LDO6_EINT */
+#define WM831X_UV_LDO6_EINT_SHIFT                    5  /* UV_LDO6_EINT */
+#define WM831X_UV_LDO6_EINT_WIDTH                    1  /* UV_LDO6_EINT */
+#define WM831X_UV_LDO5_EINT                     0x0010  /* UV_LDO5_EINT */
+#define WM831X_UV_LDO5_EINT_MASK                0x0010  /* UV_LDO5_EINT */
+#define WM831X_UV_LDO5_EINT_SHIFT                    4  /* UV_LDO5_EINT */
+#define WM831X_UV_LDO5_EINT_WIDTH                    1  /* UV_LDO5_EINT */
+#define WM831X_UV_LDO4_EINT                     0x0008  /* UV_LDO4_EINT */
+#define WM831X_UV_LDO4_EINT_MASK                0x0008  /* UV_LDO4_EINT */
+#define WM831X_UV_LDO4_EINT_SHIFT                    3  /* UV_LDO4_EINT */
+#define WM831X_UV_LDO4_EINT_WIDTH                    1  /* UV_LDO4_EINT */
+#define WM831X_UV_LDO3_EINT                     0x0004  /* UV_LDO3_EINT */
+#define WM831X_UV_LDO3_EINT_MASK                0x0004  /* UV_LDO3_EINT */
+#define WM831X_UV_LDO3_EINT_SHIFT                    2  /* UV_LDO3_EINT */
+#define WM831X_UV_LDO3_EINT_WIDTH                    1  /* UV_LDO3_EINT */
+#define WM831X_UV_LDO2_EINT                     0x0002  /* UV_LDO2_EINT */
+#define WM831X_UV_LDO2_EINT_MASK                0x0002  /* UV_LDO2_EINT */
+#define WM831X_UV_LDO2_EINT_SHIFT                    1  /* UV_LDO2_EINT */
+#define WM831X_UV_LDO2_EINT_WIDTH                    1  /* UV_LDO2_EINT */
+#define WM831X_UV_LDO1_EINT                     0x0001  /* UV_LDO1_EINT */
+#define WM831X_UV_LDO1_EINT_MASK                0x0001  /* UV_LDO1_EINT */
+#define WM831X_UV_LDO1_EINT_SHIFT                    0  /* UV_LDO1_EINT */
+#define WM831X_UV_LDO1_EINT_WIDTH                    1  /* UV_LDO1_EINT */
+
+/*
+ * R16404 (0x4014) - Interrupt Status 4
+ */
+#define WM831X_HC_DC2_EINT                      0x0200  /* HC_DC2_EINT */
+#define WM831X_HC_DC2_EINT_MASK                 0x0200  /* HC_DC2_EINT */
+#define WM831X_HC_DC2_EINT_SHIFT                     9  /* HC_DC2_EINT */
+#define WM831X_HC_DC2_EINT_WIDTH                     1  /* HC_DC2_EINT */
+#define WM831X_HC_DC1_EINT                      0x0100  /* HC_DC1_EINT */
+#define WM831X_HC_DC1_EINT_MASK                 0x0100  /* HC_DC1_EINT */
+#define WM831X_HC_DC1_EINT_SHIFT                     8  /* HC_DC1_EINT */
+#define WM831X_HC_DC1_EINT_WIDTH                     1  /* HC_DC1_EINT */
+#define WM831X_UV_DC4_EINT                      0x0008  /* UV_DC4_EINT */
+#define WM831X_UV_DC4_EINT_MASK                 0x0008  /* UV_DC4_EINT */
+#define WM831X_UV_DC4_EINT_SHIFT                     3  /* UV_DC4_EINT */
+#define WM831X_UV_DC4_EINT_WIDTH                     1  /* UV_DC4_EINT */
+#define WM831X_UV_DC3_EINT                      0x0004  /* UV_DC3_EINT */
+#define WM831X_UV_DC3_EINT_MASK                 0x0004  /* UV_DC3_EINT */
+#define WM831X_UV_DC3_EINT_SHIFT                     2  /* UV_DC3_EINT */
+#define WM831X_UV_DC3_EINT_WIDTH                     1  /* UV_DC3_EINT */
+#define WM831X_UV_DC2_EINT                      0x0002  /* UV_DC2_EINT */
+#define WM831X_UV_DC2_EINT_MASK                 0x0002  /* UV_DC2_EINT */
+#define WM831X_UV_DC2_EINT_SHIFT                     1  /* UV_DC2_EINT */
+#define WM831X_UV_DC2_EINT_WIDTH                     1  /* UV_DC2_EINT */
+#define WM831X_UV_DC1_EINT                      0x0001  /* UV_DC1_EINT */
+#define WM831X_UV_DC1_EINT_MASK                 0x0001  /* UV_DC1_EINT */
+#define WM831X_UV_DC1_EINT_SHIFT                     0  /* UV_DC1_EINT */
+#define WM831X_UV_DC1_EINT_WIDTH                     1  /* UV_DC1_EINT */
+
+/*
+ * R16405 (0x4015) - Interrupt Status 5
+ */
+#define WM831X_GP16_EINT                        0x8000  /* GP16_EINT */
+#define WM831X_GP16_EINT_MASK                   0x8000  /* GP16_EINT */
+#define WM831X_GP16_EINT_SHIFT                      15  /* GP16_EINT */
+#define WM831X_GP16_EINT_WIDTH                       1  /* GP16_EINT */
+#define WM831X_GP15_EINT                        0x4000  /* GP15_EINT */
+#define WM831X_GP15_EINT_MASK                   0x4000  /* GP15_EINT */
+#define WM831X_GP15_EINT_SHIFT                      14  /* GP15_EINT */
+#define WM831X_GP15_EINT_WIDTH                       1  /* GP15_EINT */
+#define WM831X_GP14_EINT                        0x2000  /* GP14_EINT */
+#define WM831X_GP14_EINT_MASK                   0x2000  /* GP14_EINT */
+#define WM831X_GP14_EINT_SHIFT                      13  /* GP14_EINT */
+#define WM831X_GP14_EINT_WIDTH                       1  /* GP14_EINT */
+#define WM831X_GP13_EINT                        0x1000  /* GP13_EINT */
+#define WM831X_GP13_EINT_MASK                   0x1000  /* GP13_EINT */
+#define WM831X_GP13_EINT_SHIFT                      12  /* GP13_EINT */
+#define WM831X_GP13_EINT_WIDTH                       1  /* GP13_EINT */
+#define WM831X_GP12_EINT                        0x0800  /* GP12_EINT */
+#define WM831X_GP12_EINT_MASK                   0x0800  /* GP12_EINT */
+#define WM831X_GP12_EINT_SHIFT                      11  /* GP12_EINT */
+#define WM831X_GP12_EINT_WIDTH                       1  /* GP12_EINT */
+#define WM831X_GP11_EINT                        0x0400  /* GP11_EINT */
+#define WM831X_GP11_EINT_MASK                   0x0400  /* GP11_EINT */
+#define WM831X_GP11_EINT_SHIFT                      10  /* GP11_EINT */
+#define WM831X_GP11_EINT_WIDTH                       1  /* GP11_EINT */
+#define WM831X_GP10_EINT                        0x0200  /* GP10_EINT */
+#define WM831X_GP10_EINT_MASK                   0x0200  /* GP10_EINT */
+#define WM831X_GP10_EINT_SHIFT                       9  /* GP10_EINT */
+#define WM831X_GP10_EINT_WIDTH                       1  /* GP10_EINT */
+#define WM831X_GP9_EINT                         0x0100  /* GP9_EINT */
+#define WM831X_GP9_EINT_MASK                    0x0100  /* GP9_EINT */
+#define WM831X_GP9_EINT_SHIFT                        8  /* GP9_EINT */
+#define WM831X_GP9_EINT_WIDTH                        1  /* GP9_EINT */
+#define WM831X_GP8_EINT                         0x0080  /* GP8_EINT */
+#define WM831X_GP8_EINT_MASK                    0x0080  /* GP8_EINT */
+#define WM831X_GP8_EINT_SHIFT                        7  /* GP8_EINT */
+#define WM831X_GP8_EINT_WIDTH                        1  /* GP8_EINT */
+#define WM831X_GP7_EINT                         0x0040  /* GP7_EINT */
+#define WM831X_GP7_EINT_MASK                    0x0040  /* GP7_EINT */
+#define WM831X_GP7_EINT_SHIFT                        6  /* GP7_EINT */
+#define WM831X_GP7_EINT_WIDTH                        1  /* GP7_EINT */
+#define WM831X_GP6_EINT                         0x0020  /* GP6_EINT */
+#define WM831X_GP6_EINT_MASK                    0x0020  /* GP6_EINT */
+#define WM831X_GP6_EINT_SHIFT                        5  /* GP6_EINT */
+#define WM831X_GP6_EINT_WIDTH                        1  /* GP6_EINT */
+#define WM831X_GP5_EINT                         0x0010  /* GP5_EINT */
+#define WM831X_GP5_EINT_MASK                    0x0010  /* GP5_EINT */
+#define WM831X_GP5_EINT_SHIFT                        4  /* GP5_EINT */
+#define WM831X_GP5_EINT_WIDTH                        1  /* GP5_EINT */
+#define WM831X_GP4_EINT                         0x0008  /* GP4_EINT */
+#define WM831X_GP4_EINT_MASK                    0x0008  /* GP4_EINT */
+#define WM831X_GP4_EINT_SHIFT                        3  /* GP4_EINT */
+#define WM831X_GP4_EINT_WIDTH                        1  /* GP4_EINT */
+#define WM831X_GP3_EINT                         0x0004  /* GP3_EINT */
+#define WM831X_GP3_EINT_MASK                    0x0004  /* GP3_EINT */
+#define WM831X_GP3_EINT_SHIFT                        2  /* GP3_EINT */
+#define WM831X_GP3_EINT_WIDTH                        1  /* GP3_EINT */
+#define WM831X_GP2_EINT                         0x0002  /* GP2_EINT */
+#define WM831X_GP2_EINT_MASK                    0x0002  /* GP2_EINT */
+#define WM831X_GP2_EINT_SHIFT                        1  /* GP2_EINT */
+#define WM831X_GP2_EINT_WIDTH                        1  /* GP2_EINT */
+#define WM831X_GP1_EINT                         0x0001  /* GP1_EINT */
+#define WM831X_GP1_EINT_MASK                    0x0001  /* GP1_EINT */
+#define WM831X_GP1_EINT_SHIFT                        0  /* GP1_EINT */
+#define WM831X_GP1_EINT_WIDTH                        1  /* GP1_EINT */
+
+/*
+ * R16407 (0x4017) - IRQ Config
+ */
+#define WM831X_IRQ_OD                           0x0002  /* IRQ_OD */
+#define WM831X_IRQ_OD_MASK                      0x0002  /* IRQ_OD */
+#define WM831X_IRQ_OD_SHIFT                          1  /* IRQ_OD */
+#define WM831X_IRQ_OD_WIDTH                          1  /* IRQ_OD */
+#define WM831X_IM_IRQ                           0x0001  /* IM_IRQ */
+#define WM831X_IM_IRQ_MASK                      0x0001  /* IM_IRQ */
+#define WM831X_IM_IRQ_SHIFT                          0  /* IM_IRQ */
+#define WM831X_IM_IRQ_WIDTH                          1  /* IM_IRQ */
+
+/*
+ * R16408 (0x4018) - System Interrupts Mask
+ */
+#define WM831X_IM_PS_INT                        0x8000  /* IM_PS_INT */
+#define WM831X_IM_PS_INT_MASK                   0x8000  /* IM_PS_INT */
+#define WM831X_IM_PS_INT_SHIFT                      15  /* IM_PS_INT */
+#define WM831X_IM_PS_INT_WIDTH                       1  /* IM_PS_INT */
+#define WM831X_IM_TEMP_INT                      0x4000  /* IM_TEMP_INT */
+#define WM831X_IM_TEMP_INT_MASK                 0x4000  /* IM_TEMP_INT */
+#define WM831X_IM_TEMP_INT_SHIFT                    14  /* IM_TEMP_INT */
+#define WM831X_IM_TEMP_INT_WIDTH                     1  /* IM_TEMP_INT */
+#define WM831X_IM_GP_INT                        0x2000  /* IM_GP_INT */
+#define WM831X_IM_GP_INT_MASK                   0x2000  /* IM_GP_INT */
+#define WM831X_IM_GP_INT_SHIFT                      13  /* IM_GP_INT */
+#define WM831X_IM_GP_INT_WIDTH                       1  /* IM_GP_INT */
+#define WM831X_IM_ON_PIN_INT                    0x1000  /* IM_ON_PIN_INT */
+#define WM831X_IM_ON_PIN_INT_MASK               0x1000  /* IM_ON_PIN_INT */
+#define WM831X_IM_ON_PIN_INT_SHIFT                  12  /* IM_ON_PIN_INT */
+#define WM831X_IM_ON_PIN_INT_WIDTH                   1  /* IM_ON_PIN_INT */
+#define WM831X_IM_WDOG_INT                      0x0800  /* IM_WDOG_INT */
+#define WM831X_IM_WDOG_INT_MASK                 0x0800  /* IM_WDOG_INT */
+#define WM831X_IM_WDOG_INT_SHIFT                    11  /* IM_WDOG_INT */
+#define WM831X_IM_WDOG_INT_WIDTH                     1  /* IM_WDOG_INT */
+#define WM831X_IM_TCHDATA_INT                   0x0400  /* IM_TCHDATA_INT */
+#define WM831X_IM_TCHDATA_INT_MASK              0x0400  /* IM_TCHDATA_INT */
+#define WM831X_IM_TCHDATA_INT_SHIFT                 10  /* IM_TCHDATA_INT */
+#define WM831X_IM_TCHDATA_INT_WIDTH                  1  /* IM_TCHDATA_INT */
+#define WM831X_IM_TCHPD_INT                     0x0200  /* IM_TCHPD_INT */
+#define WM831X_IM_TCHPD_INT_MASK                0x0200  /* IM_TCHPD_INT */
+#define WM831X_IM_TCHPD_INT_SHIFT                    9  /* IM_TCHPD_INT */
+#define WM831X_IM_TCHPD_INT_WIDTH                    1  /* IM_TCHPD_INT */
+#define WM831X_IM_AUXADC_INT                    0x0100  /* IM_AUXADC_INT */
+#define WM831X_IM_AUXADC_INT_MASK               0x0100  /* IM_AUXADC_INT */
+#define WM831X_IM_AUXADC_INT_SHIFT                   8  /* IM_AUXADC_INT */
+#define WM831X_IM_AUXADC_INT_WIDTH                   1  /* IM_AUXADC_INT */
+#define WM831X_IM_PPM_INT                       0x0080  /* IM_PPM_INT */
+#define WM831X_IM_PPM_INT_MASK                  0x0080  /* IM_PPM_INT */
+#define WM831X_IM_PPM_INT_SHIFT                      7  /* IM_PPM_INT */
+#define WM831X_IM_PPM_INT_WIDTH                      1  /* IM_PPM_INT */
+#define WM831X_IM_CS_INT                        0x0040  /* IM_CS_INT */
+#define WM831X_IM_CS_INT_MASK                   0x0040  /* IM_CS_INT */
+#define WM831X_IM_CS_INT_SHIFT                       6  /* IM_CS_INT */
+#define WM831X_IM_CS_INT_WIDTH                       1  /* IM_CS_INT */
+#define WM831X_IM_RTC_INT                       0x0020  /* IM_RTC_INT */
+#define WM831X_IM_RTC_INT_MASK                  0x0020  /* IM_RTC_INT */
+#define WM831X_IM_RTC_INT_SHIFT                      5  /* IM_RTC_INT */
+#define WM831X_IM_RTC_INT_WIDTH                      1  /* IM_RTC_INT */
+#define WM831X_IM_OTP_INT                       0x0010  /* IM_OTP_INT */
+#define WM831X_IM_OTP_INT_MASK                  0x0010  /* IM_OTP_INT */
+#define WM831X_IM_OTP_INT_SHIFT                      4  /* IM_OTP_INT */
+#define WM831X_IM_OTP_INT_WIDTH                      1  /* IM_OTP_INT */
+#define WM831X_IM_CHILD_INT                     0x0008  /* IM_CHILD_INT */
+#define WM831X_IM_CHILD_INT_MASK                0x0008  /* IM_CHILD_INT */
+#define WM831X_IM_CHILD_INT_SHIFT                    3  /* IM_CHILD_INT */
+#define WM831X_IM_CHILD_INT_WIDTH                    1  /* IM_CHILD_INT */
+#define WM831X_IM_CHG_INT                       0x0004  /* IM_CHG_INT */
+#define WM831X_IM_CHG_INT_MASK                  0x0004  /* IM_CHG_INT */
+#define WM831X_IM_CHG_INT_SHIFT                      2  /* IM_CHG_INT */
+#define WM831X_IM_CHG_INT_WIDTH                      1  /* IM_CHG_INT */
+#define WM831X_IM_HC_INT                        0x0002  /* IM_HC_INT */
+#define WM831X_IM_HC_INT_MASK                   0x0002  /* IM_HC_INT */
+#define WM831X_IM_HC_INT_SHIFT                       1  /* IM_HC_INT */
+#define WM831X_IM_HC_INT_WIDTH                       1  /* IM_HC_INT */
+#define WM831X_IM_UV_INT                        0x0001  /* IM_UV_INT */
+#define WM831X_IM_UV_INT_MASK                   0x0001  /* IM_UV_INT */
+#define WM831X_IM_UV_INT_SHIFT                       0  /* IM_UV_INT */
+#define WM831X_IM_UV_INT_WIDTH                       1  /* IM_UV_INT */
+
+/*
+ * R16409 (0x4019) - Interrupt Status 1 Mask
+ */
+#define WM831X_IM_PPM_SYSLO_EINT                0x8000  /* IM_PPM_SYSLO_EINT */
+#define WM831X_IM_PPM_SYSLO_EINT_MASK           0x8000  /* IM_PPM_SYSLO_EINT */
+#define WM831X_IM_PPM_SYSLO_EINT_SHIFT              15  /* IM_PPM_SYSLO_EINT */
+#define WM831X_IM_PPM_SYSLO_EINT_WIDTH               1  /* IM_PPM_SYSLO_EINT */
+#define WM831X_IM_PPM_PWR_SRC_EINT              0x4000  /* IM_PPM_PWR_SRC_EINT */
+#define WM831X_IM_PPM_PWR_SRC_EINT_MASK         0x4000  /* IM_PPM_PWR_SRC_EINT */
+#define WM831X_IM_PPM_PWR_SRC_EINT_SHIFT            14  /* IM_PPM_PWR_SRC_EINT */
+#define WM831X_IM_PPM_PWR_SRC_EINT_WIDTH             1  /* IM_PPM_PWR_SRC_EINT */
+#define WM831X_IM_PPM_USB_CURR_EINT             0x2000  /* IM_PPM_USB_CURR_EINT */
+#define WM831X_IM_PPM_USB_CURR_EINT_MASK        0x2000  /* IM_PPM_USB_CURR_EINT */
+#define WM831X_IM_PPM_USB_CURR_EINT_SHIFT           13  /* IM_PPM_USB_CURR_EINT */
+#define WM831X_IM_PPM_USB_CURR_EINT_WIDTH            1  /* IM_PPM_USB_CURR_EINT */
+#define WM831X_IM_ON_PIN_EINT                   0x1000  /* IM_ON_PIN_EINT */
+#define WM831X_IM_ON_PIN_EINT_MASK              0x1000  /* IM_ON_PIN_EINT */
+#define WM831X_IM_ON_PIN_EINT_SHIFT                 12  /* IM_ON_PIN_EINT */
+#define WM831X_IM_ON_PIN_EINT_WIDTH                  1  /* IM_ON_PIN_EINT */
+#define WM831X_IM_WDOG_TO_EINT                  0x0800  /* IM_WDOG_TO_EINT */
+#define WM831X_IM_WDOG_TO_EINT_MASK             0x0800  /* IM_WDOG_TO_EINT */
+#define WM831X_IM_WDOG_TO_EINT_SHIFT                11  /* IM_WDOG_TO_EINT */
+#define WM831X_IM_WDOG_TO_EINT_WIDTH                 1  /* IM_WDOG_TO_EINT */
+#define WM831X_IM_TCHDATA_EINT                  0x0400  /* IM_TCHDATA_EINT */
+#define WM831X_IM_TCHDATA_EINT_MASK             0x0400  /* IM_TCHDATA_EINT */
+#define WM831X_IM_TCHDATA_EINT_SHIFT                10  /* IM_TCHDATA_EINT */
+#define WM831X_IM_TCHDATA_EINT_WIDTH                 1  /* IM_TCHDATA_EINT */
+#define WM831X_IM_TCHPD_EINT                    0x0200  /* IM_TCHPD_EINT */
+#define WM831X_IM_TCHPD_EINT_MASK               0x0200  /* IM_TCHPD_EINT */
+#define WM831X_IM_TCHPD_EINT_SHIFT                   9  /* IM_TCHPD_EINT */
+#define WM831X_IM_TCHPD_EINT_WIDTH                   1  /* IM_TCHPD_EINT */
+#define WM831X_IM_AUXADC_DATA_EINT              0x0100  /* IM_AUXADC_DATA_EINT */
+#define WM831X_IM_AUXADC_DATA_EINT_MASK         0x0100  /* IM_AUXADC_DATA_EINT */
+#define WM831X_IM_AUXADC_DATA_EINT_SHIFT             8  /* IM_AUXADC_DATA_EINT */
+#define WM831X_IM_AUXADC_DATA_EINT_WIDTH             1  /* IM_AUXADC_DATA_EINT */
+#define WM831X_IM_AUXADC_DCOMP4_EINT            0x0080  /* IM_AUXADC_DCOMP4_EINT */
+#define WM831X_IM_AUXADC_DCOMP4_EINT_MASK       0x0080  /* IM_AUXADC_DCOMP4_EINT */
+#define WM831X_IM_AUXADC_DCOMP4_EINT_SHIFT           7  /* IM_AUXADC_DCOMP4_EINT */
+#define WM831X_IM_AUXADC_DCOMP4_EINT_WIDTH           1  /* IM_AUXADC_DCOMP4_EINT */
+#define WM831X_IM_AUXADC_DCOMP3_EINT            0x0040  /* IM_AUXADC_DCOMP3_EINT */
+#define WM831X_IM_AUXADC_DCOMP3_EINT_MASK       0x0040  /* IM_AUXADC_DCOMP3_EINT */
+#define WM831X_IM_AUXADC_DCOMP3_EINT_SHIFT           6  /* IM_AUXADC_DCOMP3_EINT */
+#define WM831X_IM_AUXADC_DCOMP3_EINT_WIDTH           1  /* IM_AUXADC_DCOMP3_EINT */
+#define WM831X_IM_AUXADC_DCOMP2_EINT            0x0020  /* IM_AUXADC_DCOMP2_EINT */
+#define WM831X_IM_AUXADC_DCOMP2_EINT_MASK       0x0020  /* IM_AUXADC_DCOMP2_EINT */
+#define WM831X_IM_AUXADC_DCOMP2_EINT_SHIFT           5  /* IM_AUXADC_DCOMP2_EINT */
+#define WM831X_IM_AUXADC_DCOMP2_EINT_WIDTH           1  /* IM_AUXADC_DCOMP2_EINT */
+#define WM831X_IM_AUXADC_DCOMP1_EINT            0x0010  /* IM_AUXADC_DCOMP1_EINT */
+#define WM831X_IM_AUXADC_DCOMP1_EINT_MASK       0x0010  /* IM_AUXADC_DCOMP1_EINT */
+#define WM831X_IM_AUXADC_DCOMP1_EINT_SHIFT           4  /* IM_AUXADC_DCOMP1_EINT */
+#define WM831X_IM_AUXADC_DCOMP1_EINT_WIDTH           1  /* IM_AUXADC_DCOMP1_EINT */
+#define WM831X_IM_RTC_PER_EINT                  0x0008  /* IM_RTC_PER_EINT */
+#define WM831X_IM_RTC_PER_EINT_MASK             0x0008  /* IM_RTC_PER_EINT */
+#define WM831X_IM_RTC_PER_EINT_SHIFT                 3  /* IM_RTC_PER_EINT */
+#define WM831X_IM_RTC_PER_EINT_WIDTH                 1  /* IM_RTC_PER_EINT */
+#define WM831X_IM_RTC_ALM_EINT                  0x0004  /* IM_RTC_ALM_EINT */
+#define WM831X_IM_RTC_ALM_EINT_MASK             0x0004  /* IM_RTC_ALM_EINT */
+#define WM831X_IM_RTC_ALM_EINT_SHIFT                 2  /* IM_RTC_ALM_EINT */
+#define WM831X_IM_RTC_ALM_EINT_WIDTH                 1  /* IM_RTC_ALM_EINT */
+#define WM831X_IM_TEMP_THW_EINT                 0x0002  /* IM_TEMP_THW_EINT */
+#define WM831X_IM_TEMP_THW_EINT_MASK            0x0002  /* IM_TEMP_THW_EINT */
+#define WM831X_IM_TEMP_THW_EINT_SHIFT                1  /* IM_TEMP_THW_EINT */
+#define WM831X_IM_TEMP_THW_EINT_WIDTH                1  /* IM_TEMP_THW_EINT */
+
+/*
+ * R16410 (0x401A) - Interrupt Status 2 Mask
+ */
+#define WM831X_IM_CHG_BATT_HOT_EINT             0x8000  /* IM_CHG_BATT_HOT_EINT */
+#define WM831X_IM_CHG_BATT_HOT_EINT_MASK        0x8000  /* IM_CHG_BATT_HOT_EINT */
+#define WM831X_IM_CHG_BATT_HOT_EINT_SHIFT           15  /* IM_CHG_BATT_HOT_EINT */
+#define WM831X_IM_CHG_BATT_HOT_EINT_WIDTH            1  /* IM_CHG_BATT_HOT_EINT */
+#define WM831X_IM_CHG_BATT_COLD_EINT            0x4000  /* IM_CHG_BATT_COLD_EINT */
+#define WM831X_IM_CHG_BATT_COLD_EINT_MASK       0x4000  /* IM_CHG_BATT_COLD_EINT */
+#define WM831X_IM_CHG_BATT_COLD_EINT_SHIFT          14  /* IM_CHG_BATT_COLD_EINT */
+#define WM831X_IM_CHG_BATT_COLD_EINT_WIDTH           1  /* IM_CHG_BATT_COLD_EINT */
+#define WM831X_IM_CHG_BATT_FAIL_EINT            0x2000  /* IM_CHG_BATT_FAIL_EINT */
+#define WM831X_IM_CHG_BATT_FAIL_EINT_MASK       0x2000  /* IM_CHG_BATT_FAIL_EINT */
+#define WM831X_IM_CHG_BATT_FAIL_EINT_SHIFT          13  /* IM_CHG_BATT_FAIL_EINT */
+#define WM831X_IM_CHG_BATT_FAIL_EINT_WIDTH           1  /* IM_CHG_BATT_FAIL_EINT */
+#define WM831X_IM_CHG_OV_EINT                   0x1000  /* IM_CHG_OV_EINT */
+#define WM831X_IM_CHG_OV_EINT_MASK              0x1000  /* IM_CHG_OV_EINT */
+#define WM831X_IM_CHG_OV_EINT_SHIFT                 12  /* IM_CHG_OV_EINT */
+#define WM831X_IM_CHG_OV_EINT_WIDTH                  1  /* IM_CHG_OV_EINT */
+#define WM831X_IM_CHG_END_EINT                  0x0800  /* IM_CHG_END_EINT */
+#define WM831X_IM_CHG_END_EINT_MASK             0x0800  /* IM_CHG_END_EINT */
+#define WM831X_IM_CHG_END_EINT_SHIFT                11  /* IM_CHG_END_EINT */
+#define WM831X_IM_CHG_END_EINT_WIDTH                 1  /* IM_CHG_END_EINT */
+#define WM831X_IM_CHG_TO_EINT                   0x0400  /* IM_CHG_TO_EINT */
+#define WM831X_IM_CHG_TO_EINT_MASK              0x0400  /* IM_CHG_TO_EINT */
+#define WM831X_IM_CHG_TO_EINT_SHIFT                 10  /* IM_CHG_TO_EINT */
+#define WM831X_IM_CHG_TO_EINT_WIDTH                  1  /* IM_CHG_TO_EINT */
+#define WM831X_IM_CHG_MODE_EINT                 0x0200  /* IM_CHG_MODE_EINT */
+#define WM831X_IM_CHG_MODE_EINT_MASK            0x0200  /* IM_CHG_MODE_EINT */
+#define WM831X_IM_CHG_MODE_EINT_SHIFT                9  /* IM_CHG_MODE_EINT */
+#define WM831X_IM_CHG_MODE_EINT_WIDTH                1  /* IM_CHG_MODE_EINT */
+#define WM831X_IM_CHG_START_EINT                0x0100  /* IM_CHG_START_EINT */
+#define WM831X_IM_CHG_START_EINT_MASK           0x0100  /* IM_CHG_START_EINT */
+#define WM831X_IM_CHG_START_EINT_SHIFT               8  /* IM_CHG_START_EINT */
+#define WM831X_IM_CHG_START_EINT_WIDTH               1  /* IM_CHG_START_EINT */
+#define WM831X_IM_CS2_EINT                      0x0080  /* IM_CS2_EINT */
+#define WM831X_IM_CS2_EINT_MASK                 0x0080  /* IM_CS2_EINT */
+#define WM831X_IM_CS2_EINT_SHIFT                     7  /* IM_CS2_EINT */
+#define WM831X_IM_CS2_EINT_WIDTH                     1  /* IM_CS2_EINT */
+#define WM831X_IM_CS1_EINT                      0x0040  /* IM_CS1_EINT */
+#define WM831X_IM_CS1_EINT_MASK                 0x0040  /* IM_CS1_EINT */
+#define WM831X_IM_CS1_EINT_SHIFT                     6  /* IM_CS1_EINT */
+#define WM831X_IM_CS1_EINT_WIDTH                     1  /* IM_CS1_EINT */
+#define WM831X_IM_OTP_CMD_END_EINT              0x0020  /* IM_OTP_CMD_END_EINT */
+#define WM831X_IM_OTP_CMD_END_EINT_MASK         0x0020  /* IM_OTP_CMD_END_EINT */
+#define WM831X_IM_OTP_CMD_END_EINT_SHIFT             5  /* IM_OTP_CMD_END_EINT */
+#define WM831X_IM_OTP_CMD_END_EINT_WIDTH             1  /* IM_OTP_CMD_END_EINT */
+#define WM831X_IM_OTP_ERR_EINT                  0x0010  /* IM_OTP_ERR_EINT */
+#define WM831X_IM_OTP_ERR_EINT_MASK             0x0010  /* IM_OTP_ERR_EINT */
+#define WM831X_IM_OTP_ERR_EINT_SHIFT                 4  /* IM_OTP_ERR_EINT */
+#define WM831X_IM_OTP_ERR_EINT_WIDTH                 1  /* IM_OTP_ERR_EINT */
+#define WM831X_IM_PS_POR_EINT                   0x0004  /* IM_PS_POR_EINT */
+#define WM831X_IM_PS_POR_EINT_MASK              0x0004  /* IM_PS_POR_EINT */
+#define WM831X_IM_PS_POR_EINT_SHIFT                  2  /* IM_PS_POR_EINT */
+#define WM831X_IM_PS_POR_EINT_WIDTH                  1  /* IM_PS_POR_EINT */
+#define WM831X_IM_PS_SLEEP_OFF_EINT             0x0002  /* IM_PS_SLEEP_OFF_EINT */
+#define WM831X_IM_PS_SLEEP_OFF_EINT_MASK        0x0002  /* IM_PS_SLEEP_OFF_EINT */
+#define WM831X_IM_PS_SLEEP_OFF_EINT_SHIFT            1  /* IM_PS_SLEEP_OFF_EINT */
+#define WM831X_IM_PS_SLEEP_OFF_EINT_WIDTH            1  /* IM_PS_SLEEP_OFF_EINT */
+#define WM831X_IM_PS_ON_WAKE_EINT               0x0001  /* IM_PS_ON_WAKE_EINT */
+#define WM831X_IM_PS_ON_WAKE_EINT_MASK          0x0001  /* IM_PS_ON_WAKE_EINT */
+#define WM831X_IM_PS_ON_WAKE_EINT_SHIFT              0  /* IM_PS_ON_WAKE_EINT */
+#define WM831X_IM_PS_ON_WAKE_EINT_WIDTH              1  /* IM_PS_ON_WAKE_EINT */
+
+/*
+ * R16411 (0x401B) - Interrupt Status 3 Mask
+ */
+#define WM831X_IM_UV_LDO10_EINT                 0x0200  /* IM_UV_LDO10_EINT */
+#define WM831X_IM_UV_LDO10_EINT_MASK            0x0200  /* IM_UV_LDO10_EINT */
+#define WM831X_IM_UV_LDO10_EINT_SHIFT                9  /* IM_UV_LDO10_EINT */
+#define WM831X_IM_UV_LDO10_EINT_WIDTH                1  /* IM_UV_LDO10_EINT */
+#define WM831X_IM_UV_LDO9_EINT                  0x0100  /* IM_UV_LDO9_EINT */
+#define WM831X_IM_UV_LDO9_EINT_MASK             0x0100  /* IM_UV_LDO9_EINT */
+#define WM831X_IM_UV_LDO9_EINT_SHIFT                 8  /* IM_UV_LDO9_EINT */
+#define WM831X_IM_UV_LDO9_EINT_WIDTH                 1  /* IM_UV_LDO9_EINT */
+#define WM831X_IM_UV_LDO8_EINT                  0x0080  /* IM_UV_LDO8_EINT */
+#define WM831X_IM_UV_LDO8_EINT_MASK             0x0080  /* IM_UV_LDO8_EINT */
+#define WM831X_IM_UV_LDO8_EINT_SHIFT                 7  /* IM_UV_LDO8_EINT */
+#define WM831X_IM_UV_LDO8_EINT_WIDTH                 1  /* IM_UV_LDO8_EINT */
+#define WM831X_IM_UV_LDO7_EINT                  0x0040  /* IM_UV_LDO7_EINT */
+#define WM831X_IM_UV_LDO7_EINT_MASK             0x0040  /* IM_UV_LDO7_EINT */
+#define WM831X_IM_UV_LDO7_EINT_SHIFT                 6  /* IM_UV_LDO7_EINT */
+#define WM831X_IM_UV_LDO7_EINT_WIDTH                 1  /* IM_UV_LDO7_EINT */
+#define WM831X_IM_UV_LDO6_EINT                  0x0020  /* IM_UV_LDO6_EINT */
+#define WM831X_IM_UV_LDO6_EINT_MASK             0x0020  /* IM_UV_LDO6_EINT */
+#define WM831X_IM_UV_LDO6_EINT_SHIFT                 5  /* IM_UV_LDO6_EINT */
+#define WM831X_IM_UV_LDO6_EINT_WIDTH                 1  /* IM_UV_LDO6_EINT */
+#define WM831X_IM_UV_LDO5_EINT                  0x0010  /* IM_UV_LDO5_EINT */
+#define WM831X_IM_UV_LDO5_EINT_MASK             0x0010  /* IM_UV_LDO5_EINT */
+#define WM831X_IM_UV_LDO5_EINT_SHIFT                 4  /* IM_UV_LDO5_EINT */
+#define WM831X_IM_UV_LDO5_EINT_WIDTH                 1  /* IM_UV_LDO5_EINT */
+#define WM831X_IM_UV_LDO4_EINT                  0x0008  /* IM_UV_LDO4_EINT */
+#define WM831X_IM_UV_LDO4_EINT_MASK             0x0008  /* IM_UV_LDO4_EINT */
+#define WM831X_IM_UV_LDO4_EINT_SHIFT                 3  /* IM_UV_LDO4_EINT */
+#define WM831X_IM_UV_LDO4_EINT_WIDTH                 1  /* IM_UV_LDO4_EINT */
+#define WM831X_IM_UV_LDO3_EINT                  0x0004  /* IM_UV_LDO3_EINT */
+#define WM831X_IM_UV_LDO3_EINT_MASK             0x0004  /* IM_UV_LDO3_EINT */
+#define WM831X_IM_UV_LDO3_EINT_SHIFT                 2  /* IM_UV_LDO3_EINT */
+#define WM831X_IM_UV_LDO3_EINT_WIDTH                 1  /* IM_UV_LDO3_EINT */
+#define WM831X_IM_UV_LDO2_EINT                  0x0002  /* IM_UV_LDO2_EINT */
+#define WM831X_IM_UV_LDO2_EINT_MASK             0x0002  /* IM_UV_LDO2_EINT */
+#define WM831X_IM_UV_LDO2_EINT_SHIFT                 1  /* IM_UV_LDO2_EINT */
+#define WM831X_IM_UV_LDO2_EINT_WIDTH                 1  /* IM_UV_LDO2_EINT */
+#define WM831X_IM_UV_LDO1_EINT                  0x0001  /* IM_UV_LDO1_EINT */
+#define WM831X_IM_UV_LDO1_EINT_MASK             0x0001  /* IM_UV_LDO1_EINT */
+#define WM831X_IM_UV_LDO1_EINT_SHIFT                 0  /* IM_UV_LDO1_EINT */
+#define WM831X_IM_UV_LDO1_EINT_WIDTH                 1  /* IM_UV_LDO1_EINT */
+
+/*
+ * R16412 (0x401C) - Interrupt Status 4 Mask
+ */
+#define WM831X_IM_HC_DC2_EINT                   0x0200  /* IM_HC_DC2_EINT */
+#define WM831X_IM_HC_DC2_EINT_MASK              0x0200  /* IM_HC_DC2_EINT */
+#define WM831X_IM_HC_DC2_EINT_SHIFT                  9  /* IM_HC_DC2_EINT */
+#define WM831X_IM_HC_DC2_EINT_WIDTH                  1  /* IM_HC_DC2_EINT */
+#define WM831X_IM_HC_DC1_EINT                   0x0100  /* IM_HC_DC1_EINT */
+#define WM831X_IM_HC_DC1_EINT_MASK              0x0100  /* IM_HC_DC1_EINT */
+#define WM831X_IM_HC_DC1_EINT_SHIFT                  8  /* IM_HC_DC1_EINT */
+#define WM831X_IM_HC_DC1_EINT_WIDTH                  1  /* IM_HC_DC1_EINT */
+#define WM831X_IM_UV_DC4_EINT                   0x0008  /* IM_UV_DC4_EINT */
+#define WM831X_IM_UV_DC4_EINT_MASK              0x0008  /* IM_UV_DC4_EINT */
+#define WM831X_IM_UV_DC4_EINT_SHIFT                  3  /* IM_UV_DC4_EINT */
+#define WM831X_IM_UV_DC4_EINT_WIDTH                  1  /* IM_UV_DC4_EINT */
+#define WM831X_IM_UV_DC3_EINT                   0x0004  /* IM_UV_DC3_EINT */
+#define WM831X_IM_UV_DC3_EINT_MASK              0x0004  /* IM_UV_DC3_EINT */
+#define WM831X_IM_UV_DC3_EINT_SHIFT                  2  /* IM_UV_DC3_EINT */
+#define WM831X_IM_UV_DC3_EINT_WIDTH                  1  /* IM_UV_DC3_EINT */
+#define WM831X_IM_UV_DC2_EINT                   0x0002  /* IM_UV_DC2_EINT */
+#define WM831X_IM_UV_DC2_EINT_MASK              0x0002  /* IM_UV_DC2_EINT */
+#define WM831X_IM_UV_DC2_EINT_SHIFT                  1  /* IM_UV_DC2_EINT */
+#define WM831X_IM_UV_DC2_EINT_WIDTH                  1  /* IM_UV_DC2_EINT */
+#define WM831X_IM_UV_DC1_EINT                   0x0001  /* IM_UV_DC1_EINT */
+#define WM831X_IM_UV_DC1_EINT_MASK              0x0001  /* IM_UV_DC1_EINT */
+#define WM831X_IM_UV_DC1_EINT_SHIFT                  0  /* IM_UV_DC1_EINT */
+#define WM831X_IM_UV_DC1_EINT_WIDTH                  1  /* IM_UV_DC1_EINT */
+
+/*
+ * R16413 (0x401D) - Interrupt Status 5 Mask
+ */
+#define WM831X_IM_GP16_EINT                     0x8000  /* IM_GP16_EINT */
+#define WM831X_IM_GP16_EINT_MASK                0x8000  /* IM_GP16_EINT */
+#define WM831X_IM_GP16_EINT_SHIFT                   15  /* IM_GP16_EINT */
+#define WM831X_IM_GP16_EINT_WIDTH                    1  /* IM_GP16_EINT */
+#define WM831X_IM_GP15_EINT                     0x4000  /* IM_GP15_EINT */
+#define WM831X_IM_GP15_EINT_MASK                0x4000  /* IM_GP15_EINT */
+#define WM831X_IM_GP15_EINT_SHIFT                   14  /* IM_GP15_EINT */
+#define WM831X_IM_GP15_EINT_WIDTH                    1  /* IM_GP15_EINT */
+#define WM831X_IM_GP14_EINT                     0x2000  /* IM_GP14_EINT */
+#define WM831X_IM_GP14_EINT_MASK                0x2000  /* IM_GP14_EINT */
+#define WM831X_IM_GP14_EINT_SHIFT                   13  /* IM_GP14_EINT */
+#define WM831X_IM_GP14_EINT_WIDTH                    1  /* IM_GP14_EINT */
+#define WM831X_IM_GP13_EINT                     0x1000  /* IM_GP13_EINT */
+#define WM831X_IM_GP13_EINT_MASK                0x1000  /* IM_GP13_EINT */
+#define WM831X_IM_GP13_EINT_SHIFT                   12  /* IM_GP13_EINT */
+#define WM831X_IM_GP13_EINT_WIDTH                    1  /* IM_GP13_EINT */
+#define WM831X_IM_GP12_EINT                     0x0800  /* IM_GP12_EINT */
+#define WM831X_IM_GP12_EINT_MASK                0x0800  /* IM_GP12_EINT */
+#define WM831X_IM_GP12_EINT_SHIFT                   11  /* IM_GP12_EINT */
+#define WM831X_IM_GP12_EINT_WIDTH                    1  /* IM_GP12_EINT */
+#define WM831X_IM_GP11_EINT                     0x0400  /* IM_GP11_EINT */
+#define WM831X_IM_GP11_EINT_MASK                0x0400  /* IM_GP11_EINT */
+#define WM831X_IM_GP11_EINT_SHIFT                   10  /* IM_GP11_EINT */
+#define WM831X_IM_GP11_EINT_WIDTH                    1  /* IM_GP11_EINT */
+#define WM831X_IM_GP10_EINT                     0x0200  /* IM_GP10_EINT */
+#define WM831X_IM_GP10_EINT_MASK                0x0200  /* IM_GP10_EINT */
+#define WM831X_IM_GP10_EINT_SHIFT                    9  /* IM_GP10_EINT */
+#define WM831X_IM_GP10_EINT_WIDTH                    1  /* IM_GP10_EINT */
+#define WM831X_IM_GP9_EINT                      0x0100  /* IM_GP9_EINT */
+#define WM831X_IM_GP9_EINT_MASK                 0x0100  /* IM_GP9_EINT */
+#define WM831X_IM_GP9_EINT_SHIFT                     8  /* IM_GP9_EINT */
+#define WM831X_IM_GP9_EINT_WIDTH                     1  /* IM_GP9_EINT */
+#define WM831X_IM_GP8_EINT                      0x0080  /* IM_GP8_EINT */
+#define WM831X_IM_GP8_EINT_MASK                 0x0080  /* IM_GP8_EINT */
+#define WM831X_IM_GP8_EINT_SHIFT                     7  /* IM_GP8_EINT */
+#define WM831X_IM_GP8_EINT_WIDTH                     1  /* IM_GP8_EINT */
+#define WM831X_IM_GP7_EINT                      0x0040  /* IM_GP7_EINT */
+#define WM831X_IM_GP7_EINT_MASK                 0x0040  /* IM_GP7_EINT */
+#define WM831X_IM_GP7_EINT_SHIFT                     6  /* IM_GP7_EINT */
+#define WM831X_IM_GP7_EINT_WIDTH                     1  /* IM_GP7_EINT */
+#define WM831X_IM_GP6_EINT                      0x0020  /* IM_GP6_EINT */
+#define WM831X_IM_GP6_EINT_MASK                 0x0020  /* IM_GP6_EINT */
+#define WM831X_IM_GP6_EINT_SHIFT                     5  /* IM_GP6_EINT */
+#define WM831X_IM_GP6_EINT_WIDTH                     1  /* IM_GP6_EINT */
+#define WM831X_IM_GP5_EINT                      0x0010  /* IM_GP5_EINT */
+#define WM831X_IM_GP5_EINT_MASK                 0x0010  /* IM_GP5_EINT */
+#define WM831X_IM_GP5_EINT_SHIFT                     4  /* IM_GP5_EINT */
+#define WM831X_IM_GP5_EINT_WIDTH                     1  /* IM_GP5_EINT */
+#define WM831X_IM_GP4_EINT                      0x0008  /* IM_GP4_EINT */
+#define WM831X_IM_GP4_EINT_MASK                 0x0008  /* IM_GP4_EINT */
+#define WM831X_IM_GP4_EINT_SHIFT                     3  /* IM_GP4_EINT */
+#define WM831X_IM_GP4_EINT_WIDTH                     1  /* IM_GP4_EINT */
+#define WM831X_IM_GP3_EINT                      0x0004  /* IM_GP3_EINT */
+#define WM831X_IM_GP3_EINT_MASK                 0x0004  /* IM_GP3_EINT */
+#define WM831X_IM_GP3_EINT_SHIFT                     2  /* IM_GP3_EINT */
+#define WM831X_IM_GP3_EINT_WIDTH                     1  /* IM_GP3_EINT */
+#define WM831X_IM_GP2_EINT                      0x0002  /* IM_GP2_EINT */
+#define WM831X_IM_GP2_EINT_MASK                 0x0002  /* IM_GP2_EINT */
+#define WM831X_IM_GP2_EINT_SHIFT                     1  /* IM_GP2_EINT */
+#define WM831X_IM_GP2_EINT_WIDTH                     1  /* IM_GP2_EINT */
+#define WM831X_IM_GP1_EINT                      0x0001  /* IM_GP1_EINT */
+#define WM831X_IM_GP1_EINT_MASK                 0x0001  /* IM_GP1_EINT */
+#define WM831X_IM_GP1_EINT_SHIFT                     0  /* IM_GP1_EINT */
+#define WM831X_IM_GP1_EINT_WIDTH                     1  /* IM_GP1_EINT */
+
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 7e9f9fd4b8285c52c0950a1929864346de5caa6d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:45:54 +0100
Subject: mfd: Add WM831x AUXADC support

The WM831x contains an auxiliary ADC with a number of switchable
inputs which is used to monitor some of the voltages and
temperatures in the system and has some external inputs which can be
used for machine specific purposes. Provide an API allowing drivers
to read values from the ADC.

An internal reference voltage is provided to allow callibration of
the ADC. This is used to calibrate the device at startup.

The hardware also supports continuous readings and digital comparators.
These are not yet supported by the driver.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c         | 101 ++++++++++++++++++
 include/linux/mfd/wm831x/auxadc.h | 216 ++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/core.h   |   2 +
 3 files changed, 319 insertions(+)
 create mode 100644 include/linux/mfd/wm831x/auxadc.h

(limited to 'include/linux')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index eb63d22160d1..42bef1dd2ca1 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -15,11 +15,14 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/i2c.h>
+#include <linux/bcd.h>
+#include <linux/delay.h>
 #include <linux/mfd/core.h>
 
 #include <linux/mfd/wm831x/core.h>
 #include <linux/mfd/wm831x/pdata.h>
 #include <linux/mfd/wm831x/irq.h>
+#include <linux/mfd/wm831x/auxadc.h>
 
 enum wm831x_parent {
 	WM8310 = 0,
@@ -244,6 +247,103 @@ out:
 }
 EXPORT_SYMBOL_GPL(wm831x_set_bits);
 
+/**
+ * wm831x_auxadc_read: Read a value from the WM831x AUXADC
+ *
+ * @wm831x: Device to read from.
+ * @input: AUXADC input to read.
+ */
+int wm831x_auxadc_read(struct wm831x *wm831x, enum wm831x_auxadc input)
+{
+	int tries = 10;
+	int ret, src;
+
+	mutex_lock(&wm831x->auxadc_lock);
+
+	ret = wm831x_set_bits(wm831x, WM831X_AUXADC_CONTROL,
+			      WM831X_AUX_ENA, WM831X_AUX_ENA);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to enable AUXADC: %d\n", ret);
+		goto out;
+	}
+
+	/* We force a single source at present */
+	src = input;
+	ret = wm831x_reg_write(wm831x, WM831X_AUXADC_SOURCE,
+			       1 << src);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to set AUXADC source: %d\n", ret);
+		goto out;
+	}
+
+	ret = wm831x_set_bits(wm831x, WM831X_AUXADC_CONTROL,
+			      WM831X_AUX_CVT_ENA, WM831X_AUX_CVT_ENA);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to start AUXADC: %d\n", ret);
+		goto disable;
+	}
+
+	do {
+		msleep(1);
+
+		ret = wm831x_reg_read(wm831x, WM831X_AUXADC_CONTROL);
+		if (ret < 0)
+			ret = WM831X_AUX_CVT_ENA;
+	} while ((ret & WM831X_AUX_CVT_ENA) && --tries);
+
+	if (ret & WM831X_AUX_CVT_ENA) {
+		dev_err(wm831x->dev, "Timed out reading AUXADC\n");
+		ret = -EBUSY;
+		goto disable;
+	}
+
+	ret = wm831x_reg_read(wm831x, WM831X_AUXADC_DATA);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to read AUXADC data: %d\n", ret);
+	} else {
+		src = ((ret & WM831X_AUX_DATA_SRC_MASK)
+		       >> WM831X_AUX_DATA_SRC_SHIFT) - 1;
+
+		if (src == 14)
+			src = WM831X_AUX_CAL;
+
+		if (src != input) {
+			dev_err(wm831x->dev, "Data from source %d not %d\n",
+				src, input);
+			ret = -EINVAL;
+		} else {
+			ret &= WM831X_AUX_DATA_MASK;
+		}
+	}
+
+disable:
+	wm831x_set_bits(wm831x, WM831X_AUXADC_CONTROL, WM831X_AUX_ENA, 0);
+out:
+	mutex_unlock(&wm831x->auxadc_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(wm831x_auxadc_read);
+
+/**
+ * wm831x_auxadc_read_uv: Read a voltage from the WM831x AUXADC
+ *
+ * @wm831x: Device to read from.
+ * @input: AUXADC input to read.
+ */
+int wm831x_auxadc_read_uv(struct wm831x *wm831x, enum wm831x_auxadc input)
+{
+	int ret;
+
+	ret = wm831x_auxadc_read(wm831x, input);
+	if (ret < 0)
+		return ret;
+
+	ret *= 1465;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(wm831x_auxadc_read_uv);
+
 static struct resource wm831x_dcdc1_resources[] = {
 	{
 		.start = WM831X_DC1_CONTROL_1,
@@ -1084,6 +1184,7 @@ static int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 
 	mutex_init(&wm831x->io_lock);
 	mutex_init(&wm831x->key_lock);
+	mutex_init(&wm831x->auxadc_lock);
 	dev_set_drvdata(wm831x->dev, wm831x);
 
 	ret = wm831x_reg_read(wm831x, WM831X_PARENT_ID);
diff --git a/include/linux/mfd/wm831x/auxadc.h b/include/linux/mfd/wm831x/auxadc.h
new file mode 100644
index 000000000000..b132067e9e99
--- /dev/null
+++ b/include/linux/mfd/wm831x/auxadc.h
@@ -0,0 +1,216 @@
+/*
+ * include/linux/mfd/wm831x/auxadc.h -- Auxiliary ADC interface for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_AUXADC_H__
+#define __MFD_WM831X_AUXADC_H__
+
+/*
+ * R16429 (0x402D) - AuxADC Data
+ */
+#define WM831X_AUX_DATA_SRC_MASK                0xF000  /* AUX_DATA_SRC - [15:12] */
+#define WM831X_AUX_DATA_SRC_SHIFT                   12  /* AUX_DATA_SRC - [15:12] */
+#define WM831X_AUX_DATA_SRC_WIDTH                    4  /* AUX_DATA_SRC - [15:12] */
+#define WM831X_AUX_DATA_MASK                    0x0FFF  /* AUX_DATA - [11:0] */
+#define WM831X_AUX_DATA_SHIFT                        0  /* AUX_DATA - [11:0] */
+#define WM831X_AUX_DATA_WIDTH                       12  /* AUX_DATA - [11:0] */
+
+/*
+ * R16430 (0x402E) - AuxADC Control
+ */
+#define WM831X_AUX_ENA                          0x8000  /* AUX_ENA */
+#define WM831X_AUX_ENA_MASK                     0x8000  /* AUX_ENA */
+#define WM831X_AUX_ENA_SHIFT                        15  /* AUX_ENA */
+#define WM831X_AUX_ENA_WIDTH                         1  /* AUX_ENA */
+#define WM831X_AUX_CVT_ENA                      0x4000  /* AUX_CVT_ENA */
+#define WM831X_AUX_CVT_ENA_MASK                 0x4000  /* AUX_CVT_ENA */
+#define WM831X_AUX_CVT_ENA_SHIFT                    14  /* AUX_CVT_ENA */
+#define WM831X_AUX_CVT_ENA_WIDTH                     1  /* AUX_CVT_ENA */
+#define WM831X_AUX_SLPENA                       0x1000  /* AUX_SLPENA */
+#define WM831X_AUX_SLPENA_MASK                  0x1000  /* AUX_SLPENA */
+#define WM831X_AUX_SLPENA_SHIFT                     12  /* AUX_SLPENA */
+#define WM831X_AUX_SLPENA_WIDTH                      1  /* AUX_SLPENA */
+#define WM831X_AUX_FRC_ENA                      0x0800  /* AUX_FRC_ENA */
+#define WM831X_AUX_FRC_ENA_MASK                 0x0800  /* AUX_FRC_ENA */
+#define WM831X_AUX_FRC_ENA_SHIFT                    11  /* AUX_FRC_ENA */
+#define WM831X_AUX_FRC_ENA_WIDTH                     1  /* AUX_FRC_ENA */
+#define WM831X_AUX_RATE_MASK                    0x003F  /* AUX_RATE - [5:0] */
+#define WM831X_AUX_RATE_SHIFT                        0  /* AUX_RATE - [5:0] */
+#define WM831X_AUX_RATE_WIDTH                        6  /* AUX_RATE - [5:0] */
+
+/*
+ * R16431 (0x402F) - AuxADC Source
+ */
+#define WM831X_AUX_CAL_SEL                      0x8000  /* AUX_CAL_SEL */
+#define WM831X_AUX_CAL_SEL_MASK                 0x8000  /* AUX_CAL_SEL */
+#define WM831X_AUX_CAL_SEL_SHIFT                    15  /* AUX_CAL_SEL */
+#define WM831X_AUX_CAL_SEL_WIDTH                     1  /* AUX_CAL_SEL */
+#define WM831X_AUX_BKUP_BATT_SEL                0x0400  /* AUX_BKUP_BATT_SEL */
+#define WM831X_AUX_BKUP_BATT_SEL_MASK           0x0400  /* AUX_BKUP_BATT_SEL */
+#define WM831X_AUX_BKUP_BATT_SEL_SHIFT              10  /* AUX_BKUP_BATT_SEL */
+#define WM831X_AUX_BKUP_BATT_SEL_WIDTH               1  /* AUX_BKUP_BATT_SEL */
+#define WM831X_AUX_WALL_SEL                     0x0200  /* AUX_WALL_SEL */
+#define WM831X_AUX_WALL_SEL_MASK                0x0200  /* AUX_WALL_SEL */
+#define WM831X_AUX_WALL_SEL_SHIFT                    9  /* AUX_WALL_SEL */
+#define WM831X_AUX_WALL_SEL_WIDTH                    1  /* AUX_WALL_SEL */
+#define WM831X_AUX_BATT_SEL                     0x0100  /* AUX_BATT_SEL */
+#define WM831X_AUX_BATT_SEL_MASK                0x0100  /* AUX_BATT_SEL */
+#define WM831X_AUX_BATT_SEL_SHIFT                    8  /* AUX_BATT_SEL */
+#define WM831X_AUX_BATT_SEL_WIDTH                    1  /* AUX_BATT_SEL */
+#define WM831X_AUX_USB_SEL                      0x0080  /* AUX_USB_SEL */
+#define WM831X_AUX_USB_SEL_MASK                 0x0080  /* AUX_USB_SEL */
+#define WM831X_AUX_USB_SEL_SHIFT                     7  /* AUX_USB_SEL */
+#define WM831X_AUX_USB_SEL_WIDTH                     1  /* AUX_USB_SEL */
+#define WM831X_AUX_SYSVDD_SEL                   0x0040  /* AUX_SYSVDD_SEL */
+#define WM831X_AUX_SYSVDD_SEL_MASK              0x0040  /* AUX_SYSVDD_SEL */
+#define WM831X_AUX_SYSVDD_SEL_SHIFT                  6  /* AUX_SYSVDD_SEL */
+#define WM831X_AUX_SYSVDD_SEL_WIDTH                  1  /* AUX_SYSVDD_SEL */
+#define WM831X_AUX_BATT_TEMP_SEL                0x0020  /* AUX_BATT_TEMP_SEL */
+#define WM831X_AUX_BATT_TEMP_SEL_MASK           0x0020  /* AUX_BATT_TEMP_SEL */
+#define WM831X_AUX_BATT_TEMP_SEL_SHIFT               5  /* AUX_BATT_TEMP_SEL */
+#define WM831X_AUX_BATT_TEMP_SEL_WIDTH               1  /* AUX_BATT_TEMP_SEL */
+#define WM831X_AUX_CHIP_TEMP_SEL                0x0010  /* AUX_CHIP_TEMP_SEL */
+#define WM831X_AUX_CHIP_TEMP_SEL_MASK           0x0010  /* AUX_CHIP_TEMP_SEL */
+#define WM831X_AUX_CHIP_TEMP_SEL_SHIFT               4  /* AUX_CHIP_TEMP_SEL */
+#define WM831X_AUX_CHIP_TEMP_SEL_WIDTH               1  /* AUX_CHIP_TEMP_SEL */
+#define WM831X_AUX_AUX4_SEL                     0x0008  /* AUX_AUX4_SEL */
+#define WM831X_AUX_AUX4_SEL_MASK                0x0008  /* AUX_AUX4_SEL */
+#define WM831X_AUX_AUX4_SEL_SHIFT                    3  /* AUX_AUX4_SEL */
+#define WM831X_AUX_AUX4_SEL_WIDTH                    1  /* AUX_AUX4_SEL */
+#define WM831X_AUX_AUX3_SEL                     0x0004  /* AUX_AUX3_SEL */
+#define WM831X_AUX_AUX3_SEL_MASK                0x0004  /* AUX_AUX3_SEL */
+#define WM831X_AUX_AUX3_SEL_SHIFT                    2  /* AUX_AUX3_SEL */
+#define WM831X_AUX_AUX3_SEL_WIDTH                    1  /* AUX_AUX3_SEL */
+#define WM831X_AUX_AUX2_SEL                     0x0002  /* AUX_AUX2_SEL */
+#define WM831X_AUX_AUX2_SEL_MASK                0x0002  /* AUX_AUX2_SEL */
+#define WM831X_AUX_AUX2_SEL_SHIFT                    1  /* AUX_AUX2_SEL */
+#define WM831X_AUX_AUX2_SEL_WIDTH                    1  /* AUX_AUX2_SEL */
+#define WM831X_AUX_AUX1_SEL                     0x0001  /* AUX_AUX1_SEL */
+#define WM831X_AUX_AUX1_SEL_MASK                0x0001  /* AUX_AUX1_SEL */
+#define WM831X_AUX_AUX1_SEL_SHIFT                    0  /* AUX_AUX1_SEL */
+#define WM831X_AUX_AUX1_SEL_WIDTH                    1  /* AUX_AUX1_SEL */
+
+/*
+ * R16432 (0x4030) - Comparator Control
+ */
+#define WM831X_DCOMP4_STS                       0x0800  /* DCOMP4_STS */
+#define WM831X_DCOMP4_STS_MASK                  0x0800  /* DCOMP4_STS */
+#define WM831X_DCOMP4_STS_SHIFT                     11  /* DCOMP4_STS */
+#define WM831X_DCOMP4_STS_WIDTH                      1  /* DCOMP4_STS */
+#define WM831X_DCOMP3_STS                       0x0400  /* DCOMP3_STS */
+#define WM831X_DCOMP3_STS_MASK                  0x0400  /* DCOMP3_STS */
+#define WM831X_DCOMP3_STS_SHIFT                     10  /* DCOMP3_STS */
+#define WM831X_DCOMP3_STS_WIDTH                      1  /* DCOMP3_STS */
+#define WM831X_DCOMP2_STS                       0x0200  /* DCOMP2_STS */
+#define WM831X_DCOMP2_STS_MASK                  0x0200  /* DCOMP2_STS */
+#define WM831X_DCOMP2_STS_SHIFT                      9  /* DCOMP2_STS */
+#define WM831X_DCOMP2_STS_WIDTH                      1  /* DCOMP2_STS */
+#define WM831X_DCOMP1_STS                       0x0100  /* DCOMP1_STS */
+#define WM831X_DCOMP1_STS_MASK                  0x0100  /* DCOMP1_STS */
+#define WM831X_DCOMP1_STS_SHIFT                      8  /* DCOMP1_STS */
+#define WM831X_DCOMP1_STS_WIDTH                      1  /* DCOMP1_STS */
+#define WM831X_DCMP4_ENA                        0x0008  /* DCMP4_ENA */
+#define WM831X_DCMP4_ENA_MASK                   0x0008  /* DCMP4_ENA */
+#define WM831X_DCMP4_ENA_SHIFT                       3  /* DCMP4_ENA */
+#define WM831X_DCMP4_ENA_WIDTH                       1  /* DCMP4_ENA */
+#define WM831X_DCMP3_ENA                        0x0004  /* DCMP3_ENA */
+#define WM831X_DCMP3_ENA_MASK                   0x0004  /* DCMP3_ENA */
+#define WM831X_DCMP3_ENA_SHIFT                       2  /* DCMP3_ENA */
+#define WM831X_DCMP3_ENA_WIDTH                       1  /* DCMP3_ENA */
+#define WM831X_DCMP2_ENA                        0x0002  /* DCMP2_ENA */
+#define WM831X_DCMP2_ENA_MASK                   0x0002  /* DCMP2_ENA */
+#define WM831X_DCMP2_ENA_SHIFT                       1  /* DCMP2_ENA */
+#define WM831X_DCMP2_ENA_WIDTH                       1  /* DCMP2_ENA */
+#define WM831X_DCMP1_ENA                        0x0001  /* DCMP1_ENA */
+#define WM831X_DCMP1_ENA_MASK                   0x0001  /* DCMP1_ENA */
+#define WM831X_DCMP1_ENA_SHIFT                       0  /* DCMP1_ENA */
+#define WM831X_DCMP1_ENA_WIDTH                       1  /* DCMP1_ENA */
+
+/*
+ * R16433 (0x4031) - Comparator 1
+ */
+#define WM831X_DCMP1_SRC_MASK                   0xE000  /* DCMP1_SRC - [15:13] */
+#define WM831X_DCMP1_SRC_SHIFT                      13  /* DCMP1_SRC - [15:13] */
+#define WM831X_DCMP1_SRC_WIDTH                       3  /* DCMP1_SRC - [15:13] */
+#define WM831X_DCMP1_GT                         0x1000  /* DCMP1_GT */
+#define WM831X_DCMP1_GT_MASK                    0x1000  /* DCMP1_GT */
+#define WM831X_DCMP1_GT_SHIFT                       12  /* DCMP1_GT */
+#define WM831X_DCMP1_GT_WIDTH                        1  /* DCMP1_GT */
+#define WM831X_DCMP1_THR_MASK                   0x0FFF  /* DCMP1_THR - [11:0] */
+#define WM831X_DCMP1_THR_SHIFT                       0  /* DCMP1_THR - [11:0] */
+#define WM831X_DCMP1_THR_WIDTH                      12  /* DCMP1_THR - [11:0] */
+
+/*
+ * R16434 (0x4032) - Comparator 2
+ */
+#define WM831X_DCMP2_SRC_MASK                   0xE000  /* DCMP2_SRC - [15:13] */
+#define WM831X_DCMP2_SRC_SHIFT                      13  /* DCMP2_SRC - [15:13] */
+#define WM831X_DCMP2_SRC_WIDTH                       3  /* DCMP2_SRC - [15:13] */
+#define WM831X_DCMP2_GT                         0x1000  /* DCMP2_GT */
+#define WM831X_DCMP2_GT_MASK                    0x1000  /* DCMP2_GT */
+#define WM831X_DCMP2_GT_SHIFT                       12  /* DCMP2_GT */
+#define WM831X_DCMP2_GT_WIDTH                        1  /* DCMP2_GT */
+#define WM831X_DCMP2_THR_MASK                   0x0FFF  /* DCMP2_THR - [11:0] */
+#define WM831X_DCMP2_THR_SHIFT                       0  /* DCMP2_THR - [11:0] */
+#define WM831X_DCMP2_THR_WIDTH                      12  /* DCMP2_THR - [11:0] */
+
+/*
+ * R16435 (0x4033) - Comparator 3
+ */
+#define WM831X_DCMP3_SRC_MASK                   0xE000  /* DCMP3_SRC - [15:13] */
+#define WM831X_DCMP3_SRC_SHIFT                      13  /* DCMP3_SRC - [15:13] */
+#define WM831X_DCMP3_SRC_WIDTH                       3  /* DCMP3_SRC - [15:13] */
+#define WM831X_DCMP3_GT                         0x1000  /* DCMP3_GT */
+#define WM831X_DCMP3_GT_MASK                    0x1000  /* DCMP3_GT */
+#define WM831X_DCMP3_GT_SHIFT                       12  /* DCMP3_GT */
+#define WM831X_DCMP3_GT_WIDTH                        1  /* DCMP3_GT */
+#define WM831X_DCMP3_THR_MASK                   0x0FFF  /* DCMP3_THR - [11:0] */
+#define WM831X_DCMP3_THR_SHIFT                       0  /* DCMP3_THR - [11:0] */
+#define WM831X_DCMP3_THR_WIDTH                      12  /* DCMP3_THR - [11:0] */
+
+/*
+ * R16436 (0x4034) - Comparator 4
+ */
+#define WM831X_DCMP4_SRC_MASK                   0xE000  /* DCMP4_SRC - [15:13] */
+#define WM831X_DCMP4_SRC_SHIFT                      13  /* DCMP4_SRC - [15:13] */
+#define WM831X_DCMP4_SRC_WIDTH                       3  /* DCMP4_SRC - [15:13] */
+#define WM831X_DCMP4_GT                         0x1000  /* DCMP4_GT */
+#define WM831X_DCMP4_GT_MASK                    0x1000  /* DCMP4_GT */
+#define WM831X_DCMP4_GT_SHIFT                       12  /* DCMP4_GT */
+#define WM831X_DCMP4_GT_WIDTH                        1  /* DCMP4_GT */
+#define WM831X_DCMP4_THR_MASK                   0x0FFF  /* DCMP4_THR - [11:0] */
+#define WM831X_DCMP4_THR_SHIFT                       0  /* DCMP4_THR - [11:0] */
+#define WM831X_DCMP4_THR_WIDTH                      12  /* DCMP4_THR - [11:0] */
+
+#define WM831X_AUX_CAL_FACTOR  0xfff
+#define WM831X_AUX_CAL_NOMINAL 0x222
+
+enum wm831x_auxadc {
+	WM831X_AUX_CAL = 15,
+	WM831X_AUX_BKUP_BATT = 10,
+	WM831X_AUX_WALL = 9,
+	WM831X_AUX_BATT = 8,
+	WM831X_AUX_USB = 7,
+	WM831X_AUX_SYSVDD = 6,
+	WM831X_AUX_BATT_TEMP = 5,
+	WM831X_AUX_CHIP_TEMP = 4,
+	WM831X_AUX_AUX4 = 3,
+	WM831X_AUX_AUX3 = 2,
+	WM831X_AUX_AUX2 = 1,
+	WM831X_AUX_AUX1 = 0,
+};
+
+int wm831x_auxadc_read(struct wm831x *wm831x, enum wm831x_auxadc input);
+int wm831x_auxadc_read_uv(struct wm831x *wm831x, enum wm831x_auxadc input);
+
+#endif
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
index b96c9355b16e..d7134dfba56e 100644
--- a/include/linux/mfd/wm831x/core.h
+++ b/include/linux/mfd/wm831x/core.h
@@ -234,6 +234,8 @@ struct wm831x {
 	unsigned int irq_base;
 	int irq_masks[5];
 
+	struct mutex auxadc_lock;
+
 	/* The WM831x has a security key blocking access to certain
 	 * registers.  The mutex is taken by the accessors for locking
 	 * and unlocking the security key, locked is used to fail
-- 
cgit v1.2.3-71-gd317


From 63aed85e3535b4603798184cc941e49de386d354 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:45:55 +0100
Subject: mfd: Conditionally add WM831x backlight subdevice

The WM831x backlight driver requires at least the specification of the
current sink to use and a maximum current to allow them to function and
will actively interfere with other users of the regulators it uses if
misconfigured so only register the subdevice for it if this platform
data has been supplied.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c        | 15 +++++++++++++++
 include/linux/mfd/wm831x/pdata.h |  6 ++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index 42bef1dd2ca1..bc40ea315cc0 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -1172,6 +1172,12 @@ static struct mfd_cell wm8312_devs[] = {
 	},
 };
 
+static struct mfd_cell backlight_devs[] = {
+	{
+		.name = "wm831x-backlight",
+	},
+};
+
 /*
  * Instantiate the generic non-control parts of the device.
  */
@@ -1325,6 +1331,15 @@ static int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 		goto err_irq;
 	}
 
+	if (pdata && pdata->backlight) {
+		/* Treat errors as non-critical */
+		ret = mfd_add_devices(wm831x->dev, -1, backlight_devs,
+				      ARRAY_SIZE(backlight_devs), NULL, 0);
+		if (ret < 0)
+			dev_err(wm831x->dev, "Failed to add backlight: %d\n",
+				ret);
+	}
+
 	if (pdata && pdata->post_init) {
 		ret = pdata->post_init(wm831x);
 		if (ret != 0) {
diff --git a/include/linux/mfd/wm831x/pdata.h b/include/linux/mfd/wm831x/pdata.h
index 571e60136264..90d820260aad 100644
--- a/include/linux/mfd/wm831x/pdata.h
+++ b/include/linux/mfd/wm831x/pdata.h
@@ -18,6 +18,11 @@
 struct wm831x;
 struct regulator_init_data;
 
+struct wm831x_backlight_pdata {
+	int isink;     /** ISINK to use, 1 or 2 */
+	int max_uA;    /** Maximum current to allow */
+};
+
 struct wm831x_backup_pdata {
 	int charger_enable;
 	int no_constant_voltage;  /** Disable constant voltage charging */
@@ -87,6 +92,7 @@ struct wm831x_pdata {
 	int (*post_init)(struct wm831x *wm831x);
 
 	int gpio_base;
+	struct wm831x_backlight_pdata *backlight;
 	struct wm831x_backup_pdata *backup;
 	struct wm831x_battery_pdata *battery;
 	struct wm831x_touch_pdata *touch;
-- 
cgit v1.2.3-71-gd317


From 6704e5171ba9053ba173bcd807c7392d2076bdb4 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:45:56 +0100
Subject: mfd: Add basic WM831x OTP support

The WM831x series of devices use OTP (One Time Programmable, a type
of PROM) to store system configuration. At run time this data is
visible via registers.

Currently the only explicitly supported feature is that the unique
ID provided by every WM831x device is exported to user space via
sysfs. Other configuration data may be read by system-specific
code in the pre_init() and post_init() platform data operations.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c      |   4 +
 drivers/mfd/wm831x-otp.c       |  83 +++++++++++++++++++++
 include/linux/mfd/wm831x/otp.h | 162 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 drivers/mfd/wm831x-otp.c
 create mode 100644 include/linux/mfd/wm831x/otp.h

(limited to 'include/linux')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index bc40ea315cc0..33eaea2f988e 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -23,6 +23,7 @@
 #include <linux/mfd/wm831x/pdata.h>
 #include <linux/mfd/wm831x/irq.h>
 #include <linux/mfd/wm831x/auxadc.h>
+#include <linux/mfd/wm831x/otp.h>
 
 enum wm831x_parent {
 	WM8310 = 0,
@@ -1340,6 +1341,8 @@ static int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 				ret);
 	}
 
+	wm831x_otp_init(wm831x);
+
 	if (pdata && pdata->post_init) {
 		ret = pdata->post_init(wm831x);
 		if (ret != 0) {
@@ -1360,6 +1363,7 @@ err:
 
 static void wm831x_device_exit(struct wm831x *wm831x)
 {
+	wm831x_otp_exit(wm831x);
 	mfd_remove_devices(wm831x->dev);
 	wm831x_irq_exit(wm831x);
 	kfree(wm831x);
diff --git a/drivers/mfd/wm831x-otp.c b/drivers/mfd/wm831x-otp.c
new file mode 100644
index 000000000000..f742745ff354
--- /dev/null
+++ b/drivers/mfd/wm831x-otp.c
@@ -0,0 +1,83 @@
+/*
+ * wm831x-otp.c  --  OTP for Wolfson WM831x PMICs
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/i2c.h>
+#include <linux/bcd.h>
+#include <linux/delay.h>
+#include <linux/mfd/core.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/otp.h>
+
+/* In bytes */
+#define WM831X_UNIQUE_ID_LEN 16
+
+/* Read the unique ID from the chip into id */
+static int wm831x_unique_id_read(struct wm831x *wm831x, char *id)
+{
+	int i, val;
+
+	for (i = 0; i < WM831X_UNIQUE_ID_LEN / 2; i++) {
+		val = wm831x_reg_read(wm831x, WM831X_UNIQUE_ID_1 + i);
+		if (val < 0)
+			return val;
+
+		id[i * 2]       = (val >> 8) & 0xff;
+		id[(i * 2) + 1] = val & 0xff;
+	}
+
+	return 0;
+}
+
+static ssize_t wm831x_unique_id_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct wm831x *wm831x = dev_get_drvdata(dev);
+	int i, rval;
+	char id[WM831X_UNIQUE_ID_LEN];
+	ssize_t ret = 0;
+
+	rval = wm831x_unique_id_read(wm831x, id);
+	if (rval < 0)
+		return 0;
+
+	for (i = 0; i < WM831X_UNIQUE_ID_LEN; i++)
+		ret += sprintf(&buf[ret], "%02x", buf[i]);
+
+	ret += sprintf(&buf[ret], "\n");
+
+	return ret;
+}
+
+static DEVICE_ATTR(unique_id, 0444, wm831x_unique_id_show, NULL);
+
+int wm831x_otp_init(struct wm831x *wm831x)
+{
+	int ret;
+
+	ret = device_create_file(wm831x->dev, &dev_attr_unique_id);
+	if (ret != 0)
+		dev_err(wm831x->dev, "Unique ID attribute not created: %d\n",
+			ret);
+
+	return ret;
+}
+
+void wm831x_otp_exit(struct wm831x *wm831x)
+{
+	device_remove_file(wm831x->dev, &dev_attr_unique_id);
+}
+
diff --git a/include/linux/mfd/wm831x/otp.h b/include/linux/mfd/wm831x/otp.h
new file mode 100644
index 000000000000..ce1f81a39bfc
--- /dev/null
+++ b/include/linux/mfd/wm831x/otp.h
@@ -0,0 +1,162 @@
+/*
+ * include/linux/mfd/wm831x/otp.h -- OTP interface for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_OTP_H__
+#define __MFD_WM831X_OTP_H__
+
+int wm831x_otp_init(struct wm831x *wm831x);
+void wm831x_otp_exit(struct wm831x *wm831x);
+
+/*
+ * R30720 (0x7800) - Unique ID 1
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30721 (0x7801) - Unique ID 2
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30722 (0x7802) - Unique ID 3
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30723 (0x7803) - Unique ID 4
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30724 (0x7804) - Unique ID 5
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30725 (0x7805) - Unique ID 6
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30726 (0x7806) - Unique ID 7
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30727 (0x7807) - Unique ID 8
+ */
+#define WM831X_UNIQUE_ID_MASK                   0xFFFF  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_SHIFT                       0  /* UNIQUE_ID - [15:0] */
+#define WM831X_UNIQUE_ID_WIDTH                      16  /* UNIQUE_ID - [15:0] */
+
+/*
+ * R30728 (0x7808) - Factory OTP ID
+ */
+#define WM831X_OTP_FACT_ID_MASK                 0xFFFE  /* OTP_FACT_ID - [15:1] */
+#define WM831X_OTP_FACT_ID_SHIFT                     1  /* OTP_FACT_ID - [15:1] */
+#define WM831X_OTP_FACT_ID_WIDTH                    15  /* OTP_FACT_ID - [15:1] */
+#define WM831X_OTP_FACT_FINAL                   0x0001  /* OTP_FACT_FINAL */
+#define WM831X_OTP_FACT_FINAL_MASK              0x0001  /* OTP_FACT_FINAL */
+#define WM831X_OTP_FACT_FINAL_SHIFT                  0  /* OTP_FACT_FINAL */
+#define WM831X_OTP_FACT_FINAL_WIDTH                  1  /* OTP_FACT_FINAL */
+
+/*
+ * R30729 (0x7809) - Factory OTP 1
+ */
+#define WM831X_DC3_TRIM_MASK                    0xF000  /* DC3_TRIM - [15:12] */
+#define WM831X_DC3_TRIM_SHIFT                       12  /* DC3_TRIM - [15:12] */
+#define WM831X_DC3_TRIM_WIDTH                        4  /* DC3_TRIM - [15:12] */
+#define WM831X_DC2_TRIM_MASK                    0x0FC0  /* DC2_TRIM - [11:6] */
+#define WM831X_DC2_TRIM_SHIFT                        6  /* DC2_TRIM - [11:6] */
+#define WM831X_DC2_TRIM_WIDTH                        6  /* DC2_TRIM - [11:6] */
+#define WM831X_DC1_TRIM_MASK                    0x003F  /* DC1_TRIM - [5:0] */
+#define WM831X_DC1_TRIM_SHIFT                        0  /* DC1_TRIM - [5:0] */
+#define WM831X_DC1_TRIM_WIDTH                        6  /* DC1_TRIM - [5:0] */
+
+/*
+ * R30730 (0x780A) - Factory OTP 2
+ */
+#define WM831X_CHIP_ID_MASK                     0xFFFF  /* CHIP_ID - [15:0] */
+#define WM831X_CHIP_ID_SHIFT                         0  /* CHIP_ID - [15:0] */
+#define WM831X_CHIP_ID_WIDTH                        16  /* CHIP_ID - [15:0] */
+
+/*
+ * R30731 (0x780B) - Factory OTP 3
+ */
+#define WM831X_OSC_TRIM_MASK                    0x0780  /* OSC_TRIM - [10:7] */
+#define WM831X_OSC_TRIM_SHIFT                        7  /* OSC_TRIM - [10:7] */
+#define WM831X_OSC_TRIM_WIDTH                        4  /* OSC_TRIM - [10:7] */
+#define WM831X_BG_TRIM_MASK                     0x0078  /* BG_TRIM - [6:3] */
+#define WM831X_BG_TRIM_SHIFT                         3  /* BG_TRIM - [6:3] */
+#define WM831X_BG_TRIM_WIDTH                         4  /* BG_TRIM - [6:3] */
+#define WM831X_LPBG_TRIM_MASK                   0x0007  /* LPBG_TRIM - [2:0] */
+#define WM831X_LPBG_TRIM_SHIFT                       0  /* LPBG_TRIM - [2:0] */
+#define WM831X_LPBG_TRIM_WIDTH                       3  /* LPBG_TRIM - [2:0] */
+
+/*
+ * R30732 (0x780C) - Factory OTP 4
+ */
+#define WM831X_CHILD_I2C_ADDR_MASK              0x00FE  /* CHILD_I2C_ADDR - [7:1] */
+#define WM831X_CHILD_I2C_ADDR_SHIFT                  1  /* CHILD_I2C_ADDR - [7:1] */
+#define WM831X_CHILD_I2C_ADDR_WIDTH                  7  /* CHILD_I2C_ADDR - [7:1] */
+#define WM831X_CH_AW                            0x0001  /* CH_AW */
+#define WM831X_CH_AW_MASK                       0x0001  /* CH_AW */
+#define WM831X_CH_AW_SHIFT                           0  /* CH_AW */
+#define WM831X_CH_AW_WIDTH                           1  /* CH_AW */
+
+/*
+ * R30733 (0x780D) - Factory OTP 5
+ */
+#define WM831X_CHARGE_TRIM_MASK                 0x003F  /* CHARGE_TRIM - [5:0] */
+#define WM831X_CHARGE_TRIM_SHIFT                     0  /* CHARGE_TRIM - [5:0] */
+#define WM831X_CHARGE_TRIM_WIDTH                     6  /* CHARGE_TRIM - [5:0] */
+
+/*
+ * R30736 (0x7810) - Customer OTP ID
+ */
+#define WM831X_OTP_AUTO_PROG                    0x8000  /* OTP_AUTO_PROG */
+#define WM831X_OTP_AUTO_PROG_MASK               0x8000  /* OTP_AUTO_PROG */
+#define WM831X_OTP_AUTO_PROG_SHIFT                  15  /* OTP_AUTO_PROG */
+#define WM831X_OTP_AUTO_PROG_WIDTH                   1  /* OTP_AUTO_PROG */
+#define WM831X_OTP_CUST_ID_MASK                 0x7FFE  /* OTP_CUST_ID - [14:1] */
+#define WM831X_OTP_CUST_ID_SHIFT                     1  /* OTP_CUST_ID - [14:1] */
+#define WM831X_OTP_CUST_ID_WIDTH                    14  /* OTP_CUST_ID - [14:1] */
+#define WM831X_OTP_CUST_FINAL                   0x0001  /* OTP_CUST_FINAL */
+#define WM831X_OTP_CUST_FINAL_MASK              0x0001  /* OTP_CUST_FINAL */
+#define WM831X_OTP_CUST_FINAL_SHIFT                  0  /* OTP_CUST_FINAL */
+#define WM831X_OTP_CUST_FINAL_WIDTH                  1  /* OTP_CUST_FINAL */
+
+/*
+ * R30759 (0x7827) - DBE CHECK DATA
+ */
+#define WM831X_DBE_VALID_DATA_MASK              0xFFFF  /* DBE_VALID_DATA - [15:0] */
+#define WM831X_DBE_VALID_DATA_SHIFT                  0  /* DBE_VALID_DATA - [15:0] */
+#define WM831X_DBE_VALID_DATA_WIDTH                 16  /* DBE_VALID_DATA - [15:0] */
+
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 698659d5f78606c698781574773f433c60176e40 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:45:57 +0100
Subject: mfd: Export ISEL values from WM831x core

The current settings which can be used with the WM831x current sinks
can't easily be mapped between register values and currents at run
time without a lookup table since the values scale logarithmically
to match the way the human eye interprets brightness. This lookup
table is inclided in the core since several drivers need to use it.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c            | 64 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/regulator.h | 21 ++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 include/linux/mfd/wm831x/regulator.h

(limited to 'include/linux')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index 33eaea2f988e..49b7885c2702 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -24,6 +24,70 @@
 #include <linux/mfd/wm831x/irq.h>
 #include <linux/mfd/wm831x/auxadc.h>
 #include <linux/mfd/wm831x/otp.h>
+#include <linux/mfd/wm831x/regulator.h>
+
+/* Current settings - values are 2*2^(reg_val/4) microamps.  These are
+ * exported since they are used by multiple drivers.
+ */
+int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL] = {
+	2,
+	2,
+	3,
+	3,
+	4,
+	5,
+	6,
+	7,
+	8,
+	10,
+	11,
+	13,
+	16,
+	19,
+	23,
+	27,
+	32,
+	38,
+	45,
+	54,
+	64,
+	76,
+	91,
+	108,
+	128,
+	152,
+	181,
+	215,
+	256,
+	304,
+	362,
+	431,
+	512,
+	609,
+	724,
+	861,
+	1024,
+	1218,
+	1448,
+	1722,
+	2048,
+	2435,
+	2896,
+	3444,
+	4096,
+	4871,
+	5793,
+	6889,
+	8192,
+	9742,
+	11585,
+	13777,
+	16384,
+	19484,
+	23170,
+	27554,
+};
+EXPORT_SYMBOL_GPL(wm831x_isinkv_values);
 
 enum wm831x_parent {
 	WM8310 = 0,
diff --git a/include/linux/mfd/wm831x/regulator.h b/include/linux/mfd/wm831x/regulator.h
new file mode 100644
index 000000000000..b5d58fb38b4e
--- /dev/null
+++ b/include/linux/mfd/wm831x/regulator.h
@@ -0,0 +1,21 @@
+/*
+ * linux/mfd/wm831x/regulator.h -- Regulator definitons for wm831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_REGULATOR_H__
+#define __MFD_WM831X_REGULATOR_H__
+
+#define WM831X_ISINK_MAX_ISEL 56
+extern int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL];
+
+#endif
-- 
cgit v1.2.3-71-gd317


From e4b736f18f338daae141325c818187c4ab3e244c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:46:00 +0100
Subject: gpio: Add WM831X GPIO driver

Add support for the GPIO pins on the WM831x. No direct support is
currently supplied for configuring non-gpiolib functionality such
as pull configuration and alternate functions, soft configuration
of these will be provided in a future patch.

Currently use of these pins as interrupts is not supported due to
the ongoing issues with generic irq not support interrupt controllers
on interrupt driven buses. Users can directly request the interrupts
with the wm831x-specific APIs currently provided if required.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/gpio/Kconfig            |   7 ++
 drivers/gpio/Makefile           |   1 +
 drivers/gpio/wm831x-gpio.c      | 252 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/gpio.h |  55 +++++++++
 4 files changed, 315 insertions(+)
 create mode 100644 drivers/gpio/wm831x-gpio.c
 create mode 100644 include/linux/mfd/wm831x/gpio.h

(limited to 'include/linux')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 96dda81c9228..6b4c484a699a 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -155,6 +155,13 @@ config GPIO_TWL4030
 	  Say yes here to access the GPIO signals of various multi-function
 	  power management chips from Texas Instruments.
 
+config GPIO_WM831X
+	tristate "WM831x GPIOs"
+	depends on MFD_WM831X
+	help
+	  Say yes here to access the GPIO signals of WM831x power management
+	  chips from Wolfson Microelectronics.
+
 comment "PCI GPIO expanders:"
 
 config GPIO_BT8XX
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 9244c6fcd8be..ea7c745f26a8 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_GPIO_TWL4030)	+= twl4030-gpio.o
 obj-$(CONFIG_GPIO_XILINX)	+= xilinx_gpio.o
 obj-$(CONFIG_GPIO_BT8XX)	+= bt8xxgpio.o
 obj-$(CONFIG_GPIO_VR41XX)	+= vr41xx_giu.o
+obj-$(CONFIG_GPIO_WM831X)	+= wm831x-gpio.o
diff --git a/drivers/gpio/wm831x-gpio.c b/drivers/gpio/wm831x-gpio.c
new file mode 100644
index 000000000000..f9c09a54ec7f
--- /dev/null
+++ b/drivers/gpio/wm831x-gpio.c
@@ -0,0 +1,252 @@
+/*
+ * wm831x-gpio.c  --  gpiolib support for Wolfson WM831x PMICs
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/gpio.h>
+#include <linux/mfd/core.h>
+#include <linux/platform_device.h>
+#include <linux/seq_file.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/pdata.h>
+#include <linux/mfd/wm831x/gpio.h>
+
+#define WM831X_GPIO_MAX 16
+
+struct wm831x_gpio {
+	struct wm831x *wm831x;
+	struct gpio_chip gpio_chip;
+};
+
+static inline struct wm831x_gpio *to_wm831x_gpio(struct gpio_chip *chip)
+{
+	return container_of(chip, struct wm831x_gpio, gpio_chip);
+}
+
+static int wm831x_gpio_direction_in(struct gpio_chip *chip, unsigned offset)
+{
+	struct wm831x_gpio *wm831x_gpio = to_wm831x_gpio(chip);
+	struct wm831x *wm831x = wm831x_gpio->wm831x;
+
+	return wm831x_set_bits(wm831x, WM831X_GPIO1_CONTROL + offset,
+			       WM831X_GPN_DIR | WM831X_GPN_TRI,
+			       WM831X_GPN_DIR);
+}
+
+static int wm831x_gpio_get(struct gpio_chip *chip, unsigned offset)
+{
+	struct wm831x_gpio *wm831x_gpio = to_wm831x_gpio(chip);
+	struct wm831x *wm831x = wm831x_gpio->wm831x;
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, WM831X_GPIO_LEVEL);
+	if (ret < 0)
+		return ret;
+
+	if (ret & 1 << offset)
+		return 1;
+	else
+		return 0;
+}
+
+static int wm831x_gpio_direction_out(struct gpio_chip *chip,
+				     unsigned offset, int value)
+{
+	struct wm831x_gpio *wm831x_gpio = to_wm831x_gpio(chip);
+	struct wm831x *wm831x = wm831x_gpio->wm831x;
+
+	return wm831x_set_bits(wm831x, WM831X_GPIO1_CONTROL + offset,
+			       WM831X_GPN_DIR | WM831X_GPN_TRI, 0);
+}
+
+static void wm831x_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct wm831x_gpio *wm831x_gpio = to_wm831x_gpio(chip);
+	struct wm831x *wm831x = wm831x_gpio->wm831x;
+
+	wm831x_set_bits(wm831x, WM831X_GPIO_LEVEL, 1 << offset,
+			value << offset);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void wm831x_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip)
+{
+	struct wm831x_gpio *wm831x_gpio = to_wm831x_gpio(chip);
+	struct wm831x *wm831x = wm831x_gpio->wm831x;
+	int i;
+
+	for (i = 0; i < chip->ngpio; i++) {
+		int gpio = i + chip->base;
+		int reg;
+		const char *label, *pull, *powerdomain;
+
+		/* We report the GPIO even if it's not requested since
+		 * we're also reporting things like alternate
+		 * functions which apply even when the GPIO is not in
+		 * use as a GPIO.
+		 */
+		label = gpiochip_is_requested(chip, i);
+		if (!label)
+			label = "Unrequested";
+
+		seq_printf(s, " gpio-%-3d (%-20.20s) ", gpio, label);
+
+		reg = wm831x_reg_read(wm831x, WM831X_GPIO1_CONTROL + i);
+		if (reg < 0) {
+			dev_err(wm831x->dev,
+				"GPIO control %d read failed: %d\n",
+				gpio, reg);
+			seq_printf(s, "\n");
+			continue;
+		}
+
+		switch (reg & WM831X_GPN_PULL_MASK) {
+		case WM831X_GPIO_PULL_NONE:
+			pull = "nopull";
+			break;
+		case WM831X_GPIO_PULL_DOWN:
+			pull = "pulldown";
+			break;
+		case WM831X_GPIO_PULL_UP:
+			pull = "pullup";
+		default:
+			pull = "INVALID PULL";
+			break;
+		}
+
+		switch (i + 1) {
+		case 1 ... 3:
+		case 7 ... 9:
+			if (reg & WM831X_GPN_PWR_DOM)
+				powerdomain = "VPMIC";
+			else
+				powerdomain = "DBVDD";
+			break;
+
+		case 4 ... 6:
+		case 10 ... 12:
+			if (reg & WM831X_GPN_PWR_DOM)
+				powerdomain = "SYSVDD";
+			else
+				powerdomain = "DBVDD";
+			break;
+
+		case 13 ... 16:
+			powerdomain = "TPVDD";
+			break;
+
+		default:
+			BUG();
+			break;
+		}
+
+		seq_printf(s, " %s %s %s %s%s\n"
+			   "                                  %s%s (0x%4x)\n",
+			   reg & WM831X_GPN_DIR ? "in" : "out",
+			   wm831x_gpio_get(chip, i) ? "high" : "low",
+			   pull,
+			   powerdomain,
+			   reg & WM831X_GPN_POL ? " inverted" : "",
+			   reg & WM831X_GPN_OD ? "open-drain" : "CMOS",
+			   reg & WM831X_GPN_TRI ? " tristated" : "",
+			   reg);
+	}
+}
+#else
+#define wm831x_gpio_dbg_show NULL
+#endif
+
+static struct gpio_chip template_chip = {
+	.label			= "wm831x",
+	.owner			= THIS_MODULE,
+	.direction_input	= wm831x_gpio_direction_in,
+	.get			= wm831x_gpio_get,
+	.direction_output	= wm831x_gpio_direction_out,
+	.set			= wm831x_gpio_set,
+	.dbg_show		= wm831x_gpio_dbg_show,
+	.can_sleep		= 1,
+};
+
+static int __devinit wm831x_gpio_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
+	struct wm831x_gpio *wm831x_gpio;
+	int ret;
+
+	wm831x_gpio = kzalloc(sizeof(*wm831x_gpio), GFP_KERNEL);
+	if (wm831x_gpio == NULL)
+		return -ENOMEM;
+
+	wm831x_gpio->wm831x = wm831x;
+	wm831x_gpio->gpio_chip = template_chip;
+	wm831x_gpio->gpio_chip.ngpio = WM831X_GPIO_MAX;
+	wm831x_gpio->gpio_chip.dev = &pdev->dev;
+	if (pdata && pdata->gpio_base)
+		wm831x_gpio->gpio_chip.base = pdata->gpio_base;
+	else
+		wm831x_gpio->gpio_chip.base = -1;
+
+	ret = gpiochip_add(&wm831x_gpio->gpio_chip);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Could not register gpiochip, %d\n",
+			ret);
+		goto err;
+	}
+
+	platform_set_drvdata(pdev, wm831x_gpio);
+
+	return ret;
+
+err:
+	kfree(wm831x_gpio);
+	return ret;
+}
+
+static int __devexit wm831x_gpio_remove(struct platform_device *pdev)
+{
+	struct wm831x_gpio *wm831x_gpio = platform_get_drvdata(pdev);
+	int ret;
+
+	ret = gpiochip_remove(&wm831x_gpio->gpio_chip);
+	if (ret == 0)
+		kfree(wm831x_gpio);
+
+	return ret;
+}
+
+static struct platform_driver wm831x_gpio_driver = {
+	.driver.name	= "wm831x-gpio",
+	.driver.owner	= THIS_MODULE,
+	.probe		= wm831x_gpio_probe,
+	.remove		= __devexit_p(wm831x_gpio_remove),
+};
+
+static int __init wm831x_gpio_init(void)
+{
+	return platform_driver_register(&wm831x_gpio_driver);
+}
+subsys_initcall(wm831x_gpio_init);
+
+static void __exit wm831x_gpio_exit(void)
+{
+	platform_driver_unregister(&wm831x_gpio_driver);
+}
+module_exit(wm831x_gpio_exit);
+
+MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com>");
+MODULE_DESCRIPTION("GPIO interface for WM831x PMICs");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:wm831x-gpio");
diff --git a/include/linux/mfd/wm831x/gpio.h b/include/linux/mfd/wm831x/gpio.h
new file mode 100644
index 000000000000..2835614af0e3
--- /dev/null
+++ b/include/linux/mfd/wm831x/gpio.h
@@ -0,0 +1,55 @@
+/*
+ * include/linux/mfd/wm831x/gpio.h -- GPIO for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_GPIO_H__
+#define __MFD_WM831X_GPIO_H__
+
+/*
+ * R16440-16455 (0x4038-0x4047) - GPIOx Control
+ */
+#define WM831X_GPN_DIR                          0x8000  /* GPN_DIR */
+#define WM831X_GPN_DIR_MASK                     0x8000  /* GPN_DIR */
+#define WM831X_GPN_DIR_SHIFT                        15  /* GPN_DIR */
+#define WM831X_GPN_DIR_WIDTH                         1  /* GPN_DIR */
+#define WM831X_GPN_PULL_MASK                    0x6000  /* GPN_PULL - [14:13] */
+#define WM831X_GPN_PULL_SHIFT                       13  /* GPN_PULL - [14:13] */
+#define WM831X_GPN_PULL_WIDTH                        2  /* GPN_PULL - [14:13] */
+#define WM831X_GPN_INT_MODE                     0x1000  /* GPN_INT_MODE */
+#define WM831X_GPN_INT_MODE_MASK                0x1000  /* GPN_INT_MODE */
+#define WM831X_GPN_INT_MODE_SHIFT                   12  /* GPN_INT_MODE */
+#define WM831X_GPN_INT_MODE_WIDTH                    1  /* GPN_INT_MODE */
+#define WM831X_GPN_PWR_DOM                      0x0800  /* GPN_PWR_DOM */
+#define WM831X_GPN_PWR_DOM_MASK                 0x0800  /* GPN_PWR_DOM */
+#define WM831X_GPN_PWR_DOM_SHIFT                    11  /* GPN_PWR_DOM */
+#define WM831X_GPN_PWR_DOM_WIDTH                     1  /* GPN_PWR_DOM */
+#define WM831X_GPN_POL                          0x0400  /* GPN_POL */
+#define WM831X_GPN_POL_MASK                     0x0400  /* GPN_POL */
+#define WM831X_GPN_POL_SHIFT                        10  /* GPN_POL */
+#define WM831X_GPN_POL_WIDTH                         1  /* GPN_POL */
+#define WM831X_GPN_OD                           0x0200  /* GPN_OD */
+#define WM831X_GPN_OD_MASK                      0x0200  /* GPN_OD */
+#define WM831X_GPN_OD_SHIFT                          9  /* GPN_OD */
+#define WM831X_GPN_OD_WIDTH                          1  /* GPN_OD */
+#define WM831X_GPN_TRI                          0x0080  /* GPN_TRI */
+#define WM831X_GPN_TRI_MASK                     0x0080  /* GPN_TRI */
+#define WM831X_GPN_TRI_SHIFT                         7  /* GPN_TRI */
+#define WM831X_GPN_TRI_WIDTH                         1  /* GPN_TRI */
+#define WM831X_GPN_FN_MASK                      0x000F  /* GPN_FN - [3:0] */
+#define WM831X_GPN_FN_SHIFT                          0  /* GPN_FN - [3:0] */
+#define WM831X_GPN_FN_WIDTH                          4  /* GPN_FN - [3:0] */
+
+#define WM831X_GPIO_PULL_NONE (0 << WM831X_GPN_PULL_SHIFT)
+#define WM831X_GPIO_PULL_DOWN (1 << WM831X_GPN_PULL_SHIFT)
+#define WM831X_GPIO_PULL_UP   (2 << WM831X_GPN_PULL_SHIFT)
+#endif
-- 
cgit v1.2.3-71-gd317


From 0c73b992dd4c645f050344cb13142c0fd3496824 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 15 Sep 2009 12:07:12 +0200
Subject: input: Add support for the WM831x ON pin

The WM831x series of PMICs support control of initial power on
through the ON pin on the device with soft control of the pin
at other times. Represent this to userspace as KEY_POWER.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Dmitry Torokhov <dtor@mail.ru>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/input/misc/Kconfig      |  10 +++
 drivers/input/misc/Makefile     |   1 +
 drivers/input/misc/wm831x-on.c  | 163 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/core.h |  19 +++++
 4 files changed, 193 insertions(+)
 create mode 100644 drivers/input/misc/wm831x-on.c

(limited to 'include/linux')

diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index cbe21bc96b52..852941d108ff 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -279,4 +279,14 @@ config INPUT_BFIN_ROTARY
 	  To compile this driver as a module, choose M here: the
 	  module will be called bfin-rotary.
 
+config INPUT_WM831X_ON
+	tristate "WM831X ON pin"
+	depends on MFD_WM831X
+	help
+	  Support the ON pin of WM831X PMICs as an input device
+	  reporting power button status.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called wm831x_on.
+
 endif
diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile
index 79c1e9a5ea31..c97533fb83c8 100644
--- a/drivers/input/misc/Makefile
+++ b/drivers/input/misc/Makefile
@@ -26,4 +26,5 @@ obj-$(CONFIG_INPUT_SPARCSPKR)		+= sparcspkr.o
 obj-$(CONFIG_INPUT_TWL4030_PWRBUTTON)	+= twl4030-pwrbutton.o
 obj-$(CONFIG_INPUT_UINPUT)		+= uinput.o
 obj-$(CONFIG_INPUT_WISTRON_BTNS)	+= wistron_btns.o
+obj-$(CONFIG_INPUT_WM831X_ON)		+= wm831x-on.o
 obj-$(CONFIG_INPUT_YEALINK)		+= yealink.o
diff --git a/drivers/input/misc/wm831x-on.c b/drivers/input/misc/wm831x-on.c
new file mode 100644
index 000000000000..ba4f5dd7c60e
--- /dev/null
+++ b/drivers/input/misc/wm831x-on.c
@@ -0,0 +1,163 @@
+/**
+ * wm831x-on.c - WM831X ON pin driver
+ *
+ * Copyright (C) 2009 Wolfson Microelectronics plc
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file "COPYING" in the main directory of this
+ * archive for more details.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/workqueue.h>
+#include <linux/mfd/wm831x/core.h>
+
+struct wm831x_on {
+	struct input_dev *dev;
+	struct delayed_work work;
+	struct wm831x *wm831x;
+};
+
+/*
+ * The chip gives us an interrupt when the ON pin is asserted but we
+ * then need to poll to see when the pin is deasserted.
+ */
+static void wm831x_poll_on(struct work_struct *work)
+{
+	struct wm831x_on *wm831x_on = container_of(work, struct wm831x_on,
+						   work.work);
+	struct wm831x *wm831x = wm831x_on->wm831x;
+	int poll, ret;
+
+	ret = wm831x_reg_read(wm831x, WM831X_ON_PIN_CONTROL);
+	if (ret >= 0) {
+		poll = !(ret & WM831X_ON_PIN_STS);
+
+		input_report_key(wm831x_on->dev, KEY_POWER, poll);
+		input_sync(wm831x_on->dev);
+	} else {
+		dev_err(wm831x->dev, "Failed to read ON status: %d\n", ret);
+		poll = 1;
+	}
+
+	if (poll)
+		schedule_delayed_work(&wm831x_on->work, 100);
+}
+
+static irqreturn_t wm831x_on_irq(int irq, void *data)
+{
+	struct wm831x_on *wm831x_on = data;
+
+	schedule_delayed_work(&wm831x_on->work, 0);
+
+	return IRQ_HANDLED;
+}
+
+static int __devinit wm831x_on_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_on *wm831x_on;
+	int irq = platform_get_irq(pdev, 0);
+	int ret;
+
+	wm831x_on = kzalloc(sizeof(struct wm831x_on), GFP_KERNEL);
+	if (!wm831x_on) {
+		dev_err(&pdev->dev, "Can't allocate data\n");
+		return -ENOMEM;
+	}
+
+	wm831x_on->wm831x = wm831x;
+	INIT_DELAYED_WORK(&wm831x_on->work, wm831x_poll_on);
+
+	wm831x_on->dev = input_allocate_device();
+	if (!wm831x_on->dev) {
+		dev_err(&pdev->dev, "Can't allocate input dev\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	wm831x_on->dev->evbit[0] = BIT_MASK(EV_KEY);
+	wm831x_on->dev->keybit[BIT_WORD(KEY_POWER)] = BIT_MASK(KEY_POWER);
+	wm831x_on->dev->name = "wm831x_on";
+	wm831x_on->dev->phys = "wm831x_on/input0";
+	wm831x_on->dev->dev.parent = &pdev->dev;
+
+	ret = wm831x_request_irq(wm831x, irq, wm831x_on_irq,
+				 IRQF_TRIGGER_RISING, "wm831x_on", wm831x_on);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Unable to request IRQ: %d\n", ret);
+		goto err_input_dev;
+	}
+	ret = input_register_device(wm831x_on->dev);
+	if (ret) {
+		dev_dbg(&pdev->dev, "Can't register input device: %d\n", ret);
+		goto err_irq;
+	}
+
+	platform_set_drvdata(pdev, wm831x_on);
+
+	return 0;
+
+err_irq:
+	wm831x_free_irq(wm831x, irq, NULL);
+err_input_dev:
+	input_free_device(wm831x_on->dev);
+err:
+	kfree(wm831x_on);
+	return ret;
+}
+
+static int __devexit wm831x_on_remove(struct platform_device *pdev)
+{
+	struct wm831x_on *wm831x_on = platform_get_drvdata(pdev);
+	int irq = platform_get_irq(pdev, 0);
+
+	wm831x_free_irq(wm831x_on->wm831x, irq, wm831x_on);
+	cancel_delayed_work_sync(&wm831x_on->work);
+	input_unregister_device(wm831x_on->dev);
+	kfree(wm831x_on);
+
+	return 0;
+}
+
+static struct platform_driver wm831x_on_driver = {
+	.probe		= wm831x_on_probe,
+	.remove		= __devexit_p(wm831x_on_remove),
+	.driver		= {
+		.name	= "wm831x-on",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init wm831x_on_init(void)
+{
+	return platform_driver_register(&wm831x_on_driver);
+}
+module_init(wm831x_on_init);
+
+static void __exit wm831x_on_exit(void)
+{
+	platform_driver_unregister(&wm831x_on_driver);
+}
+module_exit(wm831x_on_exit);
+
+MODULE_ALIAS("platform:wm831x-on");
+MODULE_DESCRIPTION("WM831x ON pin");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com>");
+
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
index d7134dfba56e..91eb493bf14c 100644
--- a/include/linux/mfd/wm831x/core.h
+++ b/include/linux/mfd/wm831x/core.h
@@ -216,6 +216,25 @@
 #define WM831X_PARENT_ID_SHIFT                       0  /* PARENT_ID - [15:0] */
 #define WM831X_PARENT_ID_WIDTH                      16  /* PARENT_ID - [15:0] */
 
+/*
+ * R16389 (0x4005) - ON Pin Control
+ */
+#define WM831X_ON_PIN_SECACT_MASK               0x0300  /* ON_PIN_SECACT - [9:8] */
+#define WM831X_ON_PIN_SECACT_SHIFT                   8  /* ON_PIN_SECACT - [9:8] */
+#define WM831X_ON_PIN_SECACT_WIDTH                   2  /* ON_PIN_SECACT - [9:8] */
+#define WM831X_ON_PIN_PRIMACT_MASK              0x0030  /* ON_PIN_PRIMACT - [5:4] */
+#define WM831X_ON_PIN_PRIMACT_SHIFT                  4  /* ON_PIN_PRIMACT - [5:4] */
+#define WM831X_ON_PIN_PRIMACT_WIDTH                  2  /* ON_PIN_PRIMACT - [5:4] */
+#define WM831X_ON_PIN_STS                       0x0008  /* ON_PIN_STS */
+#define WM831X_ON_PIN_STS_MASK                  0x0008  /* ON_PIN_STS */
+#define WM831X_ON_PIN_STS_SHIFT                      3  /* ON_PIN_STS */
+#define WM831X_ON_PIN_STS_WIDTH                      1  /* ON_PIN_STS */
+#define WM831X_ON_PIN_TO_MASK                   0x0003  /* ON_PIN_TO - [1:0] */
+#define WM831X_ON_PIN_TO_SHIFT                       0  /* ON_PIN_TO - [1:0] */
+#define WM831X_ON_PIN_TO_WIDTH                       2  /* ON_PIN_TO - [1:0] */
+
+struct regulator_dev;
+
 struct wm831x {
 	struct mutex io_lock;
 
-- 
cgit v1.2.3-71-gd317


From be721979dd6b335e4ab6f83abb5cc11c33662aa8 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 4 Aug 2009 20:09:52 +0200
Subject: regulator: Provide mode to status conversion function

This is useful for implementing get_status() in terms of get_mode().

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/regulator/core.c         | 24 ++++++++++++++++++++++++
 include/linux/regulator/driver.h |  2 ++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 98c3a74e9949..91ba9bfaa706 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1864,6 +1864,30 @@ int regulator_notifier_call_chain(struct regulator_dev *rdev,
 }
 EXPORT_SYMBOL_GPL(regulator_notifier_call_chain);
 
+/**
+ * regulator_mode_to_status - convert a regulator mode into a status
+ *
+ * @mode: Mode to convert
+ *
+ * Convert a regulator mode into a status.
+ */
+int regulator_mode_to_status(unsigned int mode)
+{
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		return REGULATOR_STATUS_FAST;
+	case REGULATOR_MODE_NORMAL:
+		return REGULATOR_STATUS_NORMAL;
+	case REGULATOR_MODE_IDLE:
+		return REGULATOR_STATUS_IDLE;
+	case REGULATOR_STATUS_STANDBY:
+		return REGULATOR_STATUS_STANDBY;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(regulator_mode_to_status);
+
 /*
  * To avoid cluttering sysfs (and memory) with useless state, only
  * create attributes that can be meaningfully displayed.
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 225f733e7533..ce1be708ca16 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -193,6 +193,8 @@ void *rdev_get_drvdata(struct regulator_dev *rdev);
 struct device *rdev_get_dev(struct regulator_dev *rdev);
 int rdev_get_id(struct regulator_dev *rdev);
 
+int regulator_mode_to_status(unsigned int);
+
 void *regulator_get_init_drvdata(struct regulator_init_data *reg_init_data);
 
 #endif
-- 
cgit v1.2.3-71-gd317


From e4ee831f949a7c7746a56bcf1e7ca057d6f69e2a Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 28 Jul 2009 15:21:49 +0100
Subject: regulator: Add WM831x DC-DC buck convertor support

The WM831x series of devices all have 3 DC-DC buck convertors. This
driver implements software control for these regulators via the
regulator API.  Use with split hardware/software control of individual
regulators is not supported, though regulators not controlled by
software may be controlled via the hardware control interfaces.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Liam Girdwood <lrg@slimlogic.co.uk>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/regulator/Kconfig            |   7 +
 drivers/regulator/Makefile           |   1 +
 drivers/regulator/wm831x-dcdc.c      | 643 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/regulator.h | 571 +++++++++++++++++++++++++++++++
 4 files changed, 1222 insertions(+)
 create mode 100644 drivers/regulator/wm831x-dcdc.c

(limited to 'include/linux')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index da7483ef32b3..38ea5dc8e143 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -82,6 +82,13 @@ config REGULATOR_TWL4030
 	  This driver supports the voltage regulators provided by
 	  this family of companion chips.
 
+config REGULATOR_WM831X
+	tristate "Wolfson Microelcronics WM831x PMIC regulators"
+	depends on MFD_WM831X
+	help
+	  Support the voltage and current regulators of the WM831x series
+	  of PMIC devices.
+
 config REGULATOR_WM8350
 	tristate "Wolfson Microelectroncis WM8350 AudioPlus PMIC"
 	depends on MFD_WM8350
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 3a9748ff860b..b1d2b826f532 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o
 obj-$(CONFIG_REGULATOR_LP3971) += lp3971.o
 obj-$(CONFIG_REGULATOR_MAX1586) += max1586.o
 obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o
+obj-$(CONFIG_REGULATOR_WM831X) += wm831x-dcdc.o
 obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o
 obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o
 obj-$(CONFIG_REGULATOR_DA903X)	+= da903x.o
diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c
new file mode 100644
index 000000000000..fa5126e38acc
--- /dev/null
+++ b/drivers/regulator/wm831x-dcdc.c
@@ -0,0 +1,643 @@
+/*
+ * wm831x-dcdc.c  --  DC-DC buck convertor driver for the WM831x series
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/driver.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/regulator.h>
+#include <linux/mfd/wm831x/pdata.h>
+
+#define WM831X_BUCKV_MAX_SELECTOR 0x68
+#define WM831X_BUCKP_MAX_SELECTOR 0x66
+
+#define WM831X_DCDC_MODE_FAST    0
+#define WM831X_DCDC_MODE_NORMAL  1
+#define WM831X_DCDC_MODE_IDLE    2
+#define WM831X_DCDC_MODE_STANDBY 3
+
+#define WM831X_DCDC_MAX_NAME 6
+
+/* Register offsets in control block */
+#define WM831X_DCDC_CONTROL_1     0
+#define WM831X_DCDC_CONTROL_2     1
+#define WM831X_DCDC_ON_CONFIG     2
+#define WM831X_DCDC_SLEEP_CONTROL 3
+
+/*
+ * Shared
+ */
+
+struct wm831x_dcdc {
+	char name[WM831X_DCDC_MAX_NAME];
+	struct regulator_desc desc;
+	int base;
+	struct wm831x *wm831x;
+	struct regulator_dev *regulator;
+};
+
+static int wm831x_dcdc_is_enabled(struct regulator_dev *rdev)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+	int reg;
+
+	reg = wm831x_reg_read(wm831x, WM831X_DCDC_ENABLE);
+	if (reg < 0)
+		return reg;
+
+	if (reg & mask)
+		return 1;
+	else
+		return 0;
+}
+
+static int wm831x_dcdc_enable(struct regulator_dev *rdev)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+
+	return wm831x_set_bits(wm831x, WM831X_DCDC_ENABLE, mask, mask);
+}
+
+static int wm831x_dcdc_disable(struct regulator_dev *rdev)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+
+	return wm831x_set_bits(wm831x, WM831X_DCDC_ENABLE, mask, 0);
+}
+
+static unsigned int wm831x_dcdc_get_mode(struct regulator_dev *rdev)
+
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG;
+	int val;
+
+	val = wm831x_reg_read(wm831x, reg);
+	if (val < 0)
+		return val;
+
+	val = (val & WM831X_DC1_ON_MODE_MASK) >> WM831X_DC1_ON_MODE_SHIFT;
+
+	switch (val) {
+	case WM831X_DCDC_MODE_FAST:
+		return REGULATOR_MODE_FAST;
+	case WM831X_DCDC_MODE_NORMAL:
+		return REGULATOR_MODE_NORMAL;
+	case WM831X_DCDC_MODE_STANDBY:
+		return REGULATOR_MODE_STANDBY;
+	case WM831X_DCDC_MODE_IDLE:
+		return REGULATOR_MODE_IDLE;
+	default:
+		BUG();
+	}
+}
+
+static int wm831x_dcdc_set_mode_int(struct wm831x *wm831x, int reg,
+				    unsigned int mode)
+{
+	int val;
+
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		val = WM831X_DCDC_MODE_FAST;
+		break;
+	case REGULATOR_MODE_NORMAL:
+		val = WM831X_DCDC_MODE_NORMAL;
+		break;
+	case REGULATOR_MODE_STANDBY:
+		val = WM831X_DCDC_MODE_STANDBY;
+		break;
+	case REGULATOR_MODE_IDLE:
+		val = WM831X_DCDC_MODE_IDLE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return wm831x_set_bits(wm831x, reg, WM831X_DC1_ON_MODE_MASK,
+			       val << WM831X_DC1_ON_MODE_SHIFT);
+}
+
+static int wm831x_dcdc_set_mode(struct regulator_dev *rdev, unsigned int mode)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG;
+
+	return wm831x_dcdc_set_mode_int(wm831x, reg, mode);
+}
+
+static int wm831x_dcdc_set_suspend_mode(struct regulator_dev *rdev,
+					unsigned int mode)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 reg = dcdc->base + WM831X_DCDC_SLEEP_CONTROL;
+
+	return wm831x_dcdc_set_mode_int(wm831x, reg, mode);
+}
+
+static int wm831x_dcdc_get_status(struct regulator_dev *rdev)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	int ret;
+
+	/* First, check for errors */
+	ret = wm831x_reg_read(wm831x, WM831X_DCDC_UV_STATUS);
+	if (ret < 0)
+		return ret;
+
+	if (ret & (1 << rdev_get_id(rdev))) {
+		dev_dbg(wm831x->dev, "DCDC%d under voltage\n",
+			rdev_get_id(rdev) + 1);
+		return REGULATOR_STATUS_ERROR;
+	}
+
+	/* DCDC1 and DCDC2 can additionally detect high voltage/current */
+	if (rdev_get_id(rdev) < 2) {
+		if (ret & (WM831X_DC1_OV_STS << rdev_get_id(rdev))) {
+			dev_dbg(wm831x->dev, "DCDC%d over voltage\n",
+				rdev_get_id(rdev) + 1);
+			return REGULATOR_STATUS_ERROR;
+		}
+
+		if (ret & (WM831X_DC1_HC_STS << rdev_get_id(rdev))) {
+			dev_dbg(wm831x->dev, "DCDC%d over current\n",
+				rdev_get_id(rdev) + 1);
+			return REGULATOR_STATUS_ERROR;
+		}
+	}
+
+	/* Is the regulator on? */
+	ret = wm831x_reg_read(wm831x, WM831X_DCDC_STATUS);
+	if (ret < 0)
+		return ret;
+	if (!(ret & (1 << rdev_get_id(rdev))))
+		return REGULATOR_STATUS_OFF;
+
+	/* TODO: When we handle hardware control modes so we can report the
+	 * current mode. */
+	return REGULATOR_STATUS_ON;
+}
+
+static irqreturn_t wm831x_dcdc_uv_irq(int irq, void *data)
+{
+	struct wm831x_dcdc *dcdc = data;
+
+	regulator_notifier_call_chain(dcdc->regulator,
+				      REGULATOR_EVENT_UNDER_VOLTAGE,
+				      NULL);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t wm831x_dcdc_oc_irq(int irq, void *data)
+{
+	struct wm831x_dcdc *dcdc = data;
+
+	regulator_notifier_call_chain(dcdc->regulator,
+				      REGULATOR_EVENT_OVER_CURRENT,
+				      NULL);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * BUCKV specifics
+ */
+
+static int wm831x_buckv_list_voltage(struct regulator_dev *rdev,
+				      unsigned selector)
+{
+	if (selector <= 0x8)
+		return 600000;
+	if (selector <= WM831X_BUCKV_MAX_SELECTOR)
+		return 600000 + ((selector - 0x8) * 12500);
+	return -EINVAL;
+}
+
+static int wm831x_buckv_set_voltage_int(struct regulator_dev *rdev, int reg,
+					 int min_uV, int max_uV)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 vsel;
+
+	if (min_uV < 600000)
+		vsel = 0;
+	else if (min_uV <= 1800000)
+		vsel = ((min_uV - 600000) / 12500) + 8;
+	else
+		return -EINVAL;
+
+	if (wm831x_buckv_list_voltage(rdev, vsel) > max_uV)
+		return -EINVAL;
+
+	return wm831x_set_bits(wm831x, reg, WM831X_DC1_ON_VSEL_MASK, vsel);
+}
+
+static int wm831x_buckv_set_voltage(struct regulator_dev *rdev,
+				     int min_uV, int max_uV)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG;
+
+	return wm831x_buckv_set_voltage_int(rdev, reg, min_uV, max_uV);
+}
+
+static int wm831x_buckv_set_suspend_voltage(struct regulator_dev *rdev,
+					     int uV)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	u16 reg = dcdc->base + WM831X_DCDC_SLEEP_CONTROL;
+
+	return wm831x_buckv_set_voltage_int(rdev, reg, uV, uV);
+}
+
+static int wm831x_buckv_get_voltage(struct regulator_dev *rdev)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG;
+	int val;
+
+	val = wm831x_reg_read(wm831x, reg);
+	if (val < 0)
+		return val;
+
+	return wm831x_buckv_list_voltage(rdev, val & WM831X_DC1_ON_VSEL_MASK);
+}
+
+/* Current limit options */
+static u16 wm831x_dcdc_ilim[] = {
+	125, 250, 375, 500, 625, 750, 875, 1000
+};
+
+static int wm831x_buckv_set_current_limit(struct regulator_dev *rdev,
+					   int min_uA, int max_uA)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 reg = dcdc->base + WM831X_DCDC_CONTROL_2;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(wm831x_dcdc_ilim); i++) {
+		if (max_uA <= wm831x_dcdc_ilim[i])
+			break;
+	}
+	if (i == ARRAY_SIZE(wm831x_dcdc_ilim))
+		return -EINVAL;
+
+	return wm831x_set_bits(wm831x, reg, WM831X_DC1_HC_THR_MASK, i);
+}
+
+static int wm831x_buckv_get_current_limit(struct regulator_dev *rdev)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 reg = dcdc->base + WM831X_DCDC_CONTROL_2;
+	int val;
+
+	val = wm831x_reg_read(wm831x, reg);
+	if (val < 0)
+		return val;
+
+	return wm831x_dcdc_ilim[val & WM831X_DC1_HC_THR_MASK];
+}
+
+static struct regulator_ops wm831x_buckv_ops = {
+	.set_voltage = wm831x_buckv_set_voltage,
+	.get_voltage = wm831x_buckv_get_voltage,
+	.list_voltage = wm831x_buckv_list_voltage,
+	.set_suspend_voltage = wm831x_buckv_set_suspend_voltage,
+	.set_current_limit = wm831x_buckv_set_current_limit,
+	.get_current_limit = wm831x_buckv_get_current_limit,
+
+	.is_enabled = wm831x_dcdc_is_enabled,
+	.enable = wm831x_dcdc_enable,
+	.disable = wm831x_dcdc_disable,
+	.get_status = wm831x_dcdc_get_status,
+	.get_mode = wm831x_dcdc_get_mode,
+	.set_mode = wm831x_dcdc_set_mode,
+	.set_suspend_mode = wm831x_dcdc_set_suspend_mode,
+};
+
+static __devinit int wm831x_buckv_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
+	int id = pdev->id % ARRAY_SIZE(pdata->dcdc);
+	struct wm831x_dcdc *dcdc;
+	struct resource *res;
+	int ret, irq;
+
+	dev_dbg(&pdev->dev, "Probing DCDC%d\n", id + 1);
+
+	if (pdata == NULL || pdata->dcdc[id] == NULL)
+		return -ENODEV;
+
+	dcdc = kzalloc(sizeof(struct wm831x_dcdc), GFP_KERNEL);
+	if (dcdc == NULL) {
+		dev_err(&pdev->dev, "Unable to allocate private data\n");
+		return -ENOMEM;
+	}
+
+	dcdc->wm831x = wm831x;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (res == NULL) {
+		dev_err(&pdev->dev, "No I/O resource\n");
+		ret = -EINVAL;
+		goto err;
+	}
+	dcdc->base = res->start;
+
+	snprintf(dcdc->name, sizeof(dcdc->name), "DCDC%d", id + 1);
+	dcdc->desc.name = dcdc->name;
+	dcdc->desc.id = id;
+	dcdc->desc.type = REGULATOR_VOLTAGE;
+	dcdc->desc.n_voltages = WM831X_BUCKV_MAX_SELECTOR + 1;
+	dcdc->desc.ops = &wm831x_buckv_ops;
+	dcdc->desc.owner = THIS_MODULE;
+
+	dcdc->regulator = regulator_register(&dcdc->desc, &pdev->dev,
+					     pdata->dcdc[id], dcdc);
+	if (IS_ERR(dcdc->regulator)) {
+		ret = PTR_ERR(dcdc->regulator);
+		dev_err(wm831x->dev, "Failed to register DCDC%d: %d\n",
+			id + 1, ret);
+		goto err;
+	}
+
+	irq = platform_get_irq_byname(pdev, "UV");
+	ret = wm831x_request_irq(wm831x, irq, wm831x_dcdc_uv_irq,
+				 IRQF_TRIGGER_RISING, dcdc->name,
+				 dcdc);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
+			irq, ret);
+		goto err_regulator;
+	}
+
+	irq = platform_get_irq_byname(pdev, "HC");
+	ret = wm831x_request_irq(wm831x, irq, wm831x_dcdc_oc_irq,
+				 IRQF_TRIGGER_RISING, dcdc->name,
+				 dcdc);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "Failed to request HC IRQ %d: %d\n",
+			irq, ret);
+		goto err_uv;
+	}
+
+	platform_set_drvdata(pdev, dcdc);
+
+	return 0;
+
+err_uv:
+	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc);
+err_regulator:
+	regulator_unregister(dcdc->regulator);
+err:
+	kfree(dcdc);
+	return ret;
+}
+
+static __devexit int wm831x_buckv_remove(struct platform_device *pdev)
+{
+	struct wm831x_dcdc *dcdc = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+
+	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "HC"), dcdc);
+	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc);
+	regulator_unregister(dcdc->regulator);
+	kfree(dcdc);
+
+	return 0;
+}
+
+static struct platform_driver wm831x_buckv_driver = {
+	.probe = wm831x_buckv_probe,
+	.remove = __devexit_p(wm831x_buckv_remove),
+	.driver		= {
+		.name	= "wm831x-buckv",
+	},
+};
+
+/*
+ * BUCKP specifics
+ */
+
+static int wm831x_buckp_list_voltage(struct regulator_dev *rdev,
+				      unsigned selector)
+{
+	if (selector <= WM831X_BUCKP_MAX_SELECTOR)
+		return 850000 + (selector * 25000);
+	else
+		return -EINVAL;
+}
+
+static int wm831x_buckp_set_voltage_int(struct regulator_dev *rdev, int reg,
+					int min_uV, int max_uV)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 vsel;
+
+	if (min_uV <= 34000000)
+		vsel = (min_uV - 850000) / 25000;
+	else
+		return -EINVAL;
+
+	if (wm831x_buckp_list_voltage(rdev, vsel) > max_uV)
+		return -EINVAL;
+
+	return wm831x_set_bits(wm831x, reg, WM831X_DC3_ON_VSEL_MASK, vsel);
+}
+
+static int wm831x_buckp_set_voltage(struct regulator_dev *rdev,
+				    int min_uV, int max_uV)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG;
+
+	return wm831x_buckp_set_voltage_int(rdev, reg, min_uV, max_uV);
+}
+
+static int wm831x_buckp_set_suspend_voltage(struct regulator_dev *rdev,
+					    int uV)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	u16 reg = dcdc->base + WM831X_DCDC_SLEEP_CONTROL;
+
+	return wm831x_buckp_set_voltage_int(rdev, reg, uV, uV);
+}
+
+static int wm831x_buckp_get_voltage(struct regulator_dev *rdev)
+{
+	struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+	u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG;
+	int val;
+
+	val = wm831x_reg_read(wm831x, reg);
+	if (val < 0)
+		return val;
+
+	return wm831x_buckp_list_voltage(rdev, val & WM831X_DC3_ON_VSEL_MASK);
+}
+
+static struct regulator_ops wm831x_buckp_ops = {
+	.set_voltage = wm831x_buckp_set_voltage,
+	.get_voltage = wm831x_buckp_get_voltage,
+	.list_voltage = wm831x_buckp_list_voltage,
+	.set_suspend_voltage = wm831x_buckp_set_suspend_voltage,
+
+	.is_enabled = wm831x_dcdc_is_enabled,
+	.enable = wm831x_dcdc_enable,
+	.disable = wm831x_dcdc_disable,
+	.get_status = wm831x_dcdc_get_status,
+	.get_mode = wm831x_dcdc_get_mode,
+	.set_mode = wm831x_dcdc_set_mode,
+	.set_suspend_mode = wm831x_dcdc_set_suspend_mode,
+};
+
+static __devinit int wm831x_buckp_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
+	int id = pdev->id % ARRAY_SIZE(pdata->dcdc);
+	struct wm831x_dcdc *dcdc;
+	struct resource *res;
+	int ret, irq;
+
+	dev_dbg(&pdev->dev, "Probing DCDC%d\n", id + 1);
+
+	if (pdata == NULL || pdata->dcdc[id] == NULL)
+		return -ENODEV;
+
+	dcdc = kzalloc(sizeof(struct wm831x_dcdc), GFP_KERNEL);
+	if (dcdc == NULL) {
+		dev_err(&pdev->dev, "Unable to allocate private data\n");
+		return -ENOMEM;
+	}
+
+	dcdc->wm831x = wm831x;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (res == NULL) {
+		dev_err(&pdev->dev, "No I/O resource\n");
+		ret = -EINVAL;
+		goto err;
+	}
+	dcdc->base = res->start;
+
+	snprintf(dcdc->name, sizeof(dcdc->name), "DCDC%d", id + 1);
+	dcdc->desc.name = dcdc->name;
+	dcdc->desc.id = id;
+	dcdc->desc.type = REGULATOR_VOLTAGE;
+	dcdc->desc.n_voltages = WM831X_BUCKP_MAX_SELECTOR + 1;
+	dcdc->desc.ops = &wm831x_buckp_ops;
+	dcdc->desc.owner = THIS_MODULE;
+
+	dcdc->regulator = regulator_register(&dcdc->desc, &pdev->dev,
+					     pdata->dcdc[id], dcdc);
+	if (IS_ERR(dcdc->regulator)) {
+		ret = PTR_ERR(dcdc->regulator);
+		dev_err(wm831x->dev, "Failed to register DCDC%d: %d\n",
+			id + 1, ret);
+		goto err;
+	}
+
+	irq = platform_get_irq_byname(pdev, "UV");
+	ret = wm831x_request_irq(wm831x, irq, wm831x_dcdc_uv_irq,
+				 IRQF_TRIGGER_RISING, dcdc->name,
+				 dcdc);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
+			irq, ret);
+		goto err_regulator;
+	}
+
+	platform_set_drvdata(pdev, dcdc);
+
+	return 0;
+
+err_regulator:
+	regulator_unregister(dcdc->regulator);
+err:
+	kfree(dcdc);
+	return ret;
+}
+
+static __devexit int wm831x_buckp_remove(struct platform_device *pdev)
+{
+	struct wm831x_dcdc *dcdc = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = dcdc->wm831x;
+
+	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc);
+	regulator_unregister(dcdc->regulator);
+	kfree(dcdc);
+
+	return 0;
+}
+
+static struct platform_driver wm831x_buckp_driver = {
+	.probe = wm831x_buckp_probe,
+	.remove = __devexit_p(wm831x_buckp_remove),
+	.driver		= {
+		.name	= "wm831x-buckp",
+	},
+};
+
+static int __init wm831x_dcdc_init(void)
+{
+	int ret;
+	ret = platform_driver_register(&wm831x_buckv_driver);
+	if (ret != 0)
+		pr_err("Failed to register WM831x BUCKV driver: %d\n", ret);
+
+	ret = platform_driver_register(&wm831x_buckp_driver);
+	if (ret != 0)
+		pr_err("Failed to register WM831x BUCKP driver: %d\n", ret);
+
+	return 0;
+}
+subsys_initcall(wm831x_dcdc_init);
+
+static void __exit wm831x_dcdc_exit(void)
+{
+	platform_driver_unregister(&wm831x_buckp_driver);
+	platform_driver_unregister(&wm831x_buckv_driver);
+}
+module_exit(wm831x_dcdc_exit);
+
+/* Module information */
+MODULE_AUTHOR("Mark Brown");
+MODULE_DESCRIPTION("WM831x DC-DC convertor driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:wm831x-buckv");
+MODULE_ALIAS("platform:wm831x-buckp");
diff --git a/include/linux/mfd/wm831x/regulator.h b/include/linux/mfd/wm831x/regulator.h
index b5d58fb38b4e..c74d6aafdca8 100644
--- a/include/linux/mfd/wm831x/regulator.h
+++ b/include/linux/mfd/wm831x/regulator.h
@@ -15,6 +15,577 @@
 #ifndef __MFD_WM831X_REGULATOR_H__
 #define __MFD_WM831X_REGULATOR_H__
 
+/*
+ * R16462 (0x404E) - Current Sink 1
+ */
+#define WM831X_CS1_ENA                          0x8000  /* CS1_ENA */
+#define WM831X_CS1_ENA_MASK                     0x8000  /* CS1_ENA */
+#define WM831X_CS1_ENA_SHIFT                        15  /* CS1_ENA */
+#define WM831X_CS1_ENA_WIDTH                         1  /* CS1_ENA */
+#define WM831X_CS1_DRIVE                        0x4000  /* CS1_DRIVE */
+#define WM831X_CS1_DRIVE_MASK                   0x4000  /* CS1_DRIVE */
+#define WM831X_CS1_DRIVE_SHIFT                      14  /* CS1_DRIVE */
+#define WM831X_CS1_DRIVE_WIDTH                       1  /* CS1_DRIVE */
+#define WM831X_CS1_SLPENA                       0x1000  /* CS1_SLPENA */
+#define WM831X_CS1_SLPENA_MASK                  0x1000  /* CS1_SLPENA */
+#define WM831X_CS1_SLPENA_SHIFT                     12  /* CS1_SLPENA */
+#define WM831X_CS1_SLPENA_WIDTH                      1  /* CS1_SLPENA */
+#define WM831X_CS1_OFF_RAMP_MASK                0x0C00  /* CS1_OFF_RAMP - [11:10] */
+#define WM831X_CS1_OFF_RAMP_SHIFT                   10  /* CS1_OFF_RAMP - [11:10] */
+#define WM831X_CS1_OFF_RAMP_WIDTH                    2  /* CS1_OFF_RAMP - [11:10] */
+#define WM831X_CS1_ON_RAMP_MASK                 0x0300  /* CS1_ON_RAMP - [9:8] */
+#define WM831X_CS1_ON_RAMP_SHIFT                     8  /* CS1_ON_RAMP - [9:8] */
+#define WM831X_CS1_ON_RAMP_WIDTH                     2  /* CS1_ON_RAMP - [9:8] */
+#define WM831X_CS1_ISEL_MASK                    0x003F  /* CS1_ISEL - [5:0] */
+#define WM831X_CS1_ISEL_SHIFT                        0  /* CS1_ISEL - [5:0] */
+#define WM831X_CS1_ISEL_WIDTH                        6  /* CS1_ISEL - [5:0] */
+
+/*
+ * R16463 (0x404F) - Current Sink 2
+ */
+#define WM831X_CS2_ENA                          0x8000  /* CS2_ENA */
+#define WM831X_CS2_ENA_MASK                     0x8000  /* CS2_ENA */
+#define WM831X_CS2_ENA_SHIFT                        15  /* CS2_ENA */
+#define WM831X_CS2_ENA_WIDTH                         1  /* CS2_ENA */
+#define WM831X_CS2_DRIVE                        0x4000  /* CS2_DRIVE */
+#define WM831X_CS2_DRIVE_MASK                   0x4000  /* CS2_DRIVE */
+#define WM831X_CS2_DRIVE_SHIFT                      14  /* CS2_DRIVE */
+#define WM831X_CS2_DRIVE_WIDTH                       1  /* CS2_DRIVE */
+#define WM831X_CS2_SLPENA                       0x1000  /* CS2_SLPENA */
+#define WM831X_CS2_SLPENA_MASK                  0x1000  /* CS2_SLPENA */
+#define WM831X_CS2_SLPENA_SHIFT                     12  /* CS2_SLPENA */
+#define WM831X_CS2_SLPENA_WIDTH                      1  /* CS2_SLPENA */
+#define WM831X_CS2_OFF_RAMP_MASK                0x0C00  /* CS2_OFF_RAMP - [11:10] */
+#define WM831X_CS2_OFF_RAMP_SHIFT                   10  /* CS2_OFF_RAMP - [11:10] */
+#define WM831X_CS2_OFF_RAMP_WIDTH                    2  /* CS2_OFF_RAMP - [11:10] */
+#define WM831X_CS2_ON_RAMP_MASK                 0x0300  /* CS2_ON_RAMP - [9:8] */
+#define WM831X_CS2_ON_RAMP_SHIFT                     8  /* CS2_ON_RAMP - [9:8] */
+#define WM831X_CS2_ON_RAMP_WIDTH                     2  /* CS2_ON_RAMP - [9:8] */
+#define WM831X_CS2_ISEL_MASK                    0x003F  /* CS2_ISEL - [5:0] */
+#define WM831X_CS2_ISEL_SHIFT                        0  /* CS2_ISEL - [5:0] */
+#define WM831X_CS2_ISEL_WIDTH                        6  /* CS2_ISEL - [5:0] */
+
+/*
+ * R16464 (0x4050) - DCDC Enable
+ */
+#define WM831X_EPE2_ENA                         0x0080  /* EPE2_ENA */
+#define WM831X_EPE2_ENA_MASK                    0x0080  /* EPE2_ENA */
+#define WM831X_EPE2_ENA_SHIFT                        7  /* EPE2_ENA */
+#define WM831X_EPE2_ENA_WIDTH                        1  /* EPE2_ENA */
+#define WM831X_EPE1_ENA                         0x0040  /* EPE1_ENA */
+#define WM831X_EPE1_ENA_MASK                    0x0040  /* EPE1_ENA */
+#define WM831X_EPE1_ENA_SHIFT                        6  /* EPE1_ENA */
+#define WM831X_EPE1_ENA_WIDTH                        1  /* EPE1_ENA */
+#define WM831X_DC4_ENA                          0x0008  /* DC4_ENA */
+#define WM831X_DC4_ENA_MASK                     0x0008  /* DC4_ENA */
+#define WM831X_DC4_ENA_SHIFT                         3  /* DC4_ENA */
+#define WM831X_DC4_ENA_WIDTH                         1  /* DC4_ENA */
+#define WM831X_DC3_ENA                          0x0004  /* DC3_ENA */
+#define WM831X_DC3_ENA_MASK                     0x0004  /* DC3_ENA */
+#define WM831X_DC3_ENA_SHIFT                         2  /* DC3_ENA */
+#define WM831X_DC3_ENA_WIDTH                         1  /* DC3_ENA */
+#define WM831X_DC2_ENA                          0x0002  /* DC2_ENA */
+#define WM831X_DC2_ENA_MASK                     0x0002  /* DC2_ENA */
+#define WM831X_DC2_ENA_SHIFT                         1  /* DC2_ENA */
+#define WM831X_DC2_ENA_WIDTH                         1  /* DC2_ENA */
+#define WM831X_DC1_ENA                          0x0001  /* DC1_ENA */
+#define WM831X_DC1_ENA_MASK                     0x0001  /* DC1_ENA */
+#define WM831X_DC1_ENA_SHIFT                         0  /* DC1_ENA */
+#define WM831X_DC1_ENA_WIDTH                         1  /* DC1_ENA */
+
+/*
+ * R16465 (0x4051) - LDO Enable
+ */
+#define WM831X_LDO11_ENA                        0x0400  /* LDO11_ENA */
+#define WM831X_LDO11_ENA_MASK                   0x0400  /* LDO11_ENA */
+#define WM831X_LDO11_ENA_SHIFT                      10  /* LDO11_ENA */
+#define WM831X_LDO11_ENA_WIDTH                       1  /* LDO11_ENA */
+#define WM831X_LDO10_ENA                        0x0200  /* LDO10_ENA */
+#define WM831X_LDO10_ENA_MASK                   0x0200  /* LDO10_ENA */
+#define WM831X_LDO10_ENA_SHIFT                       9  /* LDO10_ENA */
+#define WM831X_LDO10_ENA_WIDTH                       1  /* LDO10_ENA */
+#define WM831X_LDO9_ENA                         0x0100  /* LDO9_ENA */
+#define WM831X_LDO9_ENA_MASK                    0x0100  /* LDO9_ENA */
+#define WM831X_LDO9_ENA_SHIFT                        8  /* LDO9_ENA */
+#define WM831X_LDO9_ENA_WIDTH                        1  /* LDO9_ENA */
+#define WM831X_LDO8_ENA                         0x0080  /* LDO8_ENA */
+#define WM831X_LDO8_ENA_MASK                    0x0080  /* LDO8_ENA */
+#define WM831X_LDO8_ENA_SHIFT                        7  /* LDO8_ENA */
+#define WM831X_LDO8_ENA_WIDTH                        1  /* LDO8_ENA */
+#define WM831X_LDO7_ENA                         0x0040  /* LDO7_ENA */
+#define WM831X_LDO7_ENA_MASK                    0x0040  /* LDO7_ENA */
+#define WM831X_LDO7_ENA_SHIFT                        6  /* LDO7_ENA */
+#define WM831X_LDO7_ENA_WIDTH                        1  /* LDO7_ENA */
+#define WM831X_LDO6_ENA                         0x0020  /* LDO6_ENA */
+#define WM831X_LDO6_ENA_MASK                    0x0020  /* LDO6_ENA */
+#define WM831X_LDO6_ENA_SHIFT                        5  /* LDO6_ENA */
+#define WM831X_LDO6_ENA_WIDTH                        1  /* LDO6_ENA */
+#define WM831X_LDO5_ENA                         0x0010  /* LDO5_ENA */
+#define WM831X_LDO5_ENA_MASK                    0x0010  /* LDO5_ENA */
+#define WM831X_LDO5_ENA_SHIFT                        4  /* LDO5_ENA */
+#define WM831X_LDO5_ENA_WIDTH                        1  /* LDO5_ENA */
+#define WM831X_LDO4_ENA                         0x0008  /* LDO4_ENA */
+#define WM831X_LDO4_ENA_MASK                    0x0008  /* LDO4_ENA */
+#define WM831X_LDO4_ENA_SHIFT                        3  /* LDO4_ENA */
+#define WM831X_LDO4_ENA_WIDTH                        1  /* LDO4_ENA */
+#define WM831X_LDO3_ENA                         0x0004  /* LDO3_ENA */
+#define WM831X_LDO3_ENA_MASK                    0x0004  /* LDO3_ENA */
+#define WM831X_LDO3_ENA_SHIFT                        2  /* LDO3_ENA */
+#define WM831X_LDO3_ENA_WIDTH                        1  /* LDO3_ENA */
+#define WM831X_LDO2_ENA                         0x0002  /* LDO2_ENA */
+#define WM831X_LDO2_ENA_MASK                    0x0002  /* LDO2_ENA */
+#define WM831X_LDO2_ENA_SHIFT                        1  /* LDO2_ENA */
+#define WM831X_LDO2_ENA_WIDTH                        1  /* LDO2_ENA */
+#define WM831X_LDO1_ENA                         0x0001  /* LDO1_ENA */
+#define WM831X_LDO1_ENA_MASK                    0x0001  /* LDO1_ENA */
+#define WM831X_LDO1_ENA_SHIFT                        0  /* LDO1_ENA */
+#define WM831X_LDO1_ENA_WIDTH                        1  /* LDO1_ENA */
+
+/*
+ * R16466 (0x4052) - DCDC Status
+ */
+#define WM831X_EPE2_STS                         0x0080  /* EPE2_STS */
+#define WM831X_EPE2_STS_MASK                    0x0080  /* EPE2_STS */
+#define WM831X_EPE2_STS_SHIFT                        7  /* EPE2_STS */
+#define WM831X_EPE2_STS_WIDTH                        1  /* EPE2_STS */
+#define WM831X_EPE1_STS                         0x0040  /* EPE1_STS */
+#define WM831X_EPE1_STS_MASK                    0x0040  /* EPE1_STS */
+#define WM831X_EPE1_STS_SHIFT                        6  /* EPE1_STS */
+#define WM831X_EPE1_STS_WIDTH                        1  /* EPE1_STS */
+#define WM831X_DC4_STS                          0x0008  /* DC4_STS */
+#define WM831X_DC4_STS_MASK                     0x0008  /* DC4_STS */
+#define WM831X_DC4_STS_SHIFT                         3  /* DC4_STS */
+#define WM831X_DC4_STS_WIDTH                         1  /* DC4_STS */
+#define WM831X_DC3_STS                          0x0004  /* DC3_STS */
+#define WM831X_DC3_STS_MASK                     0x0004  /* DC3_STS */
+#define WM831X_DC3_STS_SHIFT                         2  /* DC3_STS */
+#define WM831X_DC3_STS_WIDTH                         1  /* DC3_STS */
+#define WM831X_DC2_STS                          0x0002  /* DC2_STS */
+#define WM831X_DC2_STS_MASK                     0x0002  /* DC2_STS */
+#define WM831X_DC2_STS_SHIFT                         1  /* DC2_STS */
+#define WM831X_DC2_STS_WIDTH                         1  /* DC2_STS */
+#define WM831X_DC1_STS                          0x0001  /* DC1_STS */
+#define WM831X_DC1_STS_MASK                     0x0001  /* DC1_STS */
+#define WM831X_DC1_STS_SHIFT                         0  /* DC1_STS */
+#define WM831X_DC1_STS_WIDTH                         1  /* DC1_STS */
+
+/*
+ * R16467 (0x4053) - LDO Status
+ */
+#define WM831X_LDO11_STS                        0x0400  /* LDO11_STS */
+#define WM831X_LDO11_STS_MASK                   0x0400  /* LDO11_STS */
+#define WM831X_LDO11_STS_SHIFT                      10  /* LDO11_STS */
+#define WM831X_LDO11_STS_WIDTH                       1  /* LDO11_STS */
+#define WM831X_LDO10_STS                        0x0200  /* LDO10_STS */
+#define WM831X_LDO10_STS_MASK                   0x0200  /* LDO10_STS */
+#define WM831X_LDO10_STS_SHIFT                       9  /* LDO10_STS */
+#define WM831X_LDO10_STS_WIDTH                       1  /* LDO10_STS */
+#define WM831X_LDO9_STS                         0x0100  /* LDO9_STS */
+#define WM831X_LDO9_STS_MASK                    0x0100  /* LDO9_STS */
+#define WM831X_LDO9_STS_SHIFT                        8  /* LDO9_STS */
+#define WM831X_LDO9_STS_WIDTH                        1  /* LDO9_STS */
+#define WM831X_LDO8_STS                         0x0080  /* LDO8_STS */
+#define WM831X_LDO8_STS_MASK                    0x0080  /* LDO8_STS */
+#define WM831X_LDO8_STS_SHIFT                        7  /* LDO8_STS */
+#define WM831X_LDO8_STS_WIDTH                        1  /* LDO8_STS */
+#define WM831X_LDO7_STS                         0x0040  /* LDO7_STS */
+#define WM831X_LDO7_STS_MASK                    0x0040  /* LDO7_STS */
+#define WM831X_LDO7_STS_SHIFT                        6  /* LDO7_STS */
+#define WM831X_LDO7_STS_WIDTH                        1  /* LDO7_STS */
+#define WM831X_LDO6_STS                         0x0020  /* LDO6_STS */
+#define WM831X_LDO6_STS_MASK                    0x0020  /* LDO6_STS */
+#define WM831X_LDO6_STS_SHIFT                        5  /* LDO6_STS */
+#define WM831X_LDO6_STS_WIDTH                        1  /* LDO6_STS */
+#define WM831X_LDO5_STS                         0x0010  /* LDO5_STS */
+#define WM831X_LDO5_STS_MASK                    0x0010  /* LDO5_STS */
+#define WM831X_LDO5_STS_SHIFT                        4  /* LDO5_STS */
+#define WM831X_LDO5_STS_WIDTH                        1  /* LDO5_STS */
+#define WM831X_LDO4_STS                         0x0008  /* LDO4_STS */
+#define WM831X_LDO4_STS_MASK                    0x0008  /* LDO4_STS */
+#define WM831X_LDO4_STS_SHIFT                        3  /* LDO4_STS */
+#define WM831X_LDO4_STS_WIDTH                        1  /* LDO4_STS */
+#define WM831X_LDO3_STS                         0x0004  /* LDO3_STS */
+#define WM831X_LDO3_STS_MASK                    0x0004  /* LDO3_STS */
+#define WM831X_LDO3_STS_SHIFT                        2  /* LDO3_STS */
+#define WM831X_LDO3_STS_WIDTH                        1  /* LDO3_STS */
+#define WM831X_LDO2_STS                         0x0002  /* LDO2_STS */
+#define WM831X_LDO2_STS_MASK                    0x0002  /* LDO2_STS */
+#define WM831X_LDO2_STS_SHIFT                        1  /* LDO2_STS */
+#define WM831X_LDO2_STS_WIDTH                        1  /* LDO2_STS */
+#define WM831X_LDO1_STS                         0x0001  /* LDO1_STS */
+#define WM831X_LDO1_STS_MASK                    0x0001  /* LDO1_STS */
+#define WM831X_LDO1_STS_SHIFT                        0  /* LDO1_STS */
+#define WM831X_LDO1_STS_WIDTH                        1  /* LDO1_STS */
+
+/*
+ * R16468 (0x4054) - DCDC UV Status
+ */
+#define WM831X_DC2_OV_STS                       0x2000  /* DC2_OV_STS */
+#define WM831X_DC2_OV_STS_MASK                  0x2000  /* DC2_OV_STS */
+#define WM831X_DC2_OV_STS_SHIFT                     13  /* DC2_OV_STS */
+#define WM831X_DC2_OV_STS_WIDTH                      1  /* DC2_OV_STS */
+#define WM831X_DC1_OV_STS                       0x1000  /* DC1_OV_STS */
+#define WM831X_DC1_OV_STS_MASK                  0x1000  /* DC1_OV_STS */
+#define WM831X_DC1_OV_STS_SHIFT                     12  /* DC1_OV_STS */
+#define WM831X_DC1_OV_STS_WIDTH                      1  /* DC1_OV_STS */
+#define WM831X_DC2_HC_STS                       0x0200  /* DC2_HC_STS */
+#define WM831X_DC2_HC_STS_MASK                  0x0200  /* DC2_HC_STS */
+#define WM831X_DC2_HC_STS_SHIFT                      9  /* DC2_HC_STS */
+#define WM831X_DC2_HC_STS_WIDTH                      1  /* DC2_HC_STS */
+#define WM831X_DC1_HC_STS                       0x0100  /* DC1_HC_STS */
+#define WM831X_DC1_HC_STS_MASK                  0x0100  /* DC1_HC_STS */
+#define WM831X_DC1_HC_STS_SHIFT                      8  /* DC1_HC_STS */
+#define WM831X_DC1_HC_STS_WIDTH                      1  /* DC1_HC_STS */
+#define WM831X_DC4_UV_STS                       0x0008  /* DC4_UV_STS */
+#define WM831X_DC4_UV_STS_MASK                  0x0008  /* DC4_UV_STS */
+#define WM831X_DC4_UV_STS_SHIFT                      3  /* DC4_UV_STS */
+#define WM831X_DC4_UV_STS_WIDTH                      1  /* DC4_UV_STS */
+#define WM831X_DC3_UV_STS                       0x0004  /* DC3_UV_STS */
+#define WM831X_DC3_UV_STS_MASK                  0x0004  /* DC3_UV_STS */
+#define WM831X_DC3_UV_STS_SHIFT                      2  /* DC3_UV_STS */
+#define WM831X_DC3_UV_STS_WIDTH                      1  /* DC3_UV_STS */
+#define WM831X_DC2_UV_STS                       0x0002  /* DC2_UV_STS */
+#define WM831X_DC2_UV_STS_MASK                  0x0002  /* DC2_UV_STS */
+#define WM831X_DC2_UV_STS_SHIFT                      1  /* DC2_UV_STS */
+#define WM831X_DC2_UV_STS_WIDTH                      1  /* DC2_UV_STS */
+#define WM831X_DC1_UV_STS                       0x0001  /* DC1_UV_STS */
+#define WM831X_DC1_UV_STS_MASK                  0x0001  /* DC1_UV_STS */
+#define WM831X_DC1_UV_STS_SHIFT                      0  /* DC1_UV_STS */
+#define WM831X_DC1_UV_STS_WIDTH                      1  /* DC1_UV_STS */
+
+/*
+ * R16469 (0x4055) - LDO UV Status
+ */
+#define WM831X_INTLDO_UV_STS                    0x8000  /* INTLDO_UV_STS */
+#define WM831X_INTLDO_UV_STS_MASK               0x8000  /* INTLDO_UV_STS */
+#define WM831X_INTLDO_UV_STS_SHIFT                  15  /* INTLDO_UV_STS */
+#define WM831X_INTLDO_UV_STS_WIDTH                   1  /* INTLDO_UV_STS */
+#define WM831X_LDO10_UV_STS                     0x0200  /* LDO10_UV_STS */
+#define WM831X_LDO10_UV_STS_MASK                0x0200  /* LDO10_UV_STS */
+#define WM831X_LDO10_UV_STS_SHIFT                    9  /* LDO10_UV_STS */
+#define WM831X_LDO10_UV_STS_WIDTH                    1  /* LDO10_UV_STS */
+#define WM831X_LDO9_UV_STS                      0x0100  /* LDO9_UV_STS */
+#define WM831X_LDO9_UV_STS_MASK                 0x0100  /* LDO9_UV_STS */
+#define WM831X_LDO9_UV_STS_SHIFT                     8  /* LDO9_UV_STS */
+#define WM831X_LDO9_UV_STS_WIDTH                     1  /* LDO9_UV_STS */
+#define WM831X_LDO8_UV_STS                      0x0080  /* LDO8_UV_STS */
+#define WM831X_LDO8_UV_STS_MASK                 0x0080  /* LDO8_UV_STS */
+#define WM831X_LDO8_UV_STS_SHIFT                     7  /* LDO8_UV_STS */
+#define WM831X_LDO8_UV_STS_WIDTH                     1  /* LDO8_UV_STS */
+#define WM831X_LDO7_UV_STS                      0x0040  /* LDO7_UV_STS */
+#define WM831X_LDO7_UV_STS_MASK                 0x0040  /* LDO7_UV_STS */
+#define WM831X_LDO7_UV_STS_SHIFT                     6  /* LDO7_UV_STS */
+#define WM831X_LDO7_UV_STS_WIDTH                     1  /* LDO7_UV_STS */
+#define WM831X_LDO6_UV_STS                      0x0020  /* LDO6_UV_STS */
+#define WM831X_LDO6_UV_STS_MASK                 0x0020  /* LDO6_UV_STS */
+#define WM831X_LDO6_UV_STS_SHIFT                     5  /* LDO6_UV_STS */
+#define WM831X_LDO6_UV_STS_WIDTH                     1  /* LDO6_UV_STS */
+#define WM831X_LDO5_UV_STS                      0x0010  /* LDO5_UV_STS */
+#define WM831X_LDO5_UV_STS_MASK                 0x0010  /* LDO5_UV_STS */
+#define WM831X_LDO5_UV_STS_SHIFT                     4  /* LDO5_UV_STS */
+#define WM831X_LDO5_UV_STS_WIDTH                     1  /* LDO5_UV_STS */
+#define WM831X_LDO4_UV_STS                      0x0008  /* LDO4_UV_STS */
+#define WM831X_LDO4_UV_STS_MASK                 0x0008  /* LDO4_UV_STS */
+#define WM831X_LDO4_UV_STS_SHIFT                     3  /* LDO4_UV_STS */
+#define WM831X_LDO4_UV_STS_WIDTH                     1  /* LDO4_UV_STS */
+#define WM831X_LDO3_UV_STS                      0x0004  /* LDO3_UV_STS */
+#define WM831X_LDO3_UV_STS_MASK                 0x0004  /* LDO3_UV_STS */
+#define WM831X_LDO3_UV_STS_SHIFT                     2  /* LDO3_UV_STS */
+#define WM831X_LDO3_UV_STS_WIDTH                     1  /* LDO3_UV_STS */
+#define WM831X_LDO2_UV_STS                      0x0002  /* LDO2_UV_STS */
+#define WM831X_LDO2_UV_STS_MASK                 0x0002  /* LDO2_UV_STS */
+#define WM831X_LDO2_UV_STS_SHIFT                     1  /* LDO2_UV_STS */
+#define WM831X_LDO2_UV_STS_WIDTH                     1  /* LDO2_UV_STS */
+#define WM831X_LDO1_UV_STS                      0x0001  /* LDO1_UV_STS */
+#define WM831X_LDO1_UV_STS_MASK                 0x0001  /* LDO1_UV_STS */
+#define WM831X_LDO1_UV_STS_SHIFT                     0  /* LDO1_UV_STS */
+#define WM831X_LDO1_UV_STS_WIDTH                     1  /* LDO1_UV_STS */
+
+/*
+ * R16470 (0x4056) - DC1 Control 1
+ */
+#define WM831X_DC1_RATE_MASK                    0xC000  /* DC1_RATE - [15:14] */
+#define WM831X_DC1_RATE_SHIFT                       14  /* DC1_RATE - [15:14] */
+#define WM831X_DC1_RATE_WIDTH                        2  /* DC1_RATE - [15:14] */
+#define WM831X_DC1_PHASE                        0x1000  /* DC1_PHASE */
+#define WM831X_DC1_PHASE_MASK                   0x1000  /* DC1_PHASE */
+#define WM831X_DC1_PHASE_SHIFT                      12  /* DC1_PHASE */
+#define WM831X_DC1_PHASE_WIDTH                       1  /* DC1_PHASE */
+#define WM831X_DC1_FREQ_MASK                    0x0300  /* DC1_FREQ - [9:8] */
+#define WM831X_DC1_FREQ_SHIFT                        8  /* DC1_FREQ - [9:8] */
+#define WM831X_DC1_FREQ_WIDTH                        2  /* DC1_FREQ - [9:8] */
+#define WM831X_DC1_FLT                          0x0080  /* DC1_FLT */
+#define WM831X_DC1_FLT_MASK                     0x0080  /* DC1_FLT */
+#define WM831X_DC1_FLT_SHIFT                         7  /* DC1_FLT */
+#define WM831X_DC1_FLT_WIDTH                         1  /* DC1_FLT */
+#define WM831X_DC1_SOFT_START_MASK              0x0030  /* DC1_SOFT_START - [5:4] */
+#define WM831X_DC1_SOFT_START_SHIFT                  4  /* DC1_SOFT_START - [5:4] */
+#define WM831X_DC1_SOFT_START_WIDTH                  2  /* DC1_SOFT_START - [5:4] */
+#define WM831X_DC1_CAP_MASK                     0x0003  /* DC1_CAP - [1:0] */
+#define WM831X_DC1_CAP_SHIFT                         0  /* DC1_CAP - [1:0] */
+#define WM831X_DC1_CAP_WIDTH                         2  /* DC1_CAP - [1:0] */
+
+/*
+ * R16471 (0x4057) - DC1 Control 2
+ */
+#define WM831X_DC1_ERR_ACT_MASK                 0xC000  /* DC1_ERR_ACT - [15:14] */
+#define WM831X_DC1_ERR_ACT_SHIFT                    14  /* DC1_ERR_ACT - [15:14] */
+#define WM831X_DC1_ERR_ACT_WIDTH                     2  /* DC1_ERR_ACT - [15:14] */
+#define WM831X_DC1_HWC_SRC_MASK                 0x1800  /* DC1_HWC_SRC - [12:11] */
+#define WM831X_DC1_HWC_SRC_SHIFT                    11  /* DC1_HWC_SRC - [12:11] */
+#define WM831X_DC1_HWC_SRC_WIDTH                     2  /* DC1_HWC_SRC - [12:11] */
+#define WM831X_DC1_HWC_VSEL                     0x0400  /* DC1_HWC_VSEL */
+#define WM831X_DC1_HWC_VSEL_MASK                0x0400  /* DC1_HWC_VSEL */
+#define WM831X_DC1_HWC_VSEL_SHIFT                   10  /* DC1_HWC_VSEL */
+#define WM831X_DC1_HWC_VSEL_WIDTH                    1  /* DC1_HWC_VSEL */
+#define WM831X_DC1_HWC_MODE_MASK                0x0300  /* DC1_HWC_MODE - [9:8] */
+#define WM831X_DC1_HWC_MODE_SHIFT                    8  /* DC1_HWC_MODE - [9:8] */
+#define WM831X_DC1_HWC_MODE_WIDTH                    2  /* DC1_HWC_MODE - [9:8] */
+#define WM831X_DC1_HC_THR_MASK                  0x0070  /* DC1_HC_THR - [6:4] */
+#define WM831X_DC1_HC_THR_SHIFT                      4  /* DC1_HC_THR - [6:4] */
+#define WM831X_DC1_HC_THR_WIDTH                      3  /* DC1_HC_THR - [6:4] */
+#define WM831X_DC1_HC_IND_ENA                   0x0001  /* DC1_HC_IND_ENA */
+#define WM831X_DC1_HC_IND_ENA_MASK              0x0001  /* DC1_HC_IND_ENA */
+#define WM831X_DC1_HC_IND_ENA_SHIFT                  0  /* DC1_HC_IND_ENA */
+#define WM831X_DC1_HC_IND_ENA_WIDTH                  1  /* DC1_HC_IND_ENA */
+
+/*
+ * R16472 (0x4058) - DC1 ON Config
+ */
+#define WM831X_DC1_ON_SLOT_MASK                 0xE000  /* DC1_ON_SLOT - [15:13] */
+#define WM831X_DC1_ON_SLOT_SHIFT                    13  /* DC1_ON_SLOT - [15:13] */
+#define WM831X_DC1_ON_SLOT_WIDTH                     3  /* DC1_ON_SLOT - [15:13] */
+#define WM831X_DC1_ON_MODE_MASK                 0x0300  /* DC1_ON_MODE - [9:8] */
+#define WM831X_DC1_ON_MODE_SHIFT                     8  /* DC1_ON_MODE - [9:8] */
+#define WM831X_DC1_ON_MODE_WIDTH                     2  /* DC1_ON_MODE - [9:8] */
+#define WM831X_DC1_ON_VSEL_MASK                 0x007F  /* DC1_ON_VSEL - [6:0] */
+#define WM831X_DC1_ON_VSEL_SHIFT                     0  /* DC1_ON_VSEL - [6:0] */
+#define WM831X_DC1_ON_VSEL_WIDTH                     7  /* DC1_ON_VSEL - [6:0] */
+
+/*
+ * R16473 (0x4059) - DC1 SLEEP Control
+ */
+#define WM831X_DC1_SLP_SLOT_MASK                0xE000  /* DC1_SLP_SLOT - [15:13] */
+#define WM831X_DC1_SLP_SLOT_SHIFT                   13  /* DC1_SLP_SLOT - [15:13] */
+#define WM831X_DC1_SLP_SLOT_WIDTH                    3  /* DC1_SLP_SLOT - [15:13] */
+#define WM831X_DC1_SLP_MODE_MASK                0x0300  /* DC1_SLP_MODE - [9:8] */
+#define WM831X_DC1_SLP_MODE_SHIFT                    8  /* DC1_SLP_MODE - [9:8] */
+#define WM831X_DC1_SLP_MODE_WIDTH                    2  /* DC1_SLP_MODE - [9:8] */
+#define WM831X_DC1_SLP_VSEL_MASK                0x007F  /* DC1_SLP_VSEL - [6:0] */
+#define WM831X_DC1_SLP_VSEL_SHIFT                    0  /* DC1_SLP_VSEL - [6:0] */
+#define WM831X_DC1_SLP_VSEL_WIDTH                    7  /* DC1_SLP_VSEL - [6:0] */
+
+/*
+ * R16474 (0x405A) - DC1 DVS Control
+ */
+#define WM831X_DC1_DVS_SRC_MASK                 0x1800  /* DC1_DVS_SRC - [12:11] */
+#define WM831X_DC1_DVS_SRC_SHIFT                    11  /* DC1_DVS_SRC - [12:11] */
+#define WM831X_DC1_DVS_SRC_WIDTH                     2  /* DC1_DVS_SRC - [12:11] */
+#define WM831X_DC1_DVS_VSEL_MASK                0x007F  /* DC1_DVS_VSEL - [6:0] */
+#define WM831X_DC1_DVS_VSEL_SHIFT                    0  /* DC1_DVS_VSEL - [6:0] */
+#define WM831X_DC1_DVS_VSEL_WIDTH                    7  /* DC1_DVS_VSEL - [6:0] */
+
+/*
+ * R16475 (0x405B) - DC2 Control 1
+ */
+#define WM831X_DC2_RATE_MASK                    0xC000  /* DC2_RATE - [15:14] */
+#define WM831X_DC2_RATE_SHIFT                       14  /* DC2_RATE - [15:14] */
+#define WM831X_DC2_RATE_WIDTH                        2  /* DC2_RATE - [15:14] */
+#define WM831X_DC2_PHASE                        0x1000  /* DC2_PHASE */
+#define WM831X_DC2_PHASE_MASK                   0x1000  /* DC2_PHASE */
+#define WM831X_DC2_PHASE_SHIFT                      12  /* DC2_PHASE */
+#define WM831X_DC2_PHASE_WIDTH                       1  /* DC2_PHASE */
+#define WM831X_DC2_FREQ_MASK                    0x0300  /* DC2_FREQ - [9:8] */
+#define WM831X_DC2_FREQ_SHIFT                        8  /* DC2_FREQ - [9:8] */
+#define WM831X_DC2_FREQ_WIDTH                        2  /* DC2_FREQ - [9:8] */
+#define WM831X_DC2_FLT                          0x0080  /* DC2_FLT */
+#define WM831X_DC2_FLT_MASK                     0x0080  /* DC2_FLT */
+#define WM831X_DC2_FLT_SHIFT                         7  /* DC2_FLT */
+#define WM831X_DC2_FLT_WIDTH                         1  /* DC2_FLT */
+#define WM831X_DC2_SOFT_START_MASK              0x0030  /* DC2_SOFT_START - [5:4] */
+#define WM831X_DC2_SOFT_START_SHIFT                  4  /* DC2_SOFT_START - [5:4] */
+#define WM831X_DC2_SOFT_START_WIDTH                  2  /* DC2_SOFT_START - [5:4] */
+#define WM831X_DC2_CAP_MASK                     0x0003  /* DC2_CAP - [1:0] */
+#define WM831X_DC2_CAP_SHIFT                         0  /* DC2_CAP - [1:0] */
+#define WM831X_DC2_CAP_WIDTH                         2  /* DC2_CAP - [1:0] */
+
+/*
+ * R16476 (0x405C) - DC2 Control 2
+ */
+#define WM831X_DC2_ERR_ACT_MASK                 0xC000  /* DC2_ERR_ACT - [15:14] */
+#define WM831X_DC2_ERR_ACT_SHIFT                    14  /* DC2_ERR_ACT - [15:14] */
+#define WM831X_DC2_ERR_ACT_WIDTH                     2  /* DC2_ERR_ACT - [15:14] */
+#define WM831X_DC2_HWC_SRC_MASK                 0x1800  /* DC2_HWC_SRC - [12:11] */
+#define WM831X_DC2_HWC_SRC_SHIFT                    11  /* DC2_HWC_SRC - [12:11] */
+#define WM831X_DC2_HWC_SRC_WIDTH                     2  /* DC2_HWC_SRC - [12:11] */
+#define WM831X_DC2_HWC_VSEL                     0x0400  /* DC2_HWC_VSEL */
+#define WM831X_DC2_HWC_VSEL_MASK                0x0400  /* DC2_HWC_VSEL */
+#define WM831X_DC2_HWC_VSEL_SHIFT                   10  /* DC2_HWC_VSEL */
+#define WM831X_DC2_HWC_VSEL_WIDTH                    1  /* DC2_HWC_VSEL */
+#define WM831X_DC2_HWC_MODE_MASK                0x0300  /* DC2_HWC_MODE - [9:8] */
+#define WM831X_DC2_HWC_MODE_SHIFT                    8  /* DC2_HWC_MODE - [9:8] */
+#define WM831X_DC2_HWC_MODE_WIDTH                    2  /* DC2_HWC_MODE - [9:8] */
+#define WM831X_DC2_HC_THR_MASK                  0x0070  /* DC2_HC_THR - [6:4] */
+#define WM831X_DC2_HC_THR_SHIFT                      4  /* DC2_HC_THR - [6:4] */
+#define WM831X_DC2_HC_THR_WIDTH                      3  /* DC2_HC_THR - [6:4] */
+#define WM831X_DC2_HC_IND_ENA                   0x0001  /* DC2_HC_IND_ENA */
+#define WM831X_DC2_HC_IND_ENA_MASK              0x0001  /* DC2_HC_IND_ENA */
+#define WM831X_DC2_HC_IND_ENA_SHIFT                  0  /* DC2_HC_IND_ENA */
+#define WM831X_DC2_HC_IND_ENA_WIDTH                  1  /* DC2_HC_IND_ENA */
+
+/*
+ * R16477 (0x405D) - DC2 ON Config
+ */
+#define WM831X_DC2_ON_SLOT_MASK                 0xE000  /* DC2_ON_SLOT - [15:13] */
+#define WM831X_DC2_ON_SLOT_SHIFT                    13  /* DC2_ON_SLOT - [15:13] */
+#define WM831X_DC2_ON_SLOT_WIDTH                     3  /* DC2_ON_SLOT - [15:13] */
+#define WM831X_DC2_ON_MODE_MASK                 0x0300  /* DC2_ON_MODE - [9:8] */
+#define WM831X_DC2_ON_MODE_SHIFT                     8  /* DC2_ON_MODE - [9:8] */
+#define WM831X_DC2_ON_MODE_WIDTH                     2  /* DC2_ON_MODE - [9:8] */
+#define WM831X_DC2_ON_VSEL_MASK                 0x007F  /* DC2_ON_VSEL - [6:0] */
+#define WM831X_DC2_ON_VSEL_SHIFT                     0  /* DC2_ON_VSEL - [6:0] */
+#define WM831X_DC2_ON_VSEL_WIDTH                     7  /* DC2_ON_VSEL - [6:0] */
+
+/*
+ * R16478 (0x405E) - DC2 SLEEP Control
+ */
+#define WM831X_DC2_SLP_SLOT_MASK                0xE000  /* DC2_SLP_SLOT - [15:13] */
+#define WM831X_DC2_SLP_SLOT_SHIFT                   13  /* DC2_SLP_SLOT - [15:13] */
+#define WM831X_DC2_SLP_SLOT_WIDTH                    3  /* DC2_SLP_SLOT - [15:13] */
+#define WM831X_DC2_SLP_MODE_MASK                0x0300  /* DC2_SLP_MODE - [9:8] */
+#define WM831X_DC2_SLP_MODE_SHIFT                    8  /* DC2_SLP_MODE - [9:8] */
+#define WM831X_DC2_SLP_MODE_WIDTH                    2  /* DC2_SLP_MODE - [9:8] */
+#define WM831X_DC2_SLP_VSEL_MASK                0x007F  /* DC2_SLP_VSEL - [6:0] */
+#define WM831X_DC2_SLP_VSEL_SHIFT                    0  /* DC2_SLP_VSEL - [6:0] */
+#define WM831X_DC2_SLP_VSEL_WIDTH                    7  /* DC2_SLP_VSEL - [6:0] */
+
+/*
+ * R16479 (0x405F) - DC2 DVS Control
+ */
+#define WM831X_DC2_DVS_SRC_MASK                 0x1800  /* DC2_DVS_SRC - [12:11] */
+#define WM831X_DC2_DVS_SRC_SHIFT                    11  /* DC2_DVS_SRC - [12:11] */
+#define WM831X_DC2_DVS_SRC_WIDTH                     2  /* DC2_DVS_SRC - [12:11] */
+#define WM831X_DC2_DVS_VSEL_MASK                0x007F  /* DC2_DVS_VSEL - [6:0] */
+#define WM831X_DC2_DVS_VSEL_SHIFT                    0  /* DC2_DVS_VSEL - [6:0] */
+#define WM831X_DC2_DVS_VSEL_WIDTH                    7  /* DC2_DVS_VSEL - [6:0] */
+
+/*
+ * R16480 (0x4060) - DC3 Control 1
+ */
+#define WM831X_DC3_PHASE                        0x1000  /* DC3_PHASE */
+#define WM831X_DC3_PHASE_MASK                   0x1000  /* DC3_PHASE */
+#define WM831X_DC3_PHASE_SHIFT                      12  /* DC3_PHASE */
+#define WM831X_DC3_PHASE_WIDTH                       1  /* DC3_PHASE */
+#define WM831X_DC3_FLT                          0x0080  /* DC3_FLT */
+#define WM831X_DC3_FLT_MASK                     0x0080  /* DC3_FLT */
+#define WM831X_DC3_FLT_SHIFT                         7  /* DC3_FLT */
+#define WM831X_DC3_FLT_WIDTH                         1  /* DC3_FLT */
+#define WM831X_DC3_SOFT_START_MASK              0x0030  /* DC3_SOFT_START - [5:4] */
+#define WM831X_DC3_SOFT_START_SHIFT                  4  /* DC3_SOFT_START - [5:4] */
+#define WM831X_DC3_SOFT_START_WIDTH                  2  /* DC3_SOFT_START - [5:4] */
+#define WM831X_DC3_STNBY_LIM_MASK               0x000C  /* DC3_STNBY_LIM - [3:2] */
+#define WM831X_DC3_STNBY_LIM_SHIFT                   2  /* DC3_STNBY_LIM - [3:2] */
+#define WM831X_DC3_STNBY_LIM_WIDTH                   2  /* DC3_STNBY_LIM - [3:2] */
+#define WM831X_DC3_CAP_MASK                     0x0003  /* DC3_CAP - [1:0] */
+#define WM831X_DC3_CAP_SHIFT                         0  /* DC3_CAP - [1:0] */
+#define WM831X_DC3_CAP_WIDTH                         2  /* DC3_CAP - [1:0] */
+
+/*
+ * R16481 (0x4061) - DC3 Control 2
+ */
+#define WM831X_DC3_ERR_ACT_MASK                 0xC000  /* DC3_ERR_ACT - [15:14] */
+#define WM831X_DC3_ERR_ACT_SHIFT                    14  /* DC3_ERR_ACT - [15:14] */
+#define WM831X_DC3_ERR_ACT_WIDTH                     2  /* DC3_ERR_ACT - [15:14] */
+#define WM831X_DC3_HWC_SRC_MASK                 0x1800  /* DC3_HWC_SRC - [12:11] */
+#define WM831X_DC3_HWC_SRC_SHIFT                    11  /* DC3_HWC_SRC - [12:11] */
+#define WM831X_DC3_HWC_SRC_WIDTH                     2  /* DC3_HWC_SRC - [12:11] */
+#define WM831X_DC3_HWC_VSEL                     0x0400  /* DC3_HWC_VSEL */
+#define WM831X_DC3_HWC_VSEL_MASK                0x0400  /* DC3_HWC_VSEL */
+#define WM831X_DC3_HWC_VSEL_SHIFT                   10  /* DC3_HWC_VSEL */
+#define WM831X_DC3_HWC_VSEL_WIDTH                    1  /* DC3_HWC_VSEL */
+#define WM831X_DC3_HWC_MODE_MASK                0x0300  /* DC3_HWC_MODE - [9:8] */
+#define WM831X_DC3_HWC_MODE_SHIFT                    8  /* DC3_HWC_MODE - [9:8] */
+#define WM831X_DC3_HWC_MODE_WIDTH                    2  /* DC3_HWC_MODE - [9:8] */
+#define WM831X_DC3_OVP                          0x0080  /* DC3_OVP */
+#define WM831X_DC3_OVP_MASK                     0x0080  /* DC3_OVP */
+#define WM831X_DC3_OVP_SHIFT                         7  /* DC3_OVP */
+#define WM831X_DC3_OVP_WIDTH                         1  /* DC3_OVP */
+
+/*
+ * R16482 (0x4062) - DC3 ON Config
+ */
+#define WM831X_DC3_ON_SLOT_MASK                 0xE000  /* DC3_ON_SLOT - [15:13] */
+#define WM831X_DC3_ON_SLOT_SHIFT                    13  /* DC3_ON_SLOT - [15:13] */
+#define WM831X_DC3_ON_SLOT_WIDTH                     3  /* DC3_ON_SLOT - [15:13] */
+#define WM831X_DC3_ON_MODE_MASK                 0x0300  /* DC3_ON_MODE - [9:8] */
+#define WM831X_DC3_ON_MODE_SHIFT                     8  /* DC3_ON_MODE - [9:8] */
+#define WM831X_DC3_ON_MODE_WIDTH                     2  /* DC3_ON_MODE - [9:8] */
+#define WM831X_DC3_ON_VSEL_MASK                 0x007F  /* DC3_ON_VSEL - [6:0] */
+#define WM831X_DC3_ON_VSEL_SHIFT                     0  /* DC3_ON_VSEL - [6:0] */
+#define WM831X_DC3_ON_VSEL_WIDTH                     7  /* DC3_ON_VSEL - [6:0] */
+
+/*
+ * R16483 (0x4063) - DC3 SLEEP Control
+ */
+#define WM831X_DC3_SLP_SLOT_MASK                0xE000  /* DC3_SLP_SLOT - [15:13] */
+#define WM831X_DC3_SLP_SLOT_SHIFT                   13  /* DC3_SLP_SLOT - [15:13] */
+#define WM831X_DC3_SLP_SLOT_WIDTH                    3  /* DC3_SLP_SLOT - [15:13] */
+#define WM831X_DC3_SLP_MODE_MASK                0x0300  /* DC3_SLP_MODE - [9:8] */
+#define WM831X_DC3_SLP_MODE_SHIFT                    8  /* DC3_SLP_MODE - [9:8] */
+#define WM831X_DC3_SLP_MODE_WIDTH                    2  /* DC3_SLP_MODE - [9:8] */
+#define WM831X_DC3_SLP_VSEL_MASK                0x007F  /* DC3_SLP_VSEL - [6:0] */
+#define WM831X_DC3_SLP_VSEL_SHIFT                    0  /* DC3_SLP_VSEL - [6:0] */
+#define WM831X_DC3_SLP_VSEL_WIDTH                    7  /* DC3_SLP_VSEL - [6:0] */
+
+/*
+ * R16484 (0x4064) - DC4 Control
+ */
+#define WM831X_DC4_ERR_ACT_MASK                 0xC000  /* DC4_ERR_ACT - [15:14] */
+#define WM831X_DC4_ERR_ACT_SHIFT                    14  /* DC4_ERR_ACT - [15:14] */
+#define WM831X_DC4_ERR_ACT_WIDTH                     2  /* DC4_ERR_ACT - [15:14] */
+#define WM831X_DC4_HWC_SRC_MASK                 0x1800  /* DC4_HWC_SRC - [12:11] */
+#define WM831X_DC4_HWC_SRC_SHIFT                    11  /* DC4_HWC_SRC - [12:11] */
+#define WM831X_DC4_HWC_SRC_WIDTH                     2  /* DC4_HWC_SRC - [12:11] */
+#define WM831X_DC4_HWC_MODE                     0x0100  /* DC4_HWC_MODE */
+#define WM831X_DC4_HWC_MODE_MASK                0x0100  /* DC4_HWC_MODE */
+#define WM831X_DC4_HWC_MODE_SHIFT                    8  /* DC4_HWC_MODE */
+#define WM831X_DC4_HWC_MODE_WIDTH                    1  /* DC4_HWC_MODE */
+#define WM831X_DC4_RANGE_MASK                   0x000C  /* DC4_RANGE - [3:2] */
+#define WM831X_DC4_RANGE_SHIFT                       2  /* DC4_RANGE - [3:2] */
+#define WM831X_DC4_RANGE_WIDTH                       2  /* DC4_RANGE - [3:2] */
+#define WM831X_DC4_FBSRC                        0x0001  /* DC4_FBSRC */
+#define WM831X_DC4_FBSRC_MASK                   0x0001  /* DC4_FBSRC */
+#define WM831X_DC4_FBSRC_SHIFT                       0  /* DC4_FBSRC */
+#define WM831X_DC4_FBSRC_WIDTH                       1  /* DC4_FBSRC */
+
+/*
+ * R16485 (0x4065) - DC4 SLEEP Control
+ */
+#define WM831X_DC4_SLPENA                       0x0100  /* DC4_SLPENA */
+#define WM831X_DC4_SLPENA_MASK                  0x0100  /* DC4_SLPENA */
+#define WM831X_DC4_SLPENA_SHIFT                      8  /* DC4_SLPENA */
+#define WM831X_DC4_SLPENA_WIDTH                      1  /* DC4_SLPENA */
+
+/*
+ * R16526 (0x408E) - Power Good Source 1
+ */
+#define WM831X_DC4_OK                           0x0008  /* DC4_OK */
+#define WM831X_DC4_OK_MASK                      0x0008  /* DC4_OK */
+#define WM831X_DC4_OK_SHIFT                          3  /* DC4_OK */
+#define WM831X_DC4_OK_WIDTH                          1  /* DC4_OK */
+#define WM831X_DC3_OK                           0x0004  /* DC3_OK */
+#define WM831X_DC3_OK_MASK                      0x0004  /* DC3_OK */
+#define WM831X_DC3_OK_SHIFT                          2  /* DC3_OK */
+#define WM831X_DC3_OK_WIDTH                          1  /* DC3_OK */
+#define WM831X_DC2_OK                           0x0002  /* DC2_OK */
+#define WM831X_DC2_OK_MASK                      0x0002  /* DC2_OK */
+#define WM831X_DC2_OK_SHIFT                          1  /* DC2_OK */
+#define WM831X_DC2_OK_WIDTH                          1  /* DC2_OK */
+#define WM831X_DC1_OK                           0x0001  /* DC1_OK */
+#define WM831X_DC1_OK_MASK                      0x0001  /* DC1_OK */
+#define WM831X_DC1_OK_SHIFT                          0  /* DC1_OK */
+#define WM831X_DC1_OK_WIDTH                          1  /* DC1_OK */
+
 #define WM831X_ISINK_MAX_ISEL 56
 extern int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL];
 
-- 
cgit v1.2.3-71-gd317


From d1c6b4fe668b2ae02f490deee86eaab60822a362 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 28 Jul 2009 15:22:02 +0100
Subject: regulator: Add WM831x LDO support

The WM831x series of devices provide three types of LDO:

 - General purpose LDOs supporting voltages from 0.9-3.3V
 - High performance analogue LDOs supporting voltages from 1-3.5V
 - Very low power consumption LDOs intended to support always on
   functionality.

This patch adds support for all three kinds of LDO. Each regulator
is probed as an individual platform device with resources used to
provide the register map location of the regulator. Mixed hardware
and software control of regulators is not current supported.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Liam Girdwood <lrg@slimlogic.co.uk>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/regulator/Makefile           |   1 +
 drivers/regulator/wm831x-ldo.c       | 852 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/regulator.h | 626 +++++++++++++++++++++++++
 3 files changed, 1479 insertions(+)
 create mode 100644 drivers/regulator/wm831x-ldo.c

(limited to 'include/linux')

diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index b1d2b826f532..a0a635fdae87 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_REGULATOR_LP3971) += lp3971.o
 obj-$(CONFIG_REGULATOR_MAX1586) += max1586.o
 obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o
 obj-$(CONFIG_REGULATOR_WM831X) += wm831x-dcdc.o
+obj-$(CONFIG_REGULATOR_WM831X) += wm831x-ldo.o
 obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o
 obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o
 obj-$(CONFIG_REGULATOR_DA903X)	+= da903x.o
diff --git a/drivers/regulator/wm831x-ldo.c b/drivers/regulator/wm831x-ldo.c
new file mode 100644
index 000000000000..bb61aede4801
--- /dev/null
+++ b/drivers/regulator/wm831x-ldo.c
@@ -0,0 +1,852 @@
+/*
+ * wm831x-ldo.c  --  LDO driver for the WM831x series
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/driver.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/regulator.h>
+#include <linux/mfd/wm831x/pdata.h>
+
+#define WM831X_LDO_MAX_NAME 6
+
+#define WM831X_LDO_CONTROL       0
+#define WM831X_LDO_ON_CONTROL    1
+#define WM831X_LDO_SLEEP_CONTROL 2
+
+#define WM831X_ALIVE_LDO_ON_CONTROL    0
+#define WM831X_ALIVE_LDO_SLEEP_CONTROL 1
+
+struct wm831x_ldo {
+	char name[WM831X_LDO_MAX_NAME];
+	struct regulator_desc desc;
+	int base;
+	struct wm831x *wm831x;
+	struct regulator_dev *regulator;
+};
+
+/*
+ * Shared
+ */
+
+static int wm831x_ldo_is_enabled(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+	int reg;
+
+	reg = wm831x_reg_read(wm831x, WM831X_LDO_ENABLE);
+	if (reg < 0)
+		return reg;
+
+	if (reg & mask)
+		return 1;
+	else
+		return 0;
+}
+
+static int wm831x_ldo_enable(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+
+	return wm831x_set_bits(wm831x, WM831X_LDO_ENABLE, mask, mask);
+}
+
+static int wm831x_ldo_disable(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+
+	return wm831x_set_bits(wm831x, WM831X_LDO_ENABLE, mask, 0);
+}
+
+static irqreturn_t wm831x_ldo_uv_irq(int irq, void *data)
+{
+	struct wm831x_ldo *ldo = data;
+
+	regulator_notifier_call_chain(ldo->regulator,
+				      REGULATOR_EVENT_UNDER_VOLTAGE,
+				      NULL);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * General purpose LDOs
+ */
+
+#define WM831X_GP_LDO_SELECTOR_LOW 0xe
+#define WM831X_GP_LDO_MAX_SELECTOR 0x1f
+
+static int wm831x_gp_ldo_list_voltage(struct regulator_dev *rdev,
+				      unsigned int selector)
+{
+	/* 0.9-1.6V in 50mV steps */
+	if (selector <= WM831X_GP_LDO_SELECTOR_LOW)
+		return 900000 + (selector * 50000);
+	/* 1.7-3.3V in 50mV steps */
+	if (selector <= WM831X_GP_LDO_MAX_SELECTOR)
+		return 1600000 + ((selector - WM831X_GP_LDO_SELECTOR_LOW)
+				  * 100000);
+	return -EINVAL;
+}
+
+static int wm831x_gp_ldo_set_voltage_int(struct regulator_dev *rdev, int reg,
+					 int min_uV, int max_uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int vsel, ret;
+
+	if (min_uV < 900000)
+		vsel = 0;
+	else if (min_uV < 1700000)
+		vsel = ((min_uV - 900000) / 50000);
+	else
+		vsel = ((min_uV - 1700000) / 100000)
+			+ WM831X_GP_LDO_SELECTOR_LOW + 1;
+
+	ret = wm831x_gp_ldo_list_voltage(rdev, vsel);
+	if (ret < 0)
+		return ret;
+	if (ret < min_uV || ret > max_uV)
+		return -EINVAL;
+
+	return wm831x_set_bits(wm831x, reg, WM831X_LDO1_ON_VSEL_MASK, vsel);
+}
+
+static int wm831x_gp_ldo_set_voltage(struct regulator_dev *rdev,
+				     int min_uV, int max_uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	int reg = ldo->base + WM831X_LDO_ON_CONTROL;
+
+	return wm831x_gp_ldo_set_voltage_int(rdev, reg, min_uV, max_uV);
+}
+
+static int wm831x_gp_ldo_set_suspend_voltage(struct regulator_dev *rdev,
+					     int uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	int reg = ldo->base + WM831X_LDO_SLEEP_CONTROL;
+
+	return wm831x_gp_ldo_set_voltage_int(rdev, reg, uV, uV);
+}
+
+static int wm831x_gp_ldo_get_voltage(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int reg = ldo->base + WM831X_LDO_ON_CONTROL;
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, reg);
+	if (ret < 0)
+		return ret;
+
+	ret &= WM831X_LDO1_ON_VSEL_MASK;
+
+	return wm831x_gp_ldo_list_voltage(rdev, ret);
+}
+
+static unsigned int wm831x_gp_ldo_get_mode(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int ctrl_reg = ldo->base + WM831X_LDO_CONTROL;
+	int on_reg = ldo->base + WM831X_LDO_ON_CONTROL;
+	unsigned int ret;
+
+	ret = wm831x_reg_read(wm831x, on_reg);
+	if (ret < 0)
+		return 0;
+
+	if (!(ret & WM831X_LDO1_ON_MODE))
+		return REGULATOR_MODE_NORMAL;
+
+	ret = wm831x_reg_read(wm831x, ctrl_reg);
+	if (ret < 0)
+		return 0;
+
+	if (ret & WM831X_LDO1_LP_MODE)
+		return REGULATOR_MODE_STANDBY;
+	else
+		return REGULATOR_MODE_IDLE;
+}
+
+static int wm831x_gp_ldo_set_mode(struct regulator_dev *rdev,
+				  unsigned int mode)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int ctrl_reg = ldo->base + WM831X_LDO_CONTROL;
+	int on_reg = ldo->base + WM831X_LDO_ON_CONTROL;
+	int ret;
+
+
+	switch (mode) {
+	case REGULATOR_MODE_NORMAL:
+		ret = wm831x_set_bits(wm831x, on_reg,
+				      WM831X_LDO1_ON_MODE, 0);
+		if (ret < 0)
+			return ret;
+		break;
+
+	case REGULATOR_MODE_IDLE:
+		ret = wm831x_set_bits(wm831x, ctrl_reg,
+				      WM831X_LDO1_LP_MODE,
+				      WM831X_LDO1_LP_MODE);
+		if (ret < 0)
+			return ret;
+
+		ret = wm831x_set_bits(wm831x, on_reg,
+				      WM831X_LDO1_ON_MODE,
+				      WM831X_LDO1_ON_MODE);
+		if (ret < 0)
+			return ret;
+
+	case REGULATOR_MODE_STANDBY:
+		ret = wm831x_set_bits(wm831x, ctrl_reg,
+				      WM831X_LDO1_LP_MODE, 0);
+		if (ret < 0)
+			return ret;
+
+		ret = wm831x_set_bits(wm831x, on_reg,
+				      WM831X_LDO1_ON_MODE,
+				      WM831X_LDO1_ON_MODE);
+		if (ret < 0)
+			return ret;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int wm831x_gp_ldo_get_status(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+	int ret;
+
+	/* Is the regulator on? */
+	ret = wm831x_reg_read(wm831x, WM831X_LDO_STATUS);
+	if (ret < 0)
+		return ret;
+	if (!(ret & mask))
+		return REGULATOR_STATUS_OFF;
+
+	/* Is it reporting under voltage? */
+	ret = wm831x_reg_read(wm831x, WM831X_LDO_UV_STATUS);
+	if (ret & mask)
+		return REGULATOR_STATUS_ERROR;
+
+	ret = wm831x_gp_ldo_get_mode(rdev);
+	if (ret < 0)
+		return ret;
+	else
+		return regulator_mode_to_status(ret);
+}
+
+static unsigned int wm831x_gp_ldo_get_optimum_mode(struct regulator_dev *rdev,
+						   int input_uV,
+						   int output_uV, int load_uA)
+{
+	if (load_uA < 20000)
+		return REGULATOR_MODE_STANDBY;
+	if (load_uA < 50000)
+		return REGULATOR_MODE_IDLE;
+	return REGULATOR_MODE_NORMAL;
+}
+
+
+static struct regulator_ops wm831x_gp_ldo_ops = {
+	.list_voltage = wm831x_gp_ldo_list_voltage,
+	.get_voltage = wm831x_gp_ldo_get_voltage,
+	.set_voltage = wm831x_gp_ldo_set_voltage,
+	.set_suspend_voltage = wm831x_gp_ldo_set_suspend_voltage,
+	.get_mode = wm831x_gp_ldo_get_mode,
+	.set_mode = wm831x_gp_ldo_set_mode,
+	.get_status = wm831x_gp_ldo_get_status,
+	.get_optimum_mode = wm831x_gp_ldo_get_optimum_mode,
+
+	.is_enabled = wm831x_ldo_is_enabled,
+	.enable = wm831x_ldo_enable,
+	.disable = wm831x_ldo_disable,
+};
+
+static __devinit int wm831x_gp_ldo_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
+	int id = pdev->id % ARRAY_SIZE(pdata->ldo);
+	struct wm831x_ldo *ldo;
+	struct resource *res;
+	int ret, irq;
+
+	dev_dbg(&pdev->dev, "Probing LDO%d\n", id + 1);
+
+	if (pdata == NULL || pdata->ldo[id] == NULL)
+		return -ENODEV;
+
+	ldo = kzalloc(sizeof(struct wm831x_ldo), GFP_KERNEL);
+	if (ldo == NULL) {
+		dev_err(&pdev->dev, "Unable to allocate private data\n");
+		return -ENOMEM;
+	}
+
+	ldo->wm831x = wm831x;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (res == NULL) {
+		dev_err(&pdev->dev, "No I/O resource\n");
+		ret = -EINVAL;
+		goto err;
+	}
+	ldo->base = res->start;
+
+	snprintf(ldo->name, sizeof(ldo->name), "LDO%d", id + 1);
+	ldo->desc.name = ldo->name;
+	ldo->desc.id = id;
+	ldo->desc.type = REGULATOR_VOLTAGE;
+	ldo->desc.n_voltages = WM831X_GP_LDO_MAX_SELECTOR + 1;
+	ldo->desc.ops = &wm831x_gp_ldo_ops;
+	ldo->desc.owner = THIS_MODULE;
+
+	ldo->regulator = regulator_register(&ldo->desc, &pdev->dev,
+					     pdata->ldo[id], ldo);
+	if (IS_ERR(ldo->regulator)) {
+		ret = PTR_ERR(ldo->regulator);
+		dev_err(wm831x->dev, "Failed to register LDO%d: %d\n",
+			id + 1, ret);
+		goto err;
+	}
+
+	irq = platform_get_irq_byname(pdev, "UV");
+	ret = wm831x_request_irq(wm831x, irq, wm831x_ldo_uv_irq,
+				 IRQF_TRIGGER_RISING, ldo->name,
+				 ldo);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
+			irq, ret);
+		goto err_regulator;
+	}
+
+	platform_set_drvdata(pdev, ldo);
+
+	return 0;
+
+err_regulator:
+	regulator_unregister(ldo->regulator);
+err:
+	kfree(ldo);
+	return ret;
+}
+
+static __devexit int wm831x_gp_ldo_remove(struct platform_device *pdev)
+{
+	struct wm831x_ldo *ldo = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = ldo->wm831x;
+
+	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), ldo);
+	regulator_unregister(ldo->regulator);
+	kfree(ldo);
+
+	return 0;
+}
+
+static struct platform_driver wm831x_gp_ldo_driver = {
+	.probe = wm831x_gp_ldo_probe,
+	.remove = __devexit_p(wm831x_gp_ldo_remove),
+	.driver		= {
+		.name	= "wm831x-ldo",
+	},
+};
+
+/*
+ * Analogue LDOs
+ */
+
+
+#define WM831X_ALDO_SELECTOR_LOW 0xc
+#define WM831X_ALDO_MAX_SELECTOR 0x1f
+
+static int wm831x_aldo_list_voltage(struct regulator_dev *rdev,
+				      unsigned int selector)
+{
+	/* 1-1.6V in 50mV steps */
+	if (selector <= WM831X_ALDO_SELECTOR_LOW)
+		return 1000000 + (selector * 50000);
+	/* 1.7-3.5V in 50mV steps */
+	if (selector <= WM831X_ALDO_MAX_SELECTOR)
+		return 1600000 + ((selector - WM831X_ALDO_SELECTOR_LOW)
+				  * 100000);
+	return -EINVAL;
+}
+
+static int wm831x_aldo_set_voltage_int(struct regulator_dev *rdev, int reg,
+					 int min_uV, int max_uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int vsel, ret;
+
+	if (min_uV < 1000000)
+		vsel = 0;
+	else if (min_uV < 1700000)
+		vsel = ((min_uV - 1000000) / 50000);
+	else
+		vsel = ((min_uV - 1700000) / 100000)
+			+ WM831X_ALDO_SELECTOR_LOW + 1;
+
+	ret = wm831x_aldo_list_voltage(rdev, vsel);
+	if (ret < 0)
+		return ret;
+	if (ret < min_uV || ret > max_uV)
+		return -EINVAL;
+
+	return wm831x_set_bits(wm831x, reg, WM831X_LDO7_ON_VSEL_MASK, vsel);
+}
+
+static int wm831x_aldo_set_voltage(struct regulator_dev *rdev,
+				     int min_uV, int max_uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	int reg = ldo->base + WM831X_LDO_ON_CONTROL;
+
+	return wm831x_aldo_set_voltage_int(rdev, reg, min_uV, max_uV);
+}
+
+static int wm831x_aldo_set_suspend_voltage(struct regulator_dev *rdev,
+					     int uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	int reg = ldo->base + WM831X_LDO_SLEEP_CONTROL;
+
+	return wm831x_aldo_set_voltage_int(rdev, reg, uV, uV);
+}
+
+static int wm831x_aldo_get_voltage(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int reg = ldo->base + WM831X_LDO_ON_CONTROL;
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, reg);
+	if (ret < 0)
+		return ret;
+
+	ret &= WM831X_LDO7_ON_VSEL_MASK;
+
+	return wm831x_aldo_list_voltage(rdev, ret);
+}
+
+static unsigned int wm831x_aldo_get_mode(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int on_reg = ldo->base + WM831X_LDO_ON_CONTROL;
+	unsigned int ret;
+
+	ret = wm831x_reg_read(wm831x, on_reg);
+	if (ret < 0)
+		return 0;
+
+	if (ret & WM831X_LDO7_ON_MODE)
+		return REGULATOR_MODE_IDLE;
+	else
+		return REGULATOR_MODE_NORMAL;
+}
+
+static int wm831x_aldo_set_mode(struct regulator_dev *rdev,
+				  unsigned int mode)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int ctrl_reg = ldo->base + WM831X_LDO_CONTROL;
+	int on_reg = ldo->base + WM831X_LDO_ON_CONTROL;
+	int ret;
+
+
+	switch (mode) {
+	case REGULATOR_MODE_NORMAL:
+		ret = wm831x_set_bits(wm831x, on_reg,
+				      WM831X_LDO7_ON_MODE, 0);
+		if (ret < 0)
+			return ret;
+		break;
+
+	case REGULATOR_MODE_IDLE:
+		ret = wm831x_set_bits(wm831x, ctrl_reg,
+				      WM831X_LDO7_ON_MODE,
+				      WM831X_LDO7_ON_MODE);
+		if (ret < 0)
+			return ret;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int wm831x_aldo_get_status(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+	int ret;
+
+	/* Is the regulator on? */
+	ret = wm831x_reg_read(wm831x, WM831X_LDO_STATUS);
+	if (ret < 0)
+		return ret;
+	if (!(ret & mask))
+		return REGULATOR_STATUS_OFF;
+
+	/* Is it reporting under voltage? */
+	ret = wm831x_reg_read(wm831x, WM831X_LDO_UV_STATUS);
+	if (ret & mask)
+		return REGULATOR_STATUS_ERROR;
+
+	ret = wm831x_aldo_get_mode(rdev);
+	if (ret < 0)
+		return ret;
+	else
+		return regulator_mode_to_status(ret);
+}
+
+static struct regulator_ops wm831x_aldo_ops = {
+	.list_voltage = wm831x_aldo_list_voltage,
+	.get_voltage = wm831x_aldo_get_voltage,
+	.set_voltage = wm831x_aldo_set_voltage,
+	.set_suspend_voltage = wm831x_aldo_set_suspend_voltage,
+	.get_mode = wm831x_aldo_get_mode,
+	.set_mode = wm831x_aldo_set_mode,
+	.get_status = wm831x_aldo_get_status,
+
+	.is_enabled = wm831x_ldo_is_enabled,
+	.enable = wm831x_ldo_enable,
+	.disable = wm831x_ldo_disable,
+};
+
+static __devinit int wm831x_aldo_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
+	int id = pdev->id % ARRAY_SIZE(pdata->ldo);
+	struct wm831x_ldo *ldo;
+	struct resource *res;
+	int ret, irq;
+
+	dev_dbg(&pdev->dev, "Probing LDO%d\n", id + 1);
+
+	if (pdata == NULL || pdata->ldo[id] == NULL)
+		return -ENODEV;
+
+	ldo = kzalloc(sizeof(struct wm831x_ldo), GFP_KERNEL);
+	if (ldo == NULL) {
+		dev_err(&pdev->dev, "Unable to allocate private data\n");
+		return -ENOMEM;
+	}
+
+	ldo->wm831x = wm831x;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (res == NULL) {
+		dev_err(&pdev->dev, "No I/O resource\n");
+		ret = -EINVAL;
+		goto err;
+	}
+	ldo->base = res->start;
+
+	snprintf(ldo->name, sizeof(ldo->name), "LDO%d", id + 1);
+	ldo->desc.name = ldo->name;
+	ldo->desc.id = id;
+	ldo->desc.type = REGULATOR_VOLTAGE;
+	ldo->desc.n_voltages = WM831X_ALDO_MAX_SELECTOR + 1;
+	ldo->desc.ops = &wm831x_aldo_ops;
+	ldo->desc.owner = THIS_MODULE;
+
+	ldo->regulator = regulator_register(&ldo->desc, &pdev->dev,
+					     pdata->ldo[id], ldo);
+	if (IS_ERR(ldo->regulator)) {
+		ret = PTR_ERR(ldo->regulator);
+		dev_err(wm831x->dev, "Failed to register LDO%d: %d\n",
+			id + 1, ret);
+		goto err;
+	}
+
+	irq = platform_get_irq_byname(pdev, "UV");
+	ret = wm831x_request_irq(wm831x, irq, wm831x_ldo_uv_irq,
+				 IRQF_TRIGGER_RISING, ldo->name,
+				 ldo);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
+			irq, ret);
+		goto err_regulator;
+	}
+
+	platform_set_drvdata(pdev, ldo);
+
+	return 0;
+
+err_regulator:
+	regulator_unregister(ldo->regulator);
+err:
+	kfree(ldo);
+	return ret;
+}
+
+static __devexit int wm831x_aldo_remove(struct platform_device *pdev)
+{
+	struct wm831x_ldo *ldo = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = ldo->wm831x;
+
+	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), ldo);
+	regulator_unregister(ldo->regulator);
+	kfree(ldo);
+
+	return 0;
+}
+
+static struct platform_driver wm831x_aldo_driver = {
+	.probe = wm831x_aldo_probe,
+	.remove = __devexit_p(wm831x_aldo_remove),
+	.driver		= {
+		.name	= "wm831x-aldo",
+	},
+};
+
+/*
+ * Alive LDO
+ */
+
+#define WM831X_ALIVE_LDO_MAX_SELECTOR 0xf
+
+static int wm831x_alive_ldo_list_voltage(struct regulator_dev *rdev,
+				      unsigned int selector)
+{
+	/* 0.8-1.55V in 50mV steps */
+	if (selector <= WM831X_ALIVE_LDO_MAX_SELECTOR)
+		return 800000 + (selector * 50000);
+	return -EINVAL;
+}
+
+static int wm831x_alive_ldo_set_voltage_int(struct regulator_dev *rdev,
+					    int reg,
+					    int min_uV, int max_uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int vsel, ret;
+
+	vsel = (min_uV - 800000) / 50000;
+
+	ret = wm831x_alive_ldo_list_voltage(rdev, vsel);
+	if (ret < 0)
+		return ret;
+	if (ret < min_uV || ret > max_uV)
+		return -EINVAL;
+
+	return wm831x_set_bits(wm831x, reg, WM831X_LDO11_ON_VSEL_MASK, vsel);
+}
+
+static int wm831x_alive_ldo_set_voltage(struct regulator_dev *rdev,
+				     int min_uV, int max_uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	int reg = ldo->base + WM831X_ALIVE_LDO_ON_CONTROL;
+
+	return wm831x_alive_ldo_set_voltage_int(rdev, reg, min_uV, max_uV);
+}
+
+static int wm831x_alive_ldo_set_suspend_voltage(struct regulator_dev *rdev,
+					     int uV)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	int reg = ldo->base + WM831X_ALIVE_LDO_SLEEP_CONTROL;
+
+	return wm831x_alive_ldo_set_voltage_int(rdev, reg, uV, uV);
+}
+
+static int wm831x_alive_ldo_get_voltage(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int reg = ldo->base + WM831X_ALIVE_LDO_ON_CONTROL;
+	int ret;
+
+	ret = wm831x_reg_read(wm831x, reg);
+	if (ret < 0)
+		return ret;
+
+	ret &= WM831X_LDO11_ON_VSEL_MASK;
+
+	return wm831x_alive_ldo_list_voltage(rdev, ret);
+}
+
+static int wm831x_alive_ldo_get_status(struct regulator_dev *rdev)
+{
+	struct wm831x_ldo *ldo = rdev_get_drvdata(rdev);
+	struct wm831x *wm831x = ldo->wm831x;
+	int mask = 1 << rdev_get_id(rdev);
+	int ret;
+
+	/* Is the regulator on? */
+	ret = wm831x_reg_read(wm831x, WM831X_LDO_STATUS);
+	if (ret < 0)
+		return ret;
+	if (ret & mask)
+		return REGULATOR_STATUS_ON;
+	else
+		return REGULATOR_STATUS_OFF;
+}
+
+static struct regulator_ops wm831x_alive_ldo_ops = {
+	.list_voltage = wm831x_alive_ldo_list_voltage,
+	.get_voltage = wm831x_alive_ldo_get_voltage,
+	.set_voltage = wm831x_alive_ldo_set_voltage,
+	.set_suspend_voltage = wm831x_alive_ldo_set_suspend_voltage,
+	.get_status = wm831x_alive_ldo_get_status,
+
+	.is_enabled = wm831x_ldo_is_enabled,
+	.enable = wm831x_ldo_enable,
+	.disable = wm831x_ldo_disable,
+};
+
+static __devinit int wm831x_alive_ldo_probe(struct platform_device *pdev)
+{
+	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
+	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
+	int id = pdev->id % ARRAY_SIZE(pdata->ldo);
+	struct wm831x_ldo *ldo;
+	struct resource *res;
+	int ret;
+
+	dev_dbg(&pdev->dev, "Probing LDO%d\n", id + 1);
+
+	if (pdata == NULL || pdata->ldo[id] == NULL)
+		return -ENODEV;
+
+	ldo = kzalloc(sizeof(struct wm831x_ldo), GFP_KERNEL);
+	if (ldo == NULL) {
+		dev_err(&pdev->dev, "Unable to allocate private data\n");
+		return -ENOMEM;
+	}
+
+	ldo->wm831x = wm831x;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (res == NULL) {
+		dev_err(&pdev->dev, "No I/O resource\n");
+		ret = -EINVAL;
+		goto err;
+	}
+	ldo->base = res->start;
+
+	snprintf(ldo->name, sizeof(ldo->name), "LDO%d", id + 1);
+	ldo->desc.name = ldo->name;
+	ldo->desc.id = id;
+	ldo->desc.type = REGULATOR_VOLTAGE;
+	ldo->desc.n_voltages = WM831X_ALIVE_LDO_MAX_SELECTOR + 1;
+	ldo->desc.ops = &wm831x_alive_ldo_ops;
+	ldo->desc.owner = THIS_MODULE;
+
+	ldo->regulator = regulator_register(&ldo->desc, &pdev->dev,
+					     pdata->ldo[id], ldo);
+	if (IS_ERR(ldo->regulator)) {
+		ret = PTR_ERR(ldo->regulator);
+		dev_err(wm831x->dev, "Failed to register LDO%d: %d\n",
+			id + 1, ret);
+		goto err;
+	}
+
+	platform_set_drvdata(pdev, ldo);
+
+	return 0;
+
+err:
+	kfree(ldo);
+	return ret;
+}
+
+static __devexit int wm831x_alive_ldo_remove(struct platform_device *pdev)
+{
+	struct wm831x_ldo *ldo = platform_get_drvdata(pdev);
+
+	regulator_unregister(ldo->regulator);
+	kfree(ldo);
+
+	return 0;
+}
+
+static struct platform_driver wm831x_alive_ldo_driver = {
+	.probe = wm831x_alive_ldo_probe,
+	.remove = __devexit_p(wm831x_alive_ldo_remove),
+	.driver		= {
+		.name	= "wm831x-alive-ldo",
+	},
+};
+
+static int __init wm831x_ldo_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&wm831x_gp_ldo_driver);
+	if (ret != 0)
+		pr_err("Failed to register WM831x GP LDO driver: %d\n", ret);
+
+	ret = platform_driver_register(&wm831x_aldo_driver);
+	if (ret != 0)
+		pr_err("Failed to register WM831x ALDO driver: %d\n", ret);
+
+	ret = platform_driver_register(&wm831x_alive_ldo_driver);
+	if (ret != 0)
+		pr_err("Failed to register WM831x alive LDO driver: %d\n",
+		       ret);
+
+	return 0;
+}
+subsys_initcall(wm831x_ldo_init);
+
+static void __exit wm831x_ldo_exit(void)
+{
+	platform_driver_unregister(&wm831x_alive_ldo_driver);
+	platform_driver_unregister(&wm831x_aldo_driver);
+	platform_driver_unregister(&wm831x_gp_ldo_driver);
+}
+module_exit(wm831x_ldo_exit);
+
+/* Module information */
+MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com>");
+MODULE_DESCRIPTION("WM831x LDO driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:wm831x-ldo");
+MODULE_ALIAS("platform:wm831x-aldo");
+MODULE_ALIAS("platform:wm831x-aliveldo");
diff --git a/include/linux/mfd/wm831x/regulator.h b/include/linux/mfd/wm831x/regulator.h
index c74d6aafdca8..f95466343fb2 100644
--- a/include/linux/mfd/wm831x/regulator.h
+++ b/include/linux/mfd/wm831x/regulator.h
@@ -566,6 +566,588 @@
 #define WM831X_DC4_SLPENA_SHIFT                      8  /* DC4_SLPENA */
 #define WM831X_DC4_SLPENA_WIDTH                      1  /* DC4_SLPENA */
 
+/*
+ * R16488 (0x4068) - LDO1 Control
+ */
+#define WM831X_LDO1_ERR_ACT_MASK                0xC000  /* LDO1_ERR_ACT - [15:14] */
+#define WM831X_LDO1_ERR_ACT_SHIFT                   14  /* LDO1_ERR_ACT - [15:14] */
+#define WM831X_LDO1_ERR_ACT_WIDTH                    2  /* LDO1_ERR_ACT - [15:14] */
+#define WM831X_LDO1_HWC_SRC_MASK                0x1800  /* LDO1_HWC_SRC - [12:11] */
+#define WM831X_LDO1_HWC_SRC_SHIFT                   11  /* LDO1_HWC_SRC - [12:11] */
+#define WM831X_LDO1_HWC_SRC_WIDTH                    2  /* LDO1_HWC_SRC - [12:11] */
+#define WM831X_LDO1_HWC_VSEL                    0x0400  /* LDO1_HWC_VSEL */
+#define WM831X_LDO1_HWC_VSEL_MASK               0x0400  /* LDO1_HWC_VSEL */
+#define WM831X_LDO1_HWC_VSEL_SHIFT                  10  /* LDO1_HWC_VSEL */
+#define WM831X_LDO1_HWC_VSEL_WIDTH                   1  /* LDO1_HWC_VSEL */
+#define WM831X_LDO1_HWC_MODE_MASK               0x0300  /* LDO1_HWC_MODE - [9:8] */
+#define WM831X_LDO1_HWC_MODE_SHIFT                   8  /* LDO1_HWC_MODE - [9:8] */
+#define WM831X_LDO1_HWC_MODE_WIDTH                   2  /* LDO1_HWC_MODE - [9:8] */
+#define WM831X_LDO1_FLT                         0x0080  /* LDO1_FLT */
+#define WM831X_LDO1_FLT_MASK                    0x0080  /* LDO1_FLT */
+#define WM831X_LDO1_FLT_SHIFT                        7  /* LDO1_FLT */
+#define WM831X_LDO1_FLT_WIDTH                        1  /* LDO1_FLT */
+#define WM831X_LDO1_SWI                         0x0040  /* LDO1_SWI */
+#define WM831X_LDO1_SWI_MASK                    0x0040  /* LDO1_SWI */
+#define WM831X_LDO1_SWI_SHIFT                        6  /* LDO1_SWI */
+#define WM831X_LDO1_SWI_WIDTH                        1  /* LDO1_SWI */
+#define WM831X_LDO1_LP_MODE                     0x0001  /* LDO1_LP_MODE */
+#define WM831X_LDO1_LP_MODE_MASK                0x0001  /* LDO1_LP_MODE */
+#define WM831X_LDO1_LP_MODE_SHIFT                    0  /* LDO1_LP_MODE */
+#define WM831X_LDO1_LP_MODE_WIDTH                    1  /* LDO1_LP_MODE */
+
+/*
+ * R16489 (0x4069) - LDO1 ON Control
+ */
+#define WM831X_LDO1_ON_SLOT_MASK                0xE000  /* LDO1_ON_SLOT - [15:13] */
+#define WM831X_LDO1_ON_SLOT_SHIFT                   13  /* LDO1_ON_SLOT - [15:13] */
+#define WM831X_LDO1_ON_SLOT_WIDTH                    3  /* LDO1_ON_SLOT - [15:13] */
+#define WM831X_LDO1_ON_MODE                     0x0100  /* LDO1_ON_MODE */
+#define WM831X_LDO1_ON_MODE_MASK                0x0100  /* LDO1_ON_MODE */
+#define WM831X_LDO1_ON_MODE_SHIFT                    8  /* LDO1_ON_MODE */
+#define WM831X_LDO1_ON_MODE_WIDTH                    1  /* LDO1_ON_MODE */
+#define WM831X_LDO1_ON_VSEL_MASK                0x001F  /* LDO1_ON_VSEL - [4:0] */
+#define WM831X_LDO1_ON_VSEL_SHIFT                    0  /* LDO1_ON_VSEL - [4:0] */
+#define WM831X_LDO1_ON_VSEL_WIDTH                    5  /* LDO1_ON_VSEL - [4:0] */
+
+/*
+ * R16490 (0x406A) - LDO1 SLEEP Control
+ */
+#define WM831X_LDO1_SLP_SLOT_MASK               0xE000  /* LDO1_SLP_SLOT - [15:13] */
+#define WM831X_LDO1_SLP_SLOT_SHIFT                  13  /* LDO1_SLP_SLOT - [15:13] */
+#define WM831X_LDO1_SLP_SLOT_WIDTH                   3  /* LDO1_SLP_SLOT - [15:13] */
+#define WM831X_LDO1_SLP_MODE                    0x0100  /* LDO1_SLP_MODE */
+#define WM831X_LDO1_SLP_MODE_MASK               0x0100  /* LDO1_SLP_MODE */
+#define WM831X_LDO1_SLP_MODE_SHIFT                   8  /* LDO1_SLP_MODE */
+#define WM831X_LDO1_SLP_MODE_WIDTH                   1  /* LDO1_SLP_MODE */
+#define WM831X_LDO1_SLP_VSEL_MASK               0x001F  /* LDO1_SLP_VSEL - [4:0] */
+#define WM831X_LDO1_SLP_VSEL_SHIFT                   0  /* LDO1_SLP_VSEL - [4:0] */
+#define WM831X_LDO1_SLP_VSEL_WIDTH                   5  /* LDO1_SLP_VSEL - [4:0] */
+
+/*
+ * R16491 (0x406B) - LDO2 Control
+ */
+#define WM831X_LDO2_ERR_ACT_MASK                0xC000  /* LDO2_ERR_ACT - [15:14] */
+#define WM831X_LDO2_ERR_ACT_SHIFT                   14  /* LDO2_ERR_ACT - [15:14] */
+#define WM831X_LDO2_ERR_ACT_WIDTH                    2  /* LDO2_ERR_ACT - [15:14] */
+#define WM831X_LDO2_HWC_SRC_MASK                0x1800  /* LDO2_HWC_SRC - [12:11] */
+#define WM831X_LDO2_HWC_SRC_SHIFT                   11  /* LDO2_HWC_SRC - [12:11] */
+#define WM831X_LDO2_HWC_SRC_WIDTH                    2  /* LDO2_HWC_SRC - [12:11] */
+#define WM831X_LDO2_HWC_VSEL                    0x0400  /* LDO2_HWC_VSEL */
+#define WM831X_LDO2_HWC_VSEL_MASK               0x0400  /* LDO2_HWC_VSEL */
+#define WM831X_LDO2_HWC_VSEL_SHIFT                  10  /* LDO2_HWC_VSEL */
+#define WM831X_LDO2_HWC_VSEL_WIDTH                   1  /* LDO2_HWC_VSEL */
+#define WM831X_LDO2_HWC_MODE_MASK               0x0300  /* LDO2_HWC_MODE - [9:8] */
+#define WM831X_LDO2_HWC_MODE_SHIFT                   8  /* LDO2_HWC_MODE - [9:8] */
+#define WM831X_LDO2_HWC_MODE_WIDTH                   2  /* LDO2_HWC_MODE - [9:8] */
+#define WM831X_LDO2_FLT                         0x0080  /* LDO2_FLT */
+#define WM831X_LDO2_FLT_MASK                    0x0080  /* LDO2_FLT */
+#define WM831X_LDO2_FLT_SHIFT                        7  /* LDO2_FLT */
+#define WM831X_LDO2_FLT_WIDTH                        1  /* LDO2_FLT */
+#define WM831X_LDO2_SWI                         0x0040  /* LDO2_SWI */
+#define WM831X_LDO2_SWI_MASK                    0x0040  /* LDO2_SWI */
+#define WM831X_LDO2_SWI_SHIFT                        6  /* LDO2_SWI */
+#define WM831X_LDO2_SWI_WIDTH                        1  /* LDO2_SWI */
+#define WM831X_LDO2_LP_MODE                     0x0001  /* LDO2_LP_MODE */
+#define WM831X_LDO2_LP_MODE_MASK                0x0001  /* LDO2_LP_MODE */
+#define WM831X_LDO2_LP_MODE_SHIFT                    0  /* LDO2_LP_MODE */
+#define WM831X_LDO2_LP_MODE_WIDTH                    1  /* LDO2_LP_MODE */
+
+/*
+ * R16492 (0x406C) - LDO2 ON Control
+ */
+#define WM831X_LDO2_ON_SLOT_MASK                0xE000  /* LDO2_ON_SLOT - [15:13] */
+#define WM831X_LDO2_ON_SLOT_SHIFT                   13  /* LDO2_ON_SLOT - [15:13] */
+#define WM831X_LDO2_ON_SLOT_WIDTH                    3  /* LDO2_ON_SLOT - [15:13] */
+#define WM831X_LDO2_ON_MODE                     0x0100  /* LDO2_ON_MODE */
+#define WM831X_LDO2_ON_MODE_MASK                0x0100  /* LDO2_ON_MODE */
+#define WM831X_LDO2_ON_MODE_SHIFT                    8  /* LDO2_ON_MODE */
+#define WM831X_LDO2_ON_MODE_WIDTH                    1  /* LDO2_ON_MODE */
+#define WM831X_LDO2_ON_VSEL_MASK                0x001F  /* LDO2_ON_VSEL - [4:0] */
+#define WM831X_LDO2_ON_VSEL_SHIFT                    0  /* LDO2_ON_VSEL - [4:0] */
+#define WM831X_LDO2_ON_VSEL_WIDTH                    5  /* LDO2_ON_VSEL - [4:0] */
+
+/*
+ * R16493 (0x406D) - LDO2 SLEEP Control
+ */
+#define WM831X_LDO2_SLP_SLOT_MASK               0xE000  /* LDO2_SLP_SLOT - [15:13] */
+#define WM831X_LDO2_SLP_SLOT_SHIFT                  13  /* LDO2_SLP_SLOT - [15:13] */
+#define WM831X_LDO2_SLP_SLOT_WIDTH                   3  /* LDO2_SLP_SLOT - [15:13] */
+#define WM831X_LDO2_SLP_MODE                    0x0100  /* LDO2_SLP_MODE */
+#define WM831X_LDO2_SLP_MODE_MASK               0x0100  /* LDO2_SLP_MODE */
+#define WM831X_LDO2_SLP_MODE_SHIFT                   8  /* LDO2_SLP_MODE */
+#define WM831X_LDO2_SLP_MODE_WIDTH                   1  /* LDO2_SLP_MODE */
+#define WM831X_LDO2_SLP_VSEL_MASK               0x001F  /* LDO2_SLP_VSEL - [4:0] */
+#define WM831X_LDO2_SLP_VSEL_SHIFT                   0  /* LDO2_SLP_VSEL - [4:0] */
+#define WM831X_LDO2_SLP_VSEL_WIDTH                   5  /* LDO2_SLP_VSEL - [4:0] */
+
+/*
+ * R16494 (0x406E) - LDO3 Control
+ */
+#define WM831X_LDO3_ERR_ACT_MASK                0xC000  /* LDO3_ERR_ACT - [15:14] */
+#define WM831X_LDO3_ERR_ACT_SHIFT                   14  /* LDO3_ERR_ACT - [15:14] */
+#define WM831X_LDO3_ERR_ACT_WIDTH                    2  /* LDO3_ERR_ACT - [15:14] */
+#define WM831X_LDO3_HWC_SRC_MASK                0x1800  /* LDO3_HWC_SRC - [12:11] */
+#define WM831X_LDO3_HWC_SRC_SHIFT                   11  /* LDO3_HWC_SRC - [12:11] */
+#define WM831X_LDO3_HWC_SRC_WIDTH                    2  /* LDO3_HWC_SRC - [12:11] */
+#define WM831X_LDO3_HWC_VSEL                    0x0400  /* LDO3_HWC_VSEL */
+#define WM831X_LDO3_HWC_VSEL_MASK               0x0400  /* LDO3_HWC_VSEL */
+#define WM831X_LDO3_HWC_VSEL_SHIFT                  10  /* LDO3_HWC_VSEL */
+#define WM831X_LDO3_HWC_VSEL_WIDTH                   1  /* LDO3_HWC_VSEL */
+#define WM831X_LDO3_HWC_MODE_MASK               0x0300  /* LDO3_HWC_MODE - [9:8] */
+#define WM831X_LDO3_HWC_MODE_SHIFT                   8  /* LDO3_HWC_MODE - [9:8] */
+#define WM831X_LDO3_HWC_MODE_WIDTH                   2  /* LDO3_HWC_MODE - [9:8] */
+#define WM831X_LDO3_FLT                         0x0080  /* LDO3_FLT */
+#define WM831X_LDO3_FLT_MASK                    0x0080  /* LDO3_FLT */
+#define WM831X_LDO3_FLT_SHIFT                        7  /* LDO3_FLT */
+#define WM831X_LDO3_FLT_WIDTH                        1  /* LDO3_FLT */
+#define WM831X_LDO3_SWI                         0x0040  /* LDO3_SWI */
+#define WM831X_LDO3_SWI_MASK                    0x0040  /* LDO3_SWI */
+#define WM831X_LDO3_SWI_SHIFT                        6  /* LDO3_SWI */
+#define WM831X_LDO3_SWI_WIDTH                        1  /* LDO3_SWI */
+#define WM831X_LDO3_LP_MODE                     0x0001  /* LDO3_LP_MODE */
+#define WM831X_LDO3_LP_MODE_MASK                0x0001  /* LDO3_LP_MODE */
+#define WM831X_LDO3_LP_MODE_SHIFT                    0  /* LDO3_LP_MODE */
+#define WM831X_LDO3_LP_MODE_WIDTH                    1  /* LDO3_LP_MODE */
+
+/*
+ * R16495 (0x406F) - LDO3 ON Control
+ */
+#define WM831X_LDO3_ON_SLOT_MASK                0xE000  /* LDO3_ON_SLOT - [15:13] */
+#define WM831X_LDO3_ON_SLOT_SHIFT                   13  /* LDO3_ON_SLOT - [15:13] */
+#define WM831X_LDO3_ON_SLOT_WIDTH                    3  /* LDO3_ON_SLOT - [15:13] */
+#define WM831X_LDO3_ON_MODE                     0x0100  /* LDO3_ON_MODE */
+#define WM831X_LDO3_ON_MODE_MASK                0x0100  /* LDO3_ON_MODE */
+#define WM831X_LDO3_ON_MODE_SHIFT                    8  /* LDO3_ON_MODE */
+#define WM831X_LDO3_ON_MODE_WIDTH                    1  /* LDO3_ON_MODE */
+#define WM831X_LDO3_ON_VSEL_MASK                0x001F  /* LDO3_ON_VSEL - [4:0] */
+#define WM831X_LDO3_ON_VSEL_SHIFT                    0  /* LDO3_ON_VSEL - [4:0] */
+#define WM831X_LDO3_ON_VSEL_WIDTH                    5  /* LDO3_ON_VSEL - [4:0] */
+
+/*
+ * R16496 (0x4070) - LDO3 SLEEP Control
+ */
+#define WM831X_LDO3_SLP_SLOT_MASK               0xE000  /* LDO3_SLP_SLOT - [15:13] */
+#define WM831X_LDO3_SLP_SLOT_SHIFT                  13  /* LDO3_SLP_SLOT - [15:13] */
+#define WM831X_LDO3_SLP_SLOT_WIDTH                   3  /* LDO3_SLP_SLOT - [15:13] */
+#define WM831X_LDO3_SLP_MODE                    0x0100  /* LDO3_SLP_MODE */
+#define WM831X_LDO3_SLP_MODE_MASK               0x0100  /* LDO3_SLP_MODE */
+#define WM831X_LDO3_SLP_MODE_SHIFT                   8  /* LDO3_SLP_MODE */
+#define WM831X_LDO3_SLP_MODE_WIDTH                   1  /* LDO3_SLP_MODE */
+#define WM831X_LDO3_SLP_VSEL_MASK               0x001F  /* LDO3_SLP_VSEL - [4:0] */
+#define WM831X_LDO3_SLP_VSEL_SHIFT                   0  /* LDO3_SLP_VSEL - [4:0] */
+#define WM831X_LDO3_SLP_VSEL_WIDTH                   5  /* LDO3_SLP_VSEL - [4:0] */
+
+/*
+ * R16497 (0x4071) - LDO4 Control
+ */
+#define WM831X_LDO4_ERR_ACT_MASK                0xC000  /* LDO4_ERR_ACT - [15:14] */
+#define WM831X_LDO4_ERR_ACT_SHIFT                   14  /* LDO4_ERR_ACT - [15:14] */
+#define WM831X_LDO4_ERR_ACT_WIDTH                    2  /* LDO4_ERR_ACT - [15:14] */
+#define WM831X_LDO4_HWC_SRC_MASK                0x1800  /* LDO4_HWC_SRC - [12:11] */
+#define WM831X_LDO4_HWC_SRC_SHIFT                   11  /* LDO4_HWC_SRC - [12:11] */
+#define WM831X_LDO4_HWC_SRC_WIDTH                    2  /* LDO4_HWC_SRC - [12:11] */
+#define WM831X_LDO4_HWC_VSEL                    0x0400  /* LDO4_HWC_VSEL */
+#define WM831X_LDO4_HWC_VSEL_MASK               0x0400  /* LDO4_HWC_VSEL */
+#define WM831X_LDO4_HWC_VSEL_SHIFT                  10  /* LDO4_HWC_VSEL */
+#define WM831X_LDO4_HWC_VSEL_WIDTH                   1  /* LDO4_HWC_VSEL */
+#define WM831X_LDO4_HWC_MODE_MASK               0x0300  /* LDO4_HWC_MODE - [9:8] */
+#define WM831X_LDO4_HWC_MODE_SHIFT                   8  /* LDO4_HWC_MODE - [9:8] */
+#define WM831X_LDO4_HWC_MODE_WIDTH                   2  /* LDO4_HWC_MODE - [9:8] */
+#define WM831X_LDO4_FLT                         0x0080  /* LDO4_FLT */
+#define WM831X_LDO4_FLT_MASK                    0x0080  /* LDO4_FLT */
+#define WM831X_LDO4_FLT_SHIFT                        7  /* LDO4_FLT */
+#define WM831X_LDO4_FLT_WIDTH                        1  /* LDO4_FLT */
+#define WM831X_LDO4_SWI                         0x0040  /* LDO4_SWI */
+#define WM831X_LDO4_SWI_MASK                    0x0040  /* LDO4_SWI */
+#define WM831X_LDO4_SWI_SHIFT                        6  /* LDO4_SWI */
+#define WM831X_LDO4_SWI_WIDTH                        1  /* LDO4_SWI */
+#define WM831X_LDO4_LP_MODE                     0x0001  /* LDO4_LP_MODE */
+#define WM831X_LDO4_LP_MODE_MASK                0x0001  /* LDO4_LP_MODE */
+#define WM831X_LDO4_LP_MODE_SHIFT                    0  /* LDO4_LP_MODE */
+#define WM831X_LDO4_LP_MODE_WIDTH                    1  /* LDO4_LP_MODE */
+
+/*
+ * R16498 (0x4072) - LDO4 ON Control
+ */
+#define WM831X_LDO4_ON_SLOT_MASK                0xE000  /* LDO4_ON_SLOT - [15:13] */
+#define WM831X_LDO4_ON_SLOT_SHIFT                   13  /* LDO4_ON_SLOT - [15:13] */
+#define WM831X_LDO4_ON_SLOT_WIDTH                    3  /* LDO4_ON_SLOT - [15:13] */
+#define WM831X_LDO4_ON_MODE                     0x0100  /* LDO4_ON_MODE */
+#define WM831X_LDO4_ON_MODE_MASK                0x0100  /* LDO4_ON_MODE */
+#define WM831X_LDO4_ON_MODE_SHIFT                    8  /* LDO4_ON_MODE */
+#define WM831X_LDO4_ON_MODE_WIDTH                    1  /* LDO4_ON_MODE */
+#define WM831X_LDO4_ON_VSEL_MASK                0x001F  /* LDO4_ON_VSEL - [4:0] */
+#define WM831X_LDO4_ON_VSEL_SHIFT                    0  /* LDO4_ON_VSEL - [4:0] */
+#define WM831X_LDO4_ON_VSEL_WIDTH                    5  /* LDO4_ON_VSEL - [4:0] */
+
+/*
+ * R16499 (0x4073) - LDO4 SLEEP Control
+ */
+#define WM831X_LDO4_SLP_SLOT_MASK               0xE000  /* LDO4_SLP_SLOT - [15:13] */
+#define WM831X_LDO4_SLP_SLOT_SHIFT                  13  /* LDO4_SLP_SLOT - [15:13] */
+#define WM831X_LDO4_SLP_SLOT_WIDTH                   3  /* LDO4_SLP_SLOT - [15:13] */
+#define WM831X_LDO4_SLP_MODE                    0x0100  /* LDO4_SLP_MODE */
+#define WM831X_LDO4_SLP_MODE_MASK               0x0100  /* LDO4_SLP_MODE */
+#define WM831X_LDO4_SLP_MODE_SHIFT                   8  /* LDO4_SLP_MODE */
+#define WM831X_LDO4_SLP_MODE_WIDTH                   1  /* LDO4_SLP_MODE */
+#define WM831X_LDO4_SLP_VSEL_MASK               0x001F  /* LDO4_SLP_VSEL - [4:0] */
+#define WM831X_LDO4_SLP_VSEL_SHIFT                   0  /* LDO4_SLP_VSEL - [4:0] */
+#define WM831X_LDO4_SLP_VSEL_WIDTH                   5  /* LDO4_SLP_VSEL - [4:0] */
+
+/*
+ * R16500 (0x4074) - LDO5 Control
+ */
+#define WM831X_LDO5_ERR_ACT_MASK                0xC000  /* LDO5_ERR_ACT - [15:14] */
+#define WM831X_LDO5_ERR_ACT_SHIFT                   14  /* LDO5_ERR_ACT - [15:14] */
+#define WM831X_LDO5_ERR_ACT_WIDTH                    2  /* LDO5_ERR_ACT - [15:14] */
+#define WM831X_LDO5_HWC_SRC_MASK                0x1800  /* LDO5_HWC_SRC - [12:11] */
+#define WM831X_LDO5_HWC_SRC_SHIFT                   11  /* LDO5_HWC_SRC - [12:11] */
+#define WM831X_LDO5_HWC_SRC_WIDTH                    2  /* LDO5_HWC_SRC - [12:11] */
+#define WM831X_LDO5_HWC_VSEL                    0x0400  /* LDO5_HWC_VSEL */
+#define WM831X_LDO5_HWC_VSEL_MASK               0x0400  /* LDO5_HWC_VSEL */
+#define WM831X_LDO5_HWC_VSEL_SHIFT                  10  /* LDO5_HWC_VSEL */
+#define WM831X_LDO5_HWC_VSEL_WIDTH                   1  /* LDO5_HWC_VSEL */
+#define WM831X_LDO5_HWC_MODE_MASK               0x0300  /* LDO5_HWC_MODE - [9:8] */
+#define WM831X_LDO5_HWC_MODE_SHIFT                   8  /* LDO5_HWC_MODE - [9:8] */
+#define WM831X_LDO5_HWC_MODE_WIDTH                   2  /* LDO5_HWC_MODE - [9:8] */
+#define WM831X_LDO5_FLT                         0x0080  /* LDO5_FLT */
+#define WM831X_LDO5_FLT_MASK                    0x0080  /* LDO5_FLT */
+#define WM831X_LDO5_FLT_SHIFT                        7  /* LDO5_FLT */
+#define WM831X_LDO5_FLT_WIDTH                        1  /* LDO5_FLT */
+#define WM831X_LDO5_SWI                         0x0040  /* LDO5_SWI */
+#define WM831X_LDO5_SWI_MASK                    0x0040  /* LDO5_SWI */
+#define WM831X_LDO5_SWI_SHIFT                        6  /* LDO5_SWI */
+#define WM831X_LDO5_SWI_WIDTH                        1  /* LDO5_SWI */
+#define WM831X_LDO5_LP_MODE                     0x0001  /* LDO5_LP_MODE */
+#define WM831X_LDO5_LP_MODE_MASK                0x0001  /* LDO5_LP_MODE */
+#define WM831X_LDO5_LP_MODE_SHIFT                    0  /* LDO5_LP_MODE */
+#define WM831X_LDO5_LP_MODE_WIDTH                    1  /* LDO5_LP_MODE */
+
+/*
+ * R16501 (0x4075) - LDO5 ON Control
+ */
+#define WM831X_LDO5_ON_SLOT_MASK                0xE000  /* LDO5_ON_SLOT - [15:13] */
+#define WM831X_LDO5_ON_SLOT_SHIFT                   13  /* LDO5_ON_SLOT - [15:13] */
+#define WM831X_LDO5_ON_SLOT_WIDTH                    3  /* LDO5_ON_SLOT - [15:13] */
+#define WM831X_LDO5_ON_MODE                     0x0100  /* LDO5_ON_MODE */
+#define WM831X_LDO5_ON_MODE_MASK                0x0100  /* LDO5_ON_MODE */
+#define WM831X_LDO5_ON_MODE_SHIFT                    8  /* LDO5_ON_MODE */
+#define WM831X_LDO5_ON_MODE_WIDTH                    1  /* LDO5_ON_MODE */
+#define WM831X_LDO5_ON_VSEL_MASK                0x001F  /* LDO5_ON_VSEL - [4:0] */
+#define WM831X_LDO5_ON_VSEL_SHIFT                    0  /* LDO5_ON_VSEL - [4:0] */
+#define WM831X_LDO5_ON_VSEL_WIDTH                    5  /* LDO5_ON_VSEL - [4:0] */
+
+/*
+ * R16502 (0x4076) - LDO5 SLEEP Control
+ */
+#define WM831X_LDO5_SLP_SLOT_MASK               0xE000  /* LDO5_SLP_SLOT - [15:13] */
+#define WM831X_LDO5_SLP_SLOT_SHIFT                  13  /* LDO5_SLP_SLOT - [15:13] */
+#define WM831X_LDO5_SLP_SLOT_WIDTH                   3  /* LDO5_SLP_SLOT - [15:13] */
+#define WM831X_LDO5_SLP_MODE                    0x0100  /* LDO5_SLP_MODE */
+#define WM831X_LDO5_SLP_MODE_MASK               0x0100  /* LDO5_SLP_MODE */
+#define WM831X_LDO5_SLP_MODE_SHIFT                   8  /* LDO5_SLP_MODE */
+#define WM831X_LDO5_SLP_MODE_WIDTH                   1  /* LDO5_SLP_MODE */
+#define WM831X_LDO5_SLP_VSEL_MASK               0x001F  /* LDO5_SLP_VSEL - [4:0] */
+#define WM831X_LDO5_SLP_VSEL_SHIFT                   0  /* LDO5_SLP_VSEL - [4:0] */
+#define WM831X_LDO5_SLP_VSEL_WIDTH                   5  /* LDO5_SLP_VSEL - [4:0] */
+
+/*
+ * R16503 (0x4077) - LDO6 Control
+ */
+#define WM831X_LDO6_ERR_ACT_MASK                0xC000  /* LDO6_ERR_ACT - [15:14] */
+#define WM831X_LDO6_ERR_ACT_SHIFT                   14  /* LDO6_ERR_ACT - [15:14] */
+#define WM831X_LDO6_ERR_ACT_WIDTH                    2  /* LDO6_ERR_ACT - [15:14] */
+#define WM831X_LDO6_HWC_SRC_MASK                0x1800  /* LDO6_HWC_SRC - [12:11] */
+#define WM831X_LDO6_HWC_SRC_SHIFT                   11  /* LDO6_HWC_SRC - [12:11] */
+#define WM831X_LDO6_HWC_SRC_WIDTH                    2  /* LDO6_HWC_SRC - [12:11] */
+#define WM831X_LDO6_HWC_VSEL                    0x0400  /* LDO6_HWC_VSEL */
+#define WM831X_LDO6_HWC_VSEL_MASK               0x0400  /* LDO6_HWC_VSEL */
+#define WM831X_LDO6_HWC_VSEL_SHIFT                  10  /* LDO6_HWC_VSEL */
+#define WM831X_LDO6_HWC_VSEL_WIDTH                   1  /* LDO6_HWC_VSEL */
+#define WM831X_LDO6_HWC_MODE_MASK               0x0300  /* LDO6_HWC_MODE - [9:8] */
+#define WM831X_LDO6_HWC_MODE_SHIFT                   8  /* LDO6_HWC_MODE - [9:8] */
+#define WM831X_LDO6_HWC_MODE_WIDTH                   2  /* LDO6_HWC_MODE - [9:8] */
+#define WM831X_LDO6_FLT                         0x0080  /* LDO6_FLT */
+#define WM831X_LDO6_FLT_MASK                    0x0080  /* LDO6_FLT */
+#define WM831X_LDO6_FLT_SHIFT                        7  /* LDO6_FLT */
+#define WM831X_LDO6_FLT_WIDTH                        1  /* LDO6_FLT */
+#define WM831X_LDO6_SWI                         0x0040  /* LDO6_SWI */
+#define WM831X_LDO6_SWI_MASK                    0x0040  /* LDO6_SWI */
+#define WM831X_LDO6_SWI_SHIFT                        6  /* LDO6_SWI */
+#define WM831X_LDO6_SWI_WIDTH                        1  /* LDO6_SWI */
+#define WM831X_LDO6_LP_MODE                     0x0001  /* LDO6_LP_MODE */
+#define WM831X_LDO6_LP_MODE_MASK                0x0001  /* LDO6_LP_MODE */
+#define WM831X_LDO6_LP_MODE_SHIFT                    0  /* LDO6_LP_MODE */
+#define WM831X_LDO6_LP_MODE_WIDTH                    1  /* LDO6_LP_MODE */
+
+/*
+ * R16504 (0x4078) - LDO6 ON Control
+ */
+#define WM831X_LDO6_ON_SLOT_MASK                0xE000  /* LDO6_ON_SLOT - [15:13] */
+#define WM831X_LDO6_ON_SLOT_SHIFT                   13  /* LDO6_ON_SLOT - [15:13] */
+#define WM831X_LDO6_ON_SLOT_WIDTH                    3  /* LDO6_ON_SLOT - [15:13] */
+#define WM831X_LDO6_ON_MODE                     0x0100  /* LDO6_ON_MODE */
+#define WM831X_LDO6_ON_MODE_MASK                0x0100  /* LDO6_ON_MODE */
+#define WM831X_LDO6_ON_MODE_SHIFT                    8  /* LDO6_ON_MODE */
+#define WM831X_LDO6_ON_MODE_WIDTH                    1  /* LDO6_ON_MODE */
+#define WM831X_LDO6_ON_VSEL_MASK                0x001F  /* LDO6_ON_VSEL - [4:0] */
+#define WM831X_LDO6_ON_VSEL_SHIFT                    0  /* LDO6_ON_VSEL - [4:0] */
+#define WM831X_LDO6_ON_VSEL_WIDTH                    5  /* LDO6_ON_VSEL - [4:0] */
+
+/*
+ * R16505 (0x4079) - LDO6 SLEEP Control
+ */
+#define WM831X_LDO6_SLP_SLOT_MASK               0xE000  /* LDO6_SLP_SLOT - [15:13] */
+#define WM831X_LDO6_SLP_SLOT_SHIFT                  13  /* LDO6_SLP_SLOT - [15:13] */
+#define WM831X_LDO6_SLP_SLOT_WIDTH                   3  /* LDO6_SLP_SLOT - [15:13] */
+#define WM831X_LDO6_SLP_MODE                    0x0100  /* LDO6_SLP_MODE */
+#define WM831X_LDO6_SLP_MODE_MASK               0x0100  /* LDO6_SLP_MODE */
+#define WM831X_LDO6_SLP_MODE_SHIFT                   8  /* LDO6_SLP_MODE */
+#define WM831X_LDO6_SLP_MODE_WIDTH                   1  /* LDO6_SLP_MODE */
+#define WM831X_LDO6_SLP_VSEL_MASK               0x001F  /* LDO6_SLP_VSEL - [4:0] */
+#define WM831X_LDO6_SLP_VSEL_SHIFT                   0  /* LDO6_SLP_VSEL - [4:0] */
+#define WM831X_LDO6_SLP_VSEL_WIDTH                   5  /* LDO6_SLP_VSEL - [4:0] */
+
+/*
+ * R16506 (0x407A) - LDO7 Control
+ */
+#define WM831X_LDO7_ERR_ACT_MASK                0xC000  /* LDO7_ERR_ACT - [15:14] */
+#define WM831X_LDO7_ERR_ACT_SHIFT                   14  /* LDO7_ERR_ACT - [15:14] */
+#define WM831X_LDO7_ERR_ACT_WIDTH                    2  /* LDO7_ERR_ACT - [15:14] */
+#define WM831X_LDO7_HWC_SRC_MASK                0x1800  /* LDO7_HWC_SRC - [12:11] */
+#define WM831X_LDO7_HWC_SRC_SHIFT                   11  /* LDO7_HWC_SRC - [12:11] */
+#define WM831X_LDO7_HWC_SRC_WIDTH                    2  /* LDO7_HWC_SRC - [12:11] */
+#define WM831X_LDO7_HWC_VSEL                    0x0400  /* LDO7_HWC_VSEL */
+#define WM831X_LDO7_HWC_VSEL_MASK               0x0400  /* LDO7_HWC_VSEL */
+#define WM831X_LDO7_HWC_VSEL_SHIFT                  10  /* LDO7_HWC_VSEL */
+#define WM831X_LDO7_HWC_VSEL_WIDTH                   1  /* LDO7_HWC_VSEL */
+#define WM831X_LDO7_HWC_MODE_MASK               0x0300  /* LDO7_HWC_MODE - [9:8] */
+#define WM831X_LDO7_HWC_MODE_SHIFT                   8  /* LDO7_HWC_MODE - [9:8] */
+#define WM831X_LDO7_HWC_MODE_WIDTH                   2  /* LDO7_HWC_MODE - [9:8] */
+#define WM831X_LDO7_FLT                         0x0080  /* LDO7_FLT */
+#define WM831X_LDO7_FLT_MASK                    0x0080  /* LDO7_FLT */
+#define WM831X_LDO7_FLT_SHIFT                        7  /* LDO7_FLT */
+#define WM831X_LDO7_FLT_WIDTH                        1  /* LDO7_FLT */
+#define WM831X_LDO7_SWI                         0x0040  /* LDO7_SWI */
+#define WM831X_LDO7_SWI_MASK                    0x0040  /* LDO7_SWI */
+#define WM831X_LDO7_SWI_SHIFT                        6  /* LDO7_SWI */
+#define WM831X_LDO7_SWI_WIDTH                        1  /* LDO7_SWI */
+
+/*
+ * R16507 (0x407B) - LDO7 ON Control
+ */
+#define WM831X_LDO7_ON_SLOT_MASK                0xE000  /* LDO7_ON_SLOT - [15:13] */
+#define WM831X_LDO7_ON_SLOT_SHIFT                   13  /* LDO7_ON_SLOT - [15:13] */
+#define WM831X_LDO7_ON_SLOT_WIDTH                    3  /* LDO7_ON_SLOT - [15:13] */
+#define WM831X_LDO7_ON_MODE                     0x0100  /* LDO7_ON_MODE */
+#define WM831X_LDO7_ON_MODE_MASK                0x0100  /* LDO7_ON_MODE */
+#define WM831X_LDO7_ON_MODE_SHIFT                    8  /* LDO7_ON_MODE */
+#define WM831X_LDO7_ON_MODE_WIDTH                    1  /* LDO7_ON_MODE */
+#define WM831X_LDO7_ON_VSEL_MASK                0x001F  /* LDO7_ON_VSEL - [4:0] */
+#define WM831X_LDO7_ON_VSEL_SHIFT                    0  /* LDO7_ON_VSEL - [4:0] */
+#define WM831X_LDO7_ON_VSEL_WIDTH                    5  /* LDO7_ON_VSEL - [4:0] */
+
+/*
+ * R16508 (0x407C) - LDO7 SLEEP Control
+ */
+#define WM831X_LDO7_SLP_SLOT_MASK               0xE000  /* LDO7_SLP_SLOT - [15:13] */
+#define WM831X_LDO7_SLP_SLOT_SHIFT                  13  /* LDO7_SLP_SLOT - [15:13] */
+#define WM831X_LDO7_SLP_SLOT_WIDTH                   3  /* LDO7_SLP_SLOT - [15:13] */
+#define WM831X_LDO7_SLP_MODE                    0x0100  /* LDO7_SLP_MODE */
+#define WM831X_LDO7_SLP_MODE_MASK               0x0100  /* LDO7_SLP_MODE */
+#define WM831X_LDO7_SLP_MODE_SHIFT                   8  /* LDO7_SLP_MODE */
+#define WM831X_LDO7_SLP_MODE_WIDTH                   1  /* LDO7_SLP_MODE */
+#define WM831X_LDO7_SLP_VSEL_MASK               0x001F  /* LDO7_SLP_VSEL - [4:0] */
+#define WM831X_LDO7_SLP_VSEL_SHIFT                   0  /* LDO7_SLP_VSEL - [4:0] */
+#define WM831X_LDO7_SLP_VSEL_WIDTH                   5  /* LDO7_SLP_VSEL - [4:0] */
+
+/*
+ * R16509 (0x407D) - LDO8 Control
+ */
+#define WM831X_LDO8_ERR_ACT_MASK                0xC000  /* LDO8_ERR_ACT - [15:14] */
+#define WM831X_LDO8_ERR_ACT_SHIFT                   14  /* LDO8_ERR_ACT - [15:14] */
+#define WM831X_LDO8_ERR_ACT_WIDTH                    2  /* LDO8_ERR_ACT - [15:14] */
+#define WM831X_LDO8_HWC_SRC_MASK                0x1800  /* LDO8_HWC_SRC - [12:11] */
+#define WM831X_LDO8_HWC_SRC_SHIFT                   11  /* LDO8_HWC_SRC - [12:11] */
+#define WM831X_LDO8_HWC_SRC_WIDTH                    2  /* LDO8_HWC_SRC - [12:11] */
+#define WM831X_LDO8_HWC_VSEL                    0x0400  /* LDO8_HWC_VSEL */
+#define WM831X_LDO8_HWC_VSEL_MASK               0x0400  /* LDO8_HWC_VSEL */
+#define WM831X_LDO8_HWC_VSEL_SHIFT                  10  /* LDO8_HWC_VSEL */
+#define WM831X_LDO8_HWC_VSEL_WIDTH                   1  /* LDO8_HWC_VSEL */
+#define WM831X_LDO8_HWC_MODE_MASK               0x0300  /* LDO8_HWC_MODE - [9:8] */
+#define WM831X_LDO8_HWC_MODE_SHIFT                   8  /* LDO8_HWC_MODE - [9:8] */
+#define WM831X_LDO8_HWC_MODE_WIDTH                   2  /* LDO8_HWC_MODE - [9:8] */
+#define WM831X_LDO8_FLT                         0x0080  /* LDO8_FLT */
+#define WM831X_LDO8_FLT_MASK                    0x0080  /* LDO8_FLT */
+#define WM831X_LDO8_FLT_SHIFT                        7  /* LDO8_FLT */
+#define WM831X_LDO8_FLT_WIDTH                        1  /* LDO8_FLT */
+#define WM831X_LDO8_SWI                         0x0040  /* LDO8_SWI */
+#define WM831X_LDO8_SWI_MASK                    0x0040  /* LDO8_SWI */
+#define WM831X_LDO8_SWI_SHIFT                        6  /* LDO8_SWI */
+#define WM831X_LDO8_SWI_WIDTH                        1  /* LDO8_SWI */
+
+/*
+ * R16510 (0x407E) - LDO8 ON Control
+ */
+#define WM831X_LDO8_ON_SLOT_MASK                0xE000  /* LDO8_ON_SLOT - [15:13] */
+#define WM831X_LDO8_ON_SLOT_SHIFT                   13  /* LDO8_ON_SLOT - [15:13] */
+#define WM831X_LDO8_ON_SLOT_WIDTH                    3  /* LDO8_ON_SLOT - [15:13] */
+#define WM831X_LDO8_ON_MODE                     0x0100  /* LDO8_ON_MODE */
+#define WM831X_LDO8_ON_MODE_MASK                0x0100  /* LDO8_ON_MODE */
+#define WM831X_LDO8_ON_MODE_SHIFT                    8  /* LDO8_ON_MODE */
+#define WM831X_LDO8_ON_MODE_WIDTH                    1  /* LDO8_ON_MODE */
+#define WM831X_LDO8_ON_VSEL_MASK                0x001F  /* LDO8_ON_VSEL - [4:0] */
+#define WM831X_LDO8_ON_VSEL_SHIFT                    0  /* LDO8_ON_VSEL - [4:0] */
+#define WM831X_LDO8_ON_VSEL_WIDTH                    5  /* LDO8_ON_VSEL - [4:0] */
+
+/*
+ * R16511 (0x407F) - LDO8 SLEEP Control
+ */
+#define WM831X_LDO8_SLP_SLOT_MASK               0xE000  /* LDO8_SLP_SLOT - [15:13] */
+#define WM831X_LDO8_SLP_SLOT_SHIFT                  13  /* LDO8_SLP_SLOT - [15:13] */
+#define WM831X_LDO8_SLP_SLOT_WIDTH                   3  /* LDO8_SLP_SLOT - [15:13] */
+#define WM831X_LDO8_SLP_MODE                    0x0100  /* LDO8_SLP_MODE */
+#define WM831X_LDO8_SLP_MODE_MASK               0x0100  /* LDO8_SLP_MODE */
+#define WM831X_LDO8_SLP_MODE_SHIFT                   8  /* LDO8_SLP_MODE */
+#define WM831X_LDO8_SLP_MODE_WIDTH                   1  /* LDO8_SLP_MODE */
+#define WM831X_LDO8_SLP_VSEL_MASK               0x001F  /* LDO8_SLP_VSEL - [4:0] */
+#define WM831X_LDO8_SLP_VSEL_SHIFT                   0  /* LDO8_SLP_VSEL - [4:0] */
+#define WM831X_LDO8_SLP_VSEL_WIDTH                   5  /* LDO8_SLP_VSEL - [4:0] */
+
+/*
+ * R16512 (0x4080) - LDO9 Control
+ */
+#define WM831X_LDO9_ERR_ACT_MASK                0xC000  /* LDO9_ERR_ACT - [15:14] */
+#define WM831X_LDO9_ERR_ACT_SHIFT                   14  /* LDO9_ERR_ACT - [15:14] */
+#define WM831X_LDO9_ERR_ACT_WIDTH                    2  /* LDO9_ERR_ACT - [15:14] */
+#define WM831X_LDO9_HWC_SRC_MASK                0x1800  /* LDO9_HWC_SRC - [12:11] */
+#define WM831X_LDO9_HWC_SRC_SHIFT                   11  /* LDO9_HWC_SRC - [12:11] */
+#define WM831X_LDO9_HWC_SRC_WIDTH                    2  /* LDO9_HWC_SRC - [12:11] */
+#define WM831X_LDO9_HWC_VSEL                    0x0400  /* LDO9_HWC_VSEL */
+#define WM831X_LDO9_HWC_VSEL_MASK               0x0400  /* LDO9_HWC_VSEL */
+#define WM831X_LDO9_HWC_VSEL_SHIFT                  10  /* LDO9_HWC_VSEL */
+#define WM831X_LDO9_HWC_VSEL_WIDTH                   1  /* LDO9_HWC_VSEL */
+#define WM831X_LDO9_HWC_MODE_MASK               0x0300  /* LDO9_HWC_MODE - [9:8] */
+#define WM831X_LDO9_HWC_MODE_SHIFT                   8  /* LDO9_HWC_MODE - [9:8] */
+#define WM831X_LDO9_HWC_MODE_WIDTH                   2  /* LDO9_HWC_MODE - [9:8] */
+#define WM831X_LDO9_FLT                         0x0080  /* LDO9_FLT */
+#define WM831X_LDO9_FLT_MASK                    0x0080  /* LDO9_FLT */
+#define WM831X_LDO9_FLT_SHIFT                        7  /* LDO9_FLT */
+#define WM831X_LDO9_FLT_WIDTH                        1  /* LDO9_FLT */
+#define WM831X_LDO9_SWI                         0x0040  /* LDO9_SWI */
+#define WM831X_LDO9_SWI_MASK                    0x0040  /* LDO9_SWI */
+#define WM831X_LDO9_SWI_SHIFT                        6  /* LDO9_SWI */
+#define WM831X_LDO9_SWI_WIDTH                        1  /* LDO9_SWI */
+
+/*
+ * R16513 (0x4081) - LDO9 ON Control
+ */
+#define WM831X_LDO9_ON_SLOT_MASK                0xE000  /* LDO9_ON_SLOT - [15:13] */
+#define WM831X_LDO9_ON_SLOT_SHIFT                   13  /* LDO9_ON_SLOT - [15:13] */
+#define WM831X_LDO9_ON_SLOT_WIDTH                    3  /* LDO9_ON_SLOT - [15:13] */
+#define WM831X_LDO9_ON_MODE                     0x0100  /* LDO9_ON_MODE */
+#define WM831X_LDO9_ON_MODE_MASK                0x0100  /* LDO9_ON_MODE */
+#define WM831X_LDO9_ON_MODE_SHIFT                    8  /* LDO9_ON_MODE */
+#define WM831X_LDO9_ON_MODE_WIDTH                    1  /* LDO9_ON_MODE */
+#define WM831X_LDO9_ON_VSEL_MASK                0x001F  /* LDO9_ON_VSEL - [4:0] */
+#define WM831X_LDO9_ON_VSEL_SHIFT                    0  /* LDO9_ON_VSEL - [4:0] */
+#define WM831X_LDO9_ON_VSEL_WIDTH                    5  /* LDO9_ON_VSEL - [4:0] */
+
+/*
+ * R16514 (0x4082) - LDO9 SLEEP Control
+ */
+#define WM831X_LDO9_SLP_SLOT_MASK               0xE000  /* LDO9_SLP_SLOT - [15:13] */
+#define WM831X_LDO9_SLP_SLOT_SHIFT                  13  /* LDO9_SLP_SLOT - [15:13] */
+#define WM831X_LDO9_SLP_SLOT_WIDTH                   3  /* LDO9_SLP_SLOT - [15:13] */
+#define WM831X_LDO9_SLP_MODE                    0x0100  /* LDO9_SLP_MODE */
+#define WM831X_LDO9_SLP_MODE_MASK               0x0100  /* LDO9_SLP_MODE */
+#define WM831X_LDO9_SLP_MODE_SHIFT                   8  /* LDO9_SLP_MODE */
+#define WM831X_LDO9_SLP_MODE_WIDTH                   1  /* LDO9_SLP_MODE */
+#define WM831X_LDO9_SLP_VSEL_MASK               0x001F  /* LDO9_SLP_VSEL - [4:0] */
+#define WM831X_LDO9_SLP_VSEL_SHIFT                   0  /* LDO9_SLP_VSEL - [4:0] */
+#define WM831X_LDO9_SLP_VSEL_WIDTH                   5  /* LDO9_SLP_VSEL - [4:0] */
+
+/*
+ * R16515 (0x4083) - LDO10 Control
+ */
+#define WM831X_LDO10_ERR_ACT_MASK               0xC000  /* LDO10_ERR_ACT - [15:14] */
+#define WM831X_LDO10_ERR_ACT_SHIFT                  14  /* LDO10_ERR_ACT - [15:14] */
+#define WM831X_LDO10_ERR_ACT_WIDTH                   2  /* LDO10_ERR_ACT - [15:14] */
+#define WM831X_LDO10_HWC_SRC_MASK               0x1800  /* LDO10_HWC_SRC - [12:11] */
+#define WM831X_LDO10_HWC_SRC_SHIFT                  11  /* LDO10_HWC_SRC - [12:11] */
+#define WM831X_LDO10_HWC_SRC_WIDTH                   2  /* LDO10_HWC_SRC - [12:11] */
+#define WM831X_LDO10_HWC_VSEL                   0x0400  /* LDO10_HWC_VSEL */
+#define WM831X_LDO10_HWC_VSEL_MASK              0x0400  /* LDO10_HWC_VSEL */
+#define WM831X_LDO10_HWC_VSEL_SHIFT                 10  /* LDO10_HWC_VSEL */
+#define WM831X_LDO10_HWC_VSEL_WIDTH                  1  /* LDO10_HWC_VSEL */
+#define WM831X_LDO10_HWC_MODE_MASK              0x0300  /* LDO10_HWC_MODE - [9:8] */
+#define WM831X_LDO10_HWC_MODE_SHIFT                  8  /* LDO10_HWC_MODE - [9:8] */
+#define WM831X_LDO10_HWC_MODE_WIDTH                  2  /* LDO10_HWC_MODE - [9:8] */
+#define WM831X_LDO10_FLT                        0x0080  /* LDO10_FLT */
+#define WM831X_LDO10_FLT_MASK                   0x0080  /* LDO10_FLT */
+#define WM831X_LDO10_FLT_SHIFT                       7  /* LDO10_FLT */
+#define WM831X_LDO10_FLT_WIDTH                       1  /* LDO10_FLT */
+#define WM831X_LDO10_SWI                        0x0040  /* LDO10_SWI */
+#define WM831X_LDO10_SWI_MASK                   0x0040  /* LDO10_SWI */
+#define WM831X_LDO10_SWI_SHIFT                       6  /* LDO10_SWI */
+#define WM831X_LDO10_SWI_WIDTH                       1  /* LDO10_SWI */
+
+/*
+ * R16516 (0x4084) - LDO10 ON Control
+ */
+#define WM831X_LDO10_ON_SLOT_MASK               0xE000  /* LDO10_ON_SLOT - [15:13] */
+#define WM831X_LDO10_ON_SLOT_SHIFT                  13  /* LDO10_ON_SLOT - [15:13] */
+#define WM831X_LDO10_ON_SLOT_WIDTH                   3  /* LDO10_ON_SLOT - [15:13] */
+#define WM831X_LDO10_ON_MODE                    0x0100  /* LDO10_ON_MODE */
+#define WM831X_LDO10_ON_MODE_MASK               0x0100  /* LDO10_ON_MODE */
+#define WM831X_LDO10_ON_MODE_SHIFT                   8  /* LDO10_ON_MODE */
+#define WM831X_LDO10_ON_MODE_WIDTH                   1  /* LDO10_ON_MODE */
+#define WM831X_LDO10_ON_VSEL_MASK               0x001F  /* LDO10_ON_VSEL - [4:0] */
+#define WM831X_LDO10_ON_VSEL_SHIFT                   0  /* LDO10_ON_VSEL - [4:0] */
+#define WM831X_LDO10_ON_VSEL_WIDTH                   5  /* LDO10_ON_VSEL - [4:0] */
+
+/*
+ * R16517 (0x4085) - LDO10 SLEEP Control
+ */
+#define WM831X_LDO10_SLP_SLOT_MASK              0xE000  /* LDO10_SLP_SLOT - [15:13] */
+#define WM831X_LDO10_SLP_SLOT_SHIFT                 13  /* LDO10_SLP_SLOT - [15:13] */
+#define WM831X_LDO10_SLP_SLOT_WIDTH                  3  /* LDO10_SLP_SLOT - [15:13] */
+#define WM831X_LDO10_SLP_MODE                   0x0100  /* LDO10_SLP_MODE */
+#define WM831X_LDO10_SLP_MODE_MASK              0x0100  /* LDO10_SLP_MODE */
+#define WM831X_LDO10_SLP_MODE_SHIFT                  8  /* LDO10_SLP_MODE */
+#define WM831X_LDO10_SLP_MODE_WIDTH                  1  /* LDO10_SLP_MODE */
+#define WM831X_LDO10_SLP_VSEL_MASK              0x001F  /* LDO10_SLP_VSEL - [4:0] */
+#define WM831X_LDO10_SLP_VSEL_SHIFT                  0  /* LDO10_SLP_VSEL - [4:0] */
+#define WM831X_LDO10_SLP_VSEL_WIDTH                  5  /* LDO10_SLP_VSEL - [4:0] */
+
+/*
+ * R16519 (0x4087) - LDO11 ON Control
+ */
+#define WM831X_LDO11_ON_SLOT_MASK               0xE000  /* LDO11_ON_SLOT - [15:13] */
+#define WM831X_LDO11_ON_SLOT_SHIFT                  13  /* LDO11_ON_SLOT - [15:13] */
+#define WM831X_LDO11_ON_SLOT_WIDTH                   3  /* LDO11_ON_SLOT - [15:13] */
+#define WM831X_LDO11_OFFENA                     0x1000  /* LDO11_OFFENA */
+#define WM831X_LDO11_OFFENA_MASK                0x1000  /* LDO11_OFFENA */
+#define WM831X_LDO11_OFFENA_SHIFT                   12  /* LDO11_OFFENA */
+#define WM831X_LDO11_OFFENA_WIDTH                    1  /* LDO11_OFFENA */
+#define WM831X_LDO11_VSEL_SRC                   0x0080  /* LDO11_VSEL_SRC */
+#define WM831X_LDO11_VSEL_SRC_MASK              0x0080  /* LDO11_VSEL_SRC */
+#define WM831X_LDO11_VSEL_SRC_SHIFT                  7  /* LDO11_VSEL_SRC */
+#define WM831X_LDO11_VSEL_SRC_WIDTH                  1  /* LDO11_VSEL_SRC */
+#define WM831X_LDO11_ON_VSEL_MASK               0x000F  /* LDO11_ON_VSEL - [3:0] */
+#define WM831X_LDO11_ON_VSEL_SHIFT                   0  /* LDO11_ON_VSEL - [3:0] */
+#define WM831X_LDO11_ON_VSEL_WIDTH                   4  /* LDO11_ON_VSEL - [3:0] */
+
+/*
+ * R16520 (0x4088) - LDO11 SLEEP Control
+ */
+#define WM831X_LDO11_SLP_SLOT_MASK              0xE000  /* LDO11_SLP_SLOT - [15:13] */
+#define WM831X_LDO11_SLP_SLOT_SHIFT                 13  /* LDO11_SLP_SLOT - [15:13] */
+#define WM831X_LDO11_SLP_SLOT_WIDTH                  3  /* LDO11_SLP_SLOT - [15:13] */
+#define WM831X_LDO11_SLP_VSEL_MASK              0x000F  /* LDO11_SLP_VSEL - [3:0] */
+#define WM831X_LDO11_SLP_VSEL_SHIFT                  0  /* LDO11_SLP_VSEL - [3:0] */
+#define WM831X_LDO11_SLP_VSEL_WIDTH                  4  /* LDO11_SLP_VSEL - [3:0] */
+
 /*
  * R16526 (0x408E) - Power Good Source 1
  */
@@ -586,6 +1168,50 @@
 #define WM831X_DC1_OK_SHIFT                          0  /* DC1_OK */
 #define WM831X_DC1_OK_WIDTH                          1  /* DC1_OK */
 
+/*
+ * R16527 (0x408F) - Power Good Source 2
+ */
+#define WM831X_LDO10_OK                         0x0200  /* LDO10_OK */
+#define WM831X_LDO10_OK_MASK                    0x0200  /* LDO10_OK */
+#define WM831X_LDO10_OK_SHIFT                        9  /* LDO10_OK */
+#define WM831X_LDO10_OK_WIDTH                        1  /* LDO10_OK */
+#define WM831X_LDO9_OK                          0x0100  /* LDO9_OK */
+#define WM831X_LDO9_OK_MASK                     0x0100  /* LDO9_OK */
+#define WM831X_LDO9_OK_SHIFT                         8  /* LDO9_OK */
+#define WM831X_LDO9_OK_WIDTH                         1  /* LDO9_OK */
+#define WM831X_LDO8_OK                          0x0080  /* LDO8_OK */
+#define WM831X_LDO8_OK_MASK                     0x0080  /* LDO8_OK */
+#define WM831X_LDO8_OK_SHIFT                         7  /* LDO8_OK */
+#define WM831X_LDO8_OK_WIDTH                         1  /* LDO8_OK */
+#define WM831X_LDO7_OK                          0x0040  /* LDO7_OK */
+#define WM831X_LDO7_OK_MASK                     0x0040  /* LDO7_OK */
+#define WM831X_LDO7_OK_SHIFT                         6  /* LDO7_OK */
+#define WM831X_LDO7_OK_WIDTH                         1  /* LDO7_OK */
+#define WM831X_LDO6_OK                          0x0020  /* LDO6_OK */
+#define WM831X_LDO6_OK_MASK                     0x0020  /* LDO6_OK */
+#define WM831X_LDO6_OK_SHIFT                         5  /* LDO6_OK */
+#define WM831X_LDO6_OK_WIDTH                         1  /* LDO6_OK */
+#define WM831X_LDO5_OK                          0x0010  /* LDO5_OK */
+#define WM831X_LDO5_OK_MASK                     0x0010  /* LDO5_OK */
+#define WM831X_LDO5_OK_SHIFT                         4  /* LDO5_OK */
+#define WM831X_LDO5_OK_WIDTH                         1  /* LDO5_OK */
+#define WM831X_LDO4_OK                          0x0008  /* LDO4_OK */
+#define WM831X_LDO4_OK_MASK                     0x0008  /* LDO4_OK */
+#define WM831X_LDO4_OK_SHIFT                         3  /* LDO4_OK */
+#define WM831X_LDO4_OK_WIDTH                         1  /* LDO4_OK */
+#define WM831X_LDO3_OK                          0x0004  /* LDO3_OK */
+#define WM831X_LDO3_OK_MASK                     0x0004  /* LDO3_OK */
+#define WM831X_LDO3_OK_SHIFT                         2  /* LDO3_OK */
+#define WM831X_LDO3_OK_WIDTH                         1  /* LDO3_OK */
+#define WM831X_LDO2_OK                          0x0002  /* LDO2_OK */
+#define WM831X_LDO2_OK_MASK                     0x0002  /* LDO2_OK */
+#define WM831X_LDO2_OK_SHIFT                         1  /* LDO2_OK */
+#define WM831X_LDO2_OK_WIDTH                         1  /* LDO2_OK */
+#define WM831X_LDO1_OK                          0x0001  /* LDO1_OK */
+#define WM831X_LDO1_OK_MASK                     0x0001  /* LDO1_OK */
+#define WM831X_LDO1_OK_SHIFT                         0  /* LDO1_OK */
+#define WM831X_LDO1_OK_WIDTH                         1  /* LDO1_OK */
+
 #define WM831X_ISINK_MAX_ISEL 56
 extern int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL];
 
-- 
cgit v1.2.3-71-gd317


From 956f25a6778a2510d52973ab8a3ac2e03e2c3704 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Thu, 13 Aug 2009 11:49:23 +0200
Subject: mfd: AB3100 accessor function cleanups

This adds the _interruptible suffix to the AB3100 accessor
functions on par with mutex_lock_interruptible() that's used
for blocking simultaneous calls to the AB3100 acessor functions.
Since these accesses are slow on a 100kHz I2C bus and may line
up waiting for the mutex, we need to handle interruption by
system shutdown or kill signals and may just as well denote that
in the function names.

Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/ab3100-core.c  | 43 ++++++++++++++++++++++++-------------------
 include/linux/mfd/ab3100.h |  8 ++++----
 2 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
index 8ff10cb77cac..ffe4b6415465 100644
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -77,7 +77,7 @@ u8 ab3100_get_chip_type(struct ab3100 *ab3100)
 }
 EXPORT_SYMBOL(ab3100_get_chip_type);
 
-int ab3100_set_register(struct ab3100 *ab3100, u8 reg, u8 regval)
+int ab3100_set_register_interruptible(struct ab3100 *ab3100, u8 reg, u8 regval)
 {
 	u8 regandval[2] = {reg, regval};
 	int err;
@@ -109,7 +109,8 @@ int ab3100_set_register(struct ab3100 *ab3100, u8 reg, u8 regval)
 	mutex_unlock(&ab3100->access_mutex);
 	return 0;
 }
-EXPORT_SYMBOL(ab3100_set_register);
+EXPORT_SYMBOL(ab3100_set_register_interruptible);
+
 
 /*
  * The test registers exist at an I2C bus address up one
@@ -118,7 +119,7 @@ EXPORT_SYMBOL(ab3100_set_register);
  * anyway. It's currently only used from this file so declare
  * it static and do not export.
  */
-static int ab3100_set_test_register(struct ab3100 *ab3100,
+static int ab3100_set_test_register_interruptible(struct ab3100 *ab3100,
 				    u8 reg, u8 regval)
 {
 	u8 regandval[2] = {reg, regval};
@@ -148,7 +149,8 @@ static int ab3100_set_test_register(struct ab3100 *ab3100,
 	return err;
 }
 
-int ab3100_get_register(struct ab3100 *ab3100, u8 reg, u8 *regval)
+
+int ab3100_get_register_interruptible(struct ab3100 *ab3100, u8 reg, u8 *regval)
 {
 	int err;
 
@@ -202,9 +204,10 @@ int ab3100_get_register(struct ab3100 *ab3100, u8 reg, u8 *regval)
 	mutex_unlock(&ab3100->access_mutex);
 	return err;
 }
-EXPORT_SYMBOL(ab3100_get_register);
+EXPORT_SYMBOL(ab3100_get_register_interruptible);
+
 
-int ab3100_get_register_page(struct ab3100 *ab3100,
+int ab3100_get_register_page_interruptible(struct ab3100 *ab3100,
 			     u8 first_reg, u8 *regvals, u8 numregs)
 {
 	int err;
@@ -258,9 +261,10 @@ int ab3100_get_register_page(struct ab3100 *ab3100,
 	mutex_unlock(&ab3100->access_mutex);
 	return err;
 }
-EXPORT_SYMBOL(ab3100_get_register_page);
+EXPORT_SYMBOL(ab3100_get_register_page_interruptible);
 
-int ab3100_mask_and_set_register(struct ab3100 *ab3100,
+
+int ab3100_mask_and_set_register_interruptible(struct ab3100 *ab3100,
 				 u8 reg, u8 andmask, u8 ormask)
 {
 	u8 regandval[2] = {reg, 0};
@@ -328,7 +332,8 @@ int ab3100_mask_and_set_register(struct ab3100 *ab3100,
 	mutex_unlock(&ab3100->access_mutex);
 	return err;
 }
-EXPORT_SYMBOL(ab3100_mask_and_set_register);
+EXPORT_SYMBOL(ab3100_mask_and_set_register_interruptible);
+
 
 /*
  * Register a simple callback for handling any AB3100 events.
@@ -371,7 +376,7 @@ static void ab3100_work(struct work_struct *work)
 	u32 fatevent;
 	int err;
 
-	err = ab3100_get_register_page(ab3100, AB3100_EVENTA1,
+	err = ab3100_get_register_page_interruptible(ab3100, AB3100_EVENTA1,
 				       event_regs, 3);
 	if (err)
 		goto err_event_wq;
@@ -435,7 +440,7 @@ static int ab3100_registers_print(struct seq_file *s, void *p)
 	seq_printf(s, "AB3100 registers:\n");
 
 	for (reg = 0; reg < 0xff; reg++) {
-		ab3100_get_register(ab3100, reg, &value);
+		ab3100_get_register_interruptible(ab3100, reg, &value);
 		seq_printf(s, "[0x%x]:  0x%x\n", reg, value);
 	}
 	return 0;
@@ -515,7 +520,7 @@ static ssize_t ab3100_get_set_reg(struct file *file,
 		u8 reg = (u8) user_reg;
 		u8 regvalue;
 
-		ab3100_get_register(ab3100, reg, &regvalue);
+		ab3100_get_register_interruptible(ab3100, reg, &regvalue);
 
 		dev_info(ab3100->dev,
 			 "debug read AB3100 reg[0x%02x]: 0x%02x\n",
@@ -547,8 +552,8 @@ static ssize_t ab3100_get_set_reg(struct file *file,
 			return -EINVAL;
 
 		value = (u8) user_value;
-		ab3100_set_register(ab3100, reg, value);
-		ab3100_get_register(ab3100, reg, &regvalue);
+		ab3100_set_register_interruptible(ab3100, reg, value);
+		ab3100_get_register_interruptible(ab3100, reg, &regvalue);
 
 		dev_info(ab3100->dev,
 			 "debug write reg[0x%02x] with 0x%02x, "
@@ -696,7 +701,7 @@ static int __init ab3100_setup(struct ab3100 *ab3100)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(ab3100_init_settings); i++) {
-		err = ab3100_set_register(ab3100,
+		err = ab3100_set_register_interruptible(ab3100,
 					  ab3100_init_settings[i].abreg,
 					  ab3100_init_settings[i].setting);
 		if (err)
@@ -705,14 +710,14 @@ static int __init ab3100_setup(struct ab3100 *ab3100)
 
 	/*
 	 * Special trick to make the AB3100 use the 32kHz clock (RTC)
-	 * bit 3 in test registe 0x02 is a special, undocumented test
+	 * bit 3 in test register 0x02 is a special, undocumented test
 	 * register bit that only exist in AB3100 P1E
 	 */
 	if (ab3100->chip_id == 0xc4) {
 		dev_warn(ab3100->dev,
 			 "AB3100 P1E variant detected, "
 			 "forcing chip to 32KHz\n");
-		err = ab3100_set_test_register(ab3100, 0x02, 0x08);
+		err = ab3100_set_test_register_interruptible(ab3100, 0x02, 0x08);
 	}
 
  exit_no_setup:
@@ -852,8 +857,8 @@ static int __init ab3100_probe(struct i2c_client *client,
 	i2c_set_clientdata(client, ab3100);
 
 	/* Read chip ID register */
-	err = ab3100_get_register(ab3100, AB3100_CID,
-				  &ab3100->chip_id);
+	err = ab3100_get_register_interruptible(ab3100, AB3100_CID,
+						&ab3100->chip_id);
 	if (err) {
 		dev_err(&client->dev,
 			"could not communicate with the AB3100 analog "
diff --git a/include/linux/mfd/ab3100.h b/include/linux/mfd/ab3100.h
index 7a3f316e3848..56343b8013b5 100644
--- a/include/linux/mfd/ab3100.h
+++ b/include/linux/mfd/ab3100.h
@@ -86,11 +86,11 @@ struct ab3100 {
 	bool startup_events_read;
 };
 
-int ab3100_set_register(struct ab3100 *ab3100, u8 reg, u8 regval);
-int ab3100_get_register(struct ab3100 *ab3100, u8 reg, u8 *regval);
-int ab3100_get_register_page(struct ab3100 *ab3100,
+int ab3100_set_register_interruptible(struct ab3100 *ab3100, u8 reg, u8 regval);
+int ab3100_get_register_interruptible(struct ab3100 *ab3100, u8 reg, u8 *regval);
+int ab3100_get_register_page_interruptible(struct ab3100 *ab3100,
 			     u8 first_reg, u8 *regvals, u8 numregs);
-int ab3100_mask_and_set_register(struct ab3100 *ab3100,
+int ab3100_mask_and_set_register_interruptible(struct ab3100 *ab3100,
 				 u8 reg, u8 andmask, u8 ormask);
 u8 ab3100_get_chip_type(struct ab3100 *ab3100);
 int ab3100_event_register(struct ab3100 *ab3100,
-- 
cgit v1.2.3-71-gd317


From 8238addcc52c94c59b10c3c1e9850d3a7921f825 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Wed, 19 Aug 2009 01:40:28 +0200
Subject: mfd: Add Freescale MC13783 driver

This driver provides the core Freescale MC13783 support. It
registers the client platform_devices and provides access
to the A/D converter.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/Kconfig                 |  10 +
 drivers/mfd/Makefile                |   2 +
 drivers/mfd/mc13783-core.c          | 427 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/mc13783-private.h | 396 +++++++++++++++++++++++++++++++++
 include/linux/mfd/mc13783.h         |  84 +++++++
 5 files changed, 919 insertions(+)
 create mode 100644 drivers/mfd/mc13783-core.c
 create mode 100644 include/linux/mfd/mc13783-private.h
 create mode 100644 include/linux/mfd/mc13783.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 0273456af77f..a4f3dff30ba5 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -238,6 +238,16 @@ config MFD_PCF50633
 	  facilities, and registers devices for the various functions
 	  so that function-specific drivers can bind to them.
 
+config MFD_MC13783
+	tristate "Support Freescale MC13783"
+	depends on SPI_MASTER
+	select MFD_CORE
+	help
+	  Support for the Freescale (Atlas) MC13783 PMIC and audio CODEC.
+	  This driver provides common support for accessing  the device,
+	  additional drivers must be enabled in order to use the
+	  functionality of the device.
+
 config PCF50633_ADC
 	tristate "Support for NXP PCF50633 ADC"
 	depends on MFD_PCF50633
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 285cb320e6a6..7fec04fb5f47 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -26,6 +26,8 @@ obj-$(CONFIG_MENELAUS)		+= menelaus.o
 
 obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o twl4030-irq.o
 
+obj-$(CONFIG_MFD_MC13783)	+= mc13783-core.o
+
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
 
 obj-$(CONFIG_EZX_PCAP)		+= ezx-pcap.o
diff --git a/drivers/mfd/mc13783-core.c b/drivers/mfd/mc13783-core.c
new file mode 100644
index 000000000000..e354d2912ef1
--- /dev/null
+++ b/drivers/mfd/mc13783-core.c
@@ -0,0 +1,427 @@
+/*
+ * Copyright 2009 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
+ *
+ * This code is in parts based on wm8350-core.c and pcf50633-core.c
+ *
+ * Initial development of this code was funded by
+ * Phytec Messtechnik GmbH, http://www.phytec.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/mfd/mc13783-private.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/mc13783.h>
+#include <linux/completion.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/spi/spi.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+
+#define MC13783_MAX_REG_NUM	0x3f
+#define MC13783_FRAME_MASK	0x00ffffff
+#define MC13783_MAX_REG_NUM	0x3f
+#define MC13783_REG_NUM_SHIFT	0x19
+#define MC13783_WRITE_BIT_SHIFT	31
+
+static inline int spi_rw(struct spi_device *spi, u8 * buf, size_t len)
+{
+	struct spi_transfer t = {
+		.tx_buf = (const void *)buf,
+		.rx_buf = buf,
+		.len = len,
+		.cs_change = 0,
+		.delay_usecs = 0,
+	};
+	struct spi_message m;
+
+	spi_message_init(&m);
+	spi_message_add_tail(&t, &m);
+	if (spi_sync(spi, &m) != 0 || m.status != 0)
+		return -EINVAL;
+	return len - m.actual_length;
+}
+
+static int mc13783_read(struct mc13783 *mc13783, int reg_num, u32 *reg_val)
+{
+	unsigned int frame = 0;
+	int ret = 0;
+
+	if (reg_num > MC13783_MAX_REG_NUM)
+		return -EINVAL;
+
+	frame |= reg_num << MC13783_REG_NUM_SHIFT;
+
+	ret = spi_rw(mc13783->spi_device, (u8 *)&frame, 4);
+
+	*reg_val = frame & MC13783_FRAME_MASK;
+
+	return ret;
+}
+
+static int mc13783_write(struct mc13783 *mc13783, int reg_num, u32 reg_val)
+{
+	unsigned int frame = 0;
+
+	if (reg_num > MC13783_MAX_REG_NUM)
+		return -EINVAL;
+
+	frame |= (1 << MC13783_WRITE_BIT_SHIFT);
+	frame |= reg_num << MC13783_REG_NUM_SHIFT;
+	frame |= reg_val & MC13783_FRAME_MASK;
+
+	return spi_rw(mc13783->spi_device, (u8 *)&frame, 4);
+}
+
+int mc13783_reg_read(struct mc13783 *mc13783, int reg_num, u32 *reg_val)
+{
+	int ret;
+
+	mutex_lock(&mc13783->io_lock);
+	ret = mc13783_read(mc13783, reg_num, reg_val);
+	mutex_unlock(&mc13783->io_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mc13783_reg_read);
+
+int mc13783_reg_write(struct mc13783 *mc13783, int reg_num, u32 reg_val)
+{
+	int ret;
+
+	mutex_lock(&mc13783->io_lock);
+	ret = mc13783_write(mc13783, reg_num, reg_val);
+	mutex_unlock(&mc13783->io_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mc13783_reg_write);
+
+/**
+ * mc13783_set_bits - Bitmask write
+ *
+ * @mc13783: Pointer to mc13783 control structure
+ * @reg:    Register to access
+ * @mask:   Mask of bits to change
+ * @val:    Value to set for masked bits
+ */
+int mc13783_set_bits(struct mc13783 *mc13783, int reg, u32 mask, u32 val)
+{
+	u32 tmp;
+	int ret;
+
+	mutex_lock(&mc13783->io_lock);
+
+	ret = mc13783_read(mc13783, reg, &tmp);
+	tmp = (tmp & ~mask) | val;
+	if (ret == 0)
+		ret = mc13783_write(mc13783, reg, tmp);
+
+	mutex_unlock(&mc13783->io_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mc13783_set_bits);
+
+int mc13783_register_irq(struct mc13783 *mc13783, int irq,
+		void (*handler) (int, void *), void *data)
+{
+	if (irq < 0 || irq > MC13783_NUM_IRQ || !handler)
+		return -EINVAL;
+
+	if (WARN_ON(mc13783->irq_handler[irq].handler))
+		return -EBUSY;
+
+	mutex_lock(&mc13783->io_lock);
+	mc13783->irq_handler[irq].handler = handler;
+	mc13783->irq_handler[irq].data = data;
+	mutex_unlock(&mc13783->io_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mc13783_register_irq);
+
+int mc13783_free_irq(struct mc13783 *mc13783, int irq)
+{
+	if (irq < 0 || irq > MC13783_NUM_IRQ)
+		return -EINVAL;
+
+	mutex_lock(&mc13783->io_lock);
+	mc13783->irq_handler[irq].handler = NULL;
+	mutex_unlock(&mc13783->io_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mc13783_free_irq);
+
+static void mc13783_irq_work(struct work_struct *work)
+{
+	struct mc13783 *mc13783 = container_of(work, struct mc13783, work);
+	int i;
+	unsigned int adc_sts;
+
+	/* check if the adc has finished any completion */
+	mc13783_reg_read(mc13783, MC13783_REG_INTERRUPT_STATUS_0, &adc_sts);
+	mc13783_reg_write(mc13783, MC13783_REG_INTERRUPT_STATUS_0,
+			adc_sts & MC13783_INT_STAT_ADCDONEI);
+
+	if (adc_sts & MC13783_INT_STAT_ADCDONEI)
+		complete_all(&mc13783->adc_done);
+
+	for (i = 0; i < MC13783_NUM_IRQ; i++)
+		if (mc13783->irq_handler[i].handler)
+			mc13783->irq_handler[i].handler(i,
+					mc13783->irq_handler[i].data);
+	enable_irq(mc13783->irq);
+}
+
+static irqreturn_t mc13783_interrupt(int irq, void *dev_id)
+{
+	struct mc13783 *mc13783 = dev_id;
+
+	disable_irq_nosync(irq);
+
+	schedule_work(&mc13783->work);
+	return IRQ_HANDLED;
+}
+
+/* set adc to ts interrupt mode, which generates touchscreen wakeup interrupt */
+static inline void mc13783_adc_set_ts_irq_mode(struct mc13783 *mc13783)
+{
+	unsigned int reg_adc0, reg_adc1;
+
+	reg_adc0 = MC13783_ADC0_ADREFEN | MC13783_ADC0_ADREFMODE
+			| MC13783_ADC0_TSMOD0;
+	reg_adc1 = MC13783_ADC1_ADEN | MC13783_ADC1_ADTRIGIGN;
+
+	mc13783_reg_write(mc13783, MC13783_REG_ADC_0, reg_adc0);
+	mc13783_reg_write(mc13783, MC13783_REG_ADC_1, reg_adc1);
+}
+
+int mc13783_adc_do_conversion(struct mc13783 *mc13783, unsigned int mode,
+		unsigned int channel, unsigned int *sample)
+{
+	unsigned int reg_adc0, reg_adc1;
+	int i;
+
+	mutex_lock(&mc13783->adc_conv_lock);
+
+	/* set up auto incrementing anyway to make quick read */
+	reg_adc0 =  MC13783_ADC0_ADINC1 | MC13783_ADC0_ADINC2;
+	/* enable the adc, ignore external triggering and set ASC to trigger
+	 * conversion */
+	reg_adc1 =  MC13783_ADC1_ADEN | MC13783_ADC1_ADTRIGIGN
+		| MC13783_ADC1_ASC;
+
+	/* setup channel number */
+	if (channel > 7)
+		reg_adc1 |= MC13783_ADC1_ADSEL;
+
+	switch (mode) {
+	case MC13783_ADC_MODE_TS:
+		/* enables touch screen reference mode and set touchscreen mode
+		 * to position mode */
+		reg_adc0 |= MC13783_ADC0_ADREFEN | MC13783_ADC0_ADREFMODE
+			| MC13783_ADC0_TSMOD0 | MC13783_ADC0_TSMOD1;
+		reg_adc1 |= 4 << MC13783_ADC1_CHAN1_SHIFT;
+		break;
+	case MC13783_ADC_MODE_SINGLE_CHAN:
+		reg_adc1 |= (channel & 0x7) << MC13783_ADC1_CHAN0_SHIFT;
+		reg_adc1 |= MC13783_ADC1_RAND;
+		break;
+	case MC13783_ADC_MODE_MULT_CHAN:
+		reg_adc1 |= 4 << MC13783_ADC1_CHAN1_SHIFT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	mc13783_reg_write(mc13783, MC13783_REG_ADC_0, reg_adc0);
+	mc13783_reg_write(mc13783, MC13783_REG_ADC_1, reg_adc1);
+
+	wait_for_completion_interruptible(&mc13783->adc_done);
+
+	for (i = 0; i < 4; i++)
+		mc13783_reg_read(mc13783, MC13783_REG_ADC_2, &sample[i]);
+
+	if (mc13783->ts_active)
+		mc13783_adc_set_ts_irq_mode(mc13783);
+
+	mutex_unlock(&mc13783->adc_conv_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mc13783_adc_do_conversion);
+
+void mc13783_adc_set_ts_status(struct mc13783 *mc13783, unsigned int status)
+{
+	mc13783->ts_active = status;
+}
+EXPORT_SYMBOL_GPL(mc13783_adc_set_ts_status);
+
+static int mc13783_check_revision(struct mc13783 *mc13783)
+{
+	u32 rev_id, rev1, rev2, finid, icid;
+
+	mc13783_read(mc13783, MC13783_REG_REVISION, &rev_id);
+
+	rev1 = (rev_id & 0x018) >> 3;
+	rev2 = (rev_id & 0x007);
+	icid = (rev_id & 0x01C0) >> 6;
+	finid = (rev_id & 0x01E00) >> 9;
+
+	/* Ver 0.2 is actually 3.2a.  Report as 3.2 */
+	if ((rev1 == 0) && (rev2 == 2))
+		rev1 = 3;
+
+	if (rev1 == 0 || icid != 2) {
+		dev_err(mc13783->dev, "No MC13783 detected.\n");
+		return -ENODEV;
+	}
+
+	mc13783->revision = ((rev1 * 10) + rev2);
+	dev_info(mc13783->dev, "MC13783 Rev %d.%d FinVer %x detected\n", rev1,
+	       rev2, finid);
+
+	return 0;
+}
+
+/*
+ * Register a client device.  This is non-fatal since there is no need to
+ * fail the entire device init due to a single platform device failing.
+ */
+static void mc13783_client_dev_register(struct mc13783 *mc13783,
+				       const char *name)
+{
+	struct mfd_cell cell = {};
+
+	cell.name = name;
+
+	mfd_add_devices(mc13783->dev, -1, &cell, 1, NULL, 0);
+}
+
+static int __devinit mc13783_probe(struct spi_device *spi)
+{
+	struct mc13783 *mc13783;
+	struct mc13783_platform_data *pdata = spi->dev.platform_data;
+	int ret;
+
+	mc13783 = kzalloc(sizeof(struct mc13783), GFP_KERNEL);
+	if (!mc13783)
+		return -ENOMEM;
+
+	dev_set_drvdata(&spi->dev, mc13783);
+	spi->mode = SPI_MODE_0 | SPI_CS_HIGH;
+	spi->bits_per_word = 32;
+	spi_setup(spi);
+
+	mc13783->spi_device = spi;
+	mc13783->dev = &spi->dev;
+	mc13783->irq = spi->irq;
+
+	INIT_WORK(&mc13783->work, mc13783_irq_work);
+	mutex_init(&mc13783->io_lock);
+	mutex_init(&mc13783->adc_conv_lock);
+	init_completion(&mc13783->adc_done);
+
+	if (pdata) {
+		mc13783->flags = pdata->flags;
+		mc13783->regulators = pdata->regulators;
+		mc13783->num_regulators = pdata->num_regulators;
+	}
+
+	if (mc13783_check_revision(mc13783)) {
+		ret = -ENODEV;
+		goto err_out;
+	}
+
+	/* clear and mask all interrupts */
+	mc13783_reg_write(mc13783, MC13783_REG_INTERRUPT_STATUS_0, 0x00ffffff);
+	mc13783_reg_write(mc13783, MC13783_REG_INTERRUPT_MASK_0, 0x00ffffff);
+	mc13783_reg_write(mc13783, MC13783_REG_INTERRUPT_STATUS_1, 0x00ffffff);
+	mc13783_reg_write(mc13783, MC13783_REG_INTERRUPT_MASK_1, 0x00ffffff);
+
+	/* unmask adcdone interrupts */
+	mc13783_set_bits(mc13783, MC13783_REG_INTERRUPT_MASK_0,
+			MC13783_INT_MASK_ADCDONEM, 0);
+
+	ret = request_irq(mc13783->irq, mc13783_interrupt,
+			IRQF_DISABLED | IRQF_TRIGGER_HIGH, "mc13783",
+			mc13783);
+	if (ret)
+		goto err_out;
+
+	if (mc13783->flags & MC13783_USE_CODEC)
+		mc13783_client_dev_register(mc13783, "mc13783-codec");
+	if (mc13783->flags & MC13783_USE_ADC)
+		mc13783_client_dev_register(mc13783, "mc13783-adc");
+	if (mc13783->flags & MC13783_USE_RTC)
+		mc13783_client_dev_register(mc13783, "mc13783-rtc");
+	if (mc13783->flags & MC13783_USE_REGULATOR)
+		mc13783_client_dev_register(mc13783, "mc13783-regulator");
+	if (mc13783->flags & MC13783_USE_TOUCHSCREEN)
+		mc13783_client_dev_register(mc13783, "mc13783-ts");
+
+	return 0;
+
+err_out:
+	kfree(mc13783);
+	return ret;
+}
+
+static int __devexit mc13783_remove(struct spi_device *spi)
+{
+	struct mc13783 *mc13783;
+
+	mc13783 = dev_get_drvdata(&spi->dev);
+
+	free_irq(mc13783->irq, mc13783);
+
+	mfd_remove_devices(&spi->dev);
+
+	return 0;
+}
+
+static struct spi_driver pmic_driver = {
+	.driver = {
+		   .name = "mc13783",
+		   .bus = &spi_bus_type,
+		   .owner = THIS_MODULE,
+	},
+	.probe = mc13783_probe,
+	.remove = __devexit_p(mc13783_remove),
+};
+
+static int __init pmic_init(void)
+{
+	return spi_register_driver(&pmic_driver);
+}
+subsys_initcall(pmic_init);
+
+static void __exit pmic_exit(void)
+{
+	spi_unregister_driver(&pmic_driver);
+}
+module_exit(pmic_exit);
+
+MODULE_DESCRIPTION("Core/Protocol driver for Freescale MC13783 PMIC");
+MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>");
+MODULE_LICENSE("GPL");
+
diff --git a/include/linux/mfd/mc13783-private.h b/include/linux/mfd/mc13783-private.h
new file mode 100644
index 000000000000..47e698cb0f16
--- /dev/null
+++ b/include/linux/mfd/mc13783-private.h
@@ -0,0 +1,396 @@
+/*
+ * Copyright 2009 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
+ *
+ * Initial development of this code was funded by
+ * Phytec Messtechnik GmbH, http://www.phytec.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __LINUX_MFD_MC13783_PRIV_H
+#define __LINUX_MFD_MC13783_PRIV_H
+
+#include <linux/platform_device.h>
+#include <linux/mfd/mc13783.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+
+struct mc13783_irq {
+	void (*handler)(int, void *);
+	void *data;
+};
+
+#define MC13783_NUM_IRQ		2
+#define MC13783_IRQ_TS		0
+#define MC13783_IRQ_REGULATOR	1
+
+#define MC13783_ADC_MODE_TS		1
+#define MC13783_ADC_MODE_SINGLE_CHAN	2
+#define MC13783_ADC_MODE_MULT_CHAN	3
+
+struct mc13783 {
+	int revision;
+	struct device *dev;
+	struct spi_device *spi_device;
+
+	int (*read_dev)(void *data, char reg, int count, u32 *dst);
+	int (*write_dev)(void *data, char reg, int count, const u32 *src);
+
+	struct mutex io_lock;
+	void *io_data;
+	int irq;
+	unsigned int flags;
+
+	struct mc13783_irq irq_handler[MC13783_NUM_IRQ];
+	struct work_struct work;
+	struct completion adc_done;
+	unsigned int ts_active;
+	struct mutex adc_conv_lock;
+
+	struct mc13783_regulator_init_data *regulators;
+	int num_regulators;
+};
+
+int mc13783_reg_read(struct mc13783 *, int reg_num, u32 *);
+int mc13783_reg_write(struct mc13783 *, int, u32);
+int mc13783_set_bits(struct mc13783 *, int, u32, u32);
+int mc13783_free_irq(struct mc13783 *mc13783, int irq);
+int mc13783_register_irq(struct mc13783 *mc13783, int irq,
+		void (*handler) (int, void *), void *data);
+
+#define MC13783_REG_INTERRUPT_STATUS_0		 0
+#define MC13783_REG_INTERRUPT_MASK_0		 1
+#define MC13783_REG_INTERRUPT_SENSE_0		 2
+#define MC13783_REG_INTERRUPT_STATUS_1		 3
+#define MC13783_REG_INTERRUPT_MASK_1		 4
+#define MC13783_REG_INTERRUPT_SENSE_1		 5
+#define MC13783_REG_POWER_UP_MODE_SENSE		 6
+#define MC13783_REG_REVISION			 7
+#define MC13783_REG_SEMAPHORE			 8
+#define MC13783_REG_ARBITRATION_PERIPHERAL_AUDIO 9
+#define MC13783_REG_ARBITRATION_SWITCHERS	10
+#define MC13783_REG_ARBITRATION_REGULATORS_0	11
+#define MC13783_REG_ARBITRATION_REGULATORS_1	12
+#define MC13783_REG_POWER_CONTROL_0		13
+#define MC13783_REG_POWER_CONTROL_1		14
+#define MC13783_REG_POWER_CONTROL_2		15
+#define MC13783_REG_REGEN_ASSIGNMENT		16
+#define MC13783_REG_CONTROL_SPARE		17
+#define MC13783_REG_MEMORY_A			18
+#define MC13783_REG_MEMORY_B			19
+#define MC13783_REG_RTC_TIME			20
+#define MC13783_REG_RTC_ALARM			21
+#define MC13783_REG_RTC_DAY			22
+#define MC13783_REG_RTC_DAY_ALARM		23
+#define MC13783_REG_SWITCHERS_0			24
+#define MC13783_REG_SWITCHERS_1			25
+#define MC13783_REG_SWITCHERS_2			26
+#define MC13783_REG_SWITCHERS_3			27
+#define MC13783_REG_SWITCHERS_4			28
+#define MC13783_REG_SWITCHERS_5			29
+#define MC13783_REG_REGULATOR_SETTING_0		30
+#define MC13783_REG_REGULATOR_SETTING_1		31
+#define MC13783_REG_REGULATOR_MODE_0		32
+#define MC13783_REG_REGULATOR_MODE_1		33
+#define MC13783_REG_POWER_MISCELLANEOUS		34
+#define MC13783_REG_POWER_SPARE			35
+#define MC13783_REG_AUDIO_RX_0			36
+#define MC13783_REG_AUDIO_RX_1			37
+#define MC13783_REG_AUDIO_TX			38
+#define MC13783_REG_AUDIO_SSI_NETWORK		39
+#define MC13783_REG_AUDIO_CODEC			40
+#define MC13783_REG_AUDIO_STEREO_DAC		41
+#define MC13783_REG_AUDIO_SPARE			42
+#define MC13783_REG_ADC_0			43
+#define MC13783_REG_ADC_1			44
+#define MC13783_REG_ADC_2			45
+#define MC13783_REG_ADC_3			46
+#define MC13783_REG_ADC_4			47
+#define MC13783_REG_CHARGER			48
+#define MC13783_REG_USB				49
+#define MC13783_REG_CHARGE_USB_SPARE		50
+#define MC13783_REG_LED_CONTROL_0		51
+#define MC13783_REG_LED_CONTROL_1		52
+#define MC13783_REG_LED_CONTROL_2		53
+#define MC13783_REG_LED_CONTROL_3		54
+#define MC13783_REG_LED_CONTROL_4		55
+#define MC13783_REG_LED_CONTROL_5		56
+#define MC13783_REG_SPARE			57
+#define MC13783_REG_TRIM_0			58
+#define MC13783_REG_TRIM_1			59
+#define MC13783_REG_TEST_0			60
+#define MC13783_REG_TEST_1			61
+#define MC13783_REG_TEST_2			62
+#define MC13783_REG_TEST_3			63
+#define MC13783_REG_NB				64
+
+
+/*
+ * Interrupt Status
+ */
+#define MC13783_INT_STAT_ADCDONEI	(1 << 0)
+#define MC13783_INT_STAT_ADCBISDONEI	(1 << 1)
+#define MC13783_INT_STAT_TSI		(1 << 2)
+#define MC13783_INT_STAT_WHIGHI		(1 << 3)
+#define MC13783_INT_STAT_WLOWI		(1 << 4)
+#define MC13783_INT_STAT_CHGDETI	(1 << 6)
+#define MC13783_INT_STAT_CHGOVI		(1 << 7)
+#define MC13783_INT_STAT_CHGREVI	(1 << 8)
+#define MC13783_INT_STAT_CHGSHORTI	(1 << 9)
+#define MC13783_INT_STAT_CCCVI		(1 << 10)
+#define MC13783_INT_STAT_CHGCURRI	(1 << 11)
+#define MC13783_INT_STAT_BPONI		(1 << 12)
+#define MC13783_INT_STAT_LOBATLI	(1 << 13)
+#define MC13783_INT_STAT_LOBATHI	(1 << 14)
+#define MC13783_INT_STAT_UDPI		(1 << 15)
+#define MC13783_INT_STAT_USBI		(1 << 16)
+#define MC13783_INT_STAT_IDI		(1 << 19)
+#define MC13783_INT_STAT_Unused		(1 << 20)
+#define MC13783_INT_STAT_SE1I		(1 << 21)
+#define MC13783_INT_STAT_CKDETI		(1 << 22)
+#define MC13783_INT_STAT_UDMI		(1 << 23)
+
+/*
+ * Interrupt Mask
+ */
+#define MC13783_INT_MASK_ADCDONEM	(1 << 0)
+#define MC13783_INT_MASK_ADCBISDONEM	(1 << 1)
+#define MC13783_INT_MASK_TSM		(1 << 2)
+#define MC13783_INT_MASK_WHIGHM		(1 << 3)
+#define MC13783_INT_MASK_WLOWM		(1 << 4)
+#define MC13783_INT_MASK_CHGDETM	(1 << 6)
+#define MC13783_INT_MASK_CHGOVM		(1 << 7)
+#define MC13783_INT_MASK_CHGREVM	(1 << 8)
+#define MC13783_INT_MASK_CHGSHORTM	(1 << 9)
+#define MC13783_INT_MASK_CCCVM		(1 << 10)
+#define MC13783_INT_MASK_CHGCURRM	(1 << 11)
+#define MC13783_INT_MASK_BPONM		(1 << 12)
+#define MC13783_INT_MASK_LOBATLM	(1 << 13)
+#define MC13783_INT_MASK_LOBATHM	(1 << 14)
+#define MC13783_INT_MASK_UDPM		(1 << 15)
+#define MC13783_INT_MASK_USBM		(1 << 16)
+#define MC13783_INT_MASK_IDM		(1 << 19)
+#define MC13783_INT_MASK_SE1M		(1 << 21)
+#define MC13783_INT_MASK_CKDETM		(1 << 22)
+
+/*
+ * Reg Regulator Mode 0
+ */
+#define MC13783_REGCTRL_VAUDIO_EN	(1 << 0)
+#define MC13783_REGCTRL_VAUDIO_STBY	(1 << 1)
+#define MC13783_REGCTRL_VAUDIO_MODE	(1 << 2)
+#define MC13783_REGCTRL_VIOHI_EN	(1 << 3)
+#define MC13783_REGCTRL_VIOHI_STBY	(1 << 4)
+#define MC13783_REGCTRL_VIOHI_MODE	(1 << 5)
+#define MC13783_REGCTRL_VIOLO_EN	(1 << 6)
+#define MC13783_REGCTRL_VIOLO_STBY 	(1 << 7)
+#define MC13783_REGCTRL_VIOLO_MODE	(1 << 8)
+#define MC13783_REGCTRL_VDIG_EN		(1 << 9)
+#define MC13783_REGCTRL_VDIG_STBY	(1 << 10)
+#define MC13783_REGCTRL_VDIG_MODE	(1 << 11)
+#define MC13783_REGCTRL_VGEN_EN		(1 << 12)
+#define MC13783_REGCTRL_VGEN_STBY	(1 << 13)
+#define MC13783_REGCTRL_VGEN_MODE	(1 << 14)
+#define MC13783_REGCTRL_VRFDIG_EN	(1 << 15)
+#define MC13783_REGCTRL_VRFDIG_STBY	(1 << 16)
+#define MC13783_REGCTRL_VRFDIG_MODE	(1 << 17)
+#define MC13783_REGCTRL_VRFREF_EN	(1 << 18)
+#define MC13783_REGCTRL_VRFREF_STBY	(1 << 19)
+#define MC13783_REGCTRL_VRFREF_MODE	(1 << 20)
+#define MC13783_REGCTRL_VRFCP_EN	(1 << 21)
+#define MC13783_REGCTRL_VRFCP_STBY	(1 << 22)
+#define MC13783_REGCTRL_VRFCP_MODE	(1 << 23)
+
+/*
+ * Reg Regulator Mode 1
+ */
+#define MC13783_REGCTRL_VSIM_EN		(1 << 0)
+#define MC13783_REGCTRL_VSIM_STBY	(1 << 1)
+#define MC13783_REGCTRL_VSIM_MODE	(1 << 2)
+#define MC13783_REGCTRL_VESIM_EN	(1 << 3)
+#define MC13783_REGCTRL_VESIM_STBY	(1 << 4)
+#define MC13783_REGCTRL_VESIM_MODE	(1 << 5)
+#define MC13783_REGCTRL_VCAM_EN		(1 << 6)
+#define MC13783_REGCTRL_VCAM_STBY	(1 << 7)
+#define MC13783_REGCTRL_VCAM_MODE	(1 << 8)
+#define	MC13783_REGCTRL_VRFBG_EN	(1 << 9)
+#define MC13783_REGCTRL_VRFBG_STBY	(1 << 10)
+#define MC13783_REGCTRL_VVIB_EN		(1 << 11)
+#define MC13783_REGCTRL_VRF1_EN		(1 << 12)
+#define MC13783_REGCTRL_VRF1_STBY	(1 << 13)
+#define MC13783_REGCTRL_VRF1_MODE	(1 << 14)
+#define MC13783_REGCTRL_VRF2_EN		(1 << 15)
+#define MC13783_REGCTRL_VRF2_STBY	(1 << 16)
+#define MC13783_REGCTRL_VRF2_MODE	(1 << 17)
+#define MC13783_REGCTRL_VMMC1_EN	(1 << 18)
+#define MC13783_REGCTRL_VMMC1_STBY	(1 << 19)
+#define MC13783_REGCTRL_VMMC1_MODE	(1 << 20)
+#define MC13783_REGCTRL_VMMC2_EN	(1 << 21)
+#define MC13783_REGCTRL_VMMC2_STBY	(1 << 22)
+#define MC13783_REGCTRL_VMMC2_MODE	(1 << 23)
+
+/*
+ * Reg Regulator Misc.
+ */
+#define MC13783_REGCTRL_GPO1_EN		(1 << 6)
+#define MC13783_REGCTRL_GPO2_EN		(1 << 8)
+#define MC13783_REGCTRL_GPO3_EN		(1 << 10)
+#define MC13783_REGCTRL_GPO4_EN		(1 << 12)
+#define MC13783_REGCTRL_VIBPINCTRL	(1 << 14)
+
+/*
+ * Reg Switcher 4
+ */
+#define MC13783_SWCTRL_SW1A_MODE	(1 << 0)
+#define MC13783_SWCTRL_SW1A_STBY_MODE	(1 << 2)
+#define MC13783_SWCTRL_SW1A_DVS_SPEED	(1 << 6)
+#define MC13783_SWCTRL_SW1A_PANIC_MODE	(1 << 8)
+#define MC13783_SWCTRL_SW1A_SOFTSTART	(1 << 9)
+#define MC13783_SWCTRL_SW1B_MODE	(1 << 10)
+#define MC13783_SWCTRL_SW1B_STBY_MODE	(1 << 12)
+#define MC13783_SWCTRL_SW1B_DVS_SPEED	(1 << 14)
+#define MC13783_SWCTRL_SW1B_PANIC_MODE	(1 << 16)
+#define MC13783_SWCTRL_SW1B_SOFTSTART	(1 << 17)
+#define MC13783_SWCTRL_PLL_EN		(1 << 18)
+#define MC13783_SWCTRL_PLL_FACTOR	(1 << 19)
+
+/*
+ * Reg Switcher 5
+ */
+#define MC13783_SWCTRL_SW2A_MODE	(1 << 0)
+#define MC13783_SWCTRL_SW2A_STBY_MODE	(1 << 2)
+#define MC13783_SWCTRL_SW2A_DVS_SPEED	(1 << 6)
+#define MC13783_SWCTRL_SW2A_PANIC_MODE	(1 << 8)
+#define MC13783_SWCTRL_SW2A_SOFTSTART	(1 << 9)
+#define MC13783_SWCTRL_SW2B_MODE	(1 << 10)
+#define MC13783_SWCTRL_SW2B_STBY_MODE	(1 << 12)
+#define MC13783_SWCTRL_SW2B_DVS_SPEED	(1 << 14)
+#define MC13783_SWCTRL_SW2B_PANIC_MODE	(1 << 16)
+#define MC13783_SWCTRL_SW2B_SOFTSTART	(1 << 17)
+#define MC13783_SWSET_SW3		(1 << 18)
+#define MC13783_SWCTRL_SW3_EN		(1 << 20)
+#define MC13783_SWCTRL_SW3_STBY		(1 << 21)
+#define MC13783_SWCTRL_SW3_MODE		(1 << 22)
+
+/*
+ * ADC/Touch
+ */
+#define MC13783_ADC0_LICELLCON		(1 << 0)
+#define MC13783_ADC0_CHRGICON		(1 << 1)
+#define MC13783_ADC0_BATICON		(1 << 2)
+#define MC13783_ADC0_RTHEN 		(1 << 3)
+#define MC13783_ADC0_DTHEN		(1 << 4)
+#define MC13783_ADC0_UIDEN		(1 << 5)
+#define MC13783_ADC0_ADOUTEN 		(1 << 6)
+#define MC13783_ADC0_ADOUTPER		(1 << 7)
+#define MC13783_ADC0_ADREFEN		(1 << 10)
+#define MC13783_ADC0_ADREFMODE		(1 << 11)
+#define MC13783_ADC0_TSMOD0		(1 << 12)
+#define MC13783_ADC0_TSMOD1		(1 << 13)
+#define MC13783_ADC0_TSMOD2		(1 << 14)
+#define MC13783_ADC0_CHRGRAWDIV		(1 << 15)
+#define MC13783_ADC0_ADINC1		(1 << 16)
+#define MC13783_ADC0_ADINC2		(1 << 17)
+#define MC13783_ADC0_WCOMP		(1 << 18)
+#define MC13783_ADC0_ADCBIS0		(1 << 23)
+
+#define MC13783_ADC1_ADEN		(1 << 0)
+#define MC13783_ADC1_RAND		(1 << 1)
+#define MC13783_ADC1_ADSEL		(1 << 3)
+#define MC13783_ADC1_TRIGMASK		(1 << 4)
+#define MC13783_ADC1_ADA10		(1 << 5)
+#define MC13783_ADC1_ADA11		(1 << 6)
+#define MC13783_ADC1_ADA12		(1 << 7)
+#define MC13783_ADC1_ADA20		(1 << 8)
+#define MC13783_ADC1_ADA21		(1 << 9)
+#define MC13783_ADC1_ADA22		(1 << 10)
+#define MC13783_ADC1_ATO0		(1 << 11)
+#define MC13783_ADC1_ATO1		(1 << 12)
+#define MC13783_ADC1_ATO2		(1 << 13)
+#define MC13783_ADC1_ATO3		(1 << 14)
+#define MC13783_ADC1_ATO4		(1 << 15)
+#define MC13783_ADC1_ATO5		(1 << 16)
+#define MC13783_ADC1_ATO6		(1 << 17)
+#define MC13783_ADC1_ATO7		(1 << 18)
+#define MC13783_ADC1_ATOX		(1 << 19)
+#define MC13783_ADC1_ASC		(1 << 20)
+#define MC13783_ADC1_ADTRIGIGN		(1 << 21)
+#define MC13783_ADC1_ADONESHOT		(1 << 22)
+#define MC13783_ADC1_ADCBIS1		(1 << 23)
+
+#define MC13783_ADC1_CHAN0_SHIFT	5
+#define MC13783_ADC1_CHAN1_SHIFT	8
+
+#define MC13783_ADC2_ADD10		(1 << 2)
+#define MC13783_ADC2_ADD11		(1 << 3)
+#define MC13783_ADC2_ADD12		(1 << 4)
+#define MC13783_ADC2_ADD13		(1 << 5)
+#define MC13783_ADC2_ADD14		(1 << 6)
+#define MC13783_ADC2_ADD15		(1 << 7)
+#define MC13783_ADC2_ADD16		(1 << 8)
+#define MC13783_ADC2_ADD17		(1 << 9)
+#define MC13783_ADC2_ADD18		(1 << 10)
+#define MC13783_ADC2_ADD19		(1 << 11)
+#define MC13783_ADC2_ADD20		(1 << 14)
+#define MC13783_ADC2_ADD21		(1 << 15)
+#define MC13783_ADC2_ADD22		(1 << 16)
+#define MC13783_ADC2_ADD23		(1 << 17)
+#define MC13783_ADC2_ADD24		(1 << 18)
+#define MC13783_ADC2_ADD25		(1 << 19)
+#define MC13783_ADC2_ADD26		(1 << 20)
+#define MC13783_ADC2_ADD27		(1 << 21)
+#define MC13783_ADC2_ADD28		(1 << 22)
+#define MC13783_ADC2_ADD29		(1 << 23)
+
+#define MC13783_ADC3_WHIGH0		(1 << 0)
+#define MC13783_ADC3_WHIGH1		(1 << 1)
+#define MC13783_ADC3_WHIGH2		(1 << 2)
+#define MC13783_ADC3_WHIGH3		(1 << 3)
+#define MC13783_ADC3_WHIGH4		(1 << 4)
+#define MC13783_ADC3_WHIGH5		(1 << 5)
+#define MC13783_ADC3_ICID0		(1 << 6)
+#define MC13783_ADC3_ICID1		(1 << 7)
+#define MC13783_ADC3_ICID2		(1 << 8)
+#define MC13783_ADC3_WLOW0		(1 << 9)
+#define MC13783_ADC3_WLOW1		(1 << 10)
+#define MC13783_ADC3_WLOW2		(1 << 11)
+#define MC13783_ADC3_WLOW3		(1 << 12)
+#define MC13783_ADC3_WLOW4		(1 << 13)
+#define MC13783_ADC3_WLOW5		(1 << 14)
+#define MC13783_ADC3_ADCBIS2		(1 << 23)
+
+#define MC13783_ADC4_ADDBIS10		(1 << 2)
+#define MC13783_ADC4_ADDBIS11		(1 << 3)
+#define MC13783_ADC4_ADDBIS12		(1 << 4)
+#define MC13783_ADC4_ADDBIS13		(1 << 5)
+#define MC13783_ADC4_ADDBIS14		(1 << 6)
+#define MC13783_ADC4_ADDBIS15		(1 << 7)
+#define MC13783_ADC4_ADDBIS16		(1 << 8)
+#define MC13783_ADC4_ADDBIS17		(1 << 9)
+#define MC13783_ADC4_ADDBIS18		(1 << 10)
+#define MC13783_ADC4_ADDBIS19		(1 << 11)
+#define MC13783_ADC4_ADDBIS20		(1 << 14)
+#define MC13783_ADC4_ADDBIS21		(1 << 15)
+#define MC13783_ADC4_ADDBIS22		(1 << 16)
+#define MC13783_ADC4_ADDBIS23		(1 << 17)
+#define MC13783_ADC4_ADDBIS24		(1 << 18)
+#define MC13783_ADC4_ADDBIS25		(1 << 19)
+#define MC13783_ADC4_ADDBIS26		(1 << 20)
+#define MC13783_ADC4_ADDBIS27		(1 << 21)
+#define MC13783_ADC4_ADDBIS28		(1 << 22)
+#define MC13783_ADC4_ADDBIS29		(1 << 23)
+
+#endif /* __LINUX_MFD_MC13783_PRIV_H */
+
diff --git a/include/linux/mfd/mc13783.h b/include/linux/mfd/mc13783.h
new file mode 100644
index 000000000000..b3a2a7243573
--- /dev/null
+++ b/include/linux/mfd/mc13783.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2009 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
+ *
+ * Initial development of this code was funded by
+ * Phytec Messtechnik GmbH, http://www.phytec.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __INCLUDE_LINUX_MFD_MC13783_H
+#define __INCLUDE_LINUX_MFD_MC13783_H
+
+struct mc13783;
+struct regulator_init_data;
+
+struct mc13783_regulator_init_data {
+	int id;
+	struct regulator_init_data *init_data;
+};
+
+struct mc13783_platform_data {
+	struct mc13783_regulator_init_data *regulators;
+	int num_regulators;
+	unsigned int flags;
+};
+
+/* mc13783_platform_data flags */
+#define MC13783_USE_TOUCHSCREEN (1 << 0)
+#define MC13783_USE_CODEC	(1 << 1)
+#define MC13783_USE_ADC		(1 << 2)
+#define MC13783_USE_RTC		(1 << 3)
+#define MC13783_USE_REGULATOR	(1 << 4)
+
+int mc13783_adc_do_conversion(struct mc13783 *mc13783, unsigned int mode,
+		unsigned int channel, unsigned int *sample);
+
+void mc13783_adc_set_ts_status(struct mc13783 *mc13783, unsigned int status);
+
+#define	MC13783_SW_SW1A		0
+#define	MC13783_SW_SW1B		1
+#define	MC13783_SW_SW2A		2
+#define	MC13783_SW_SW2B		3
+#define	MC13783_SW_SW3		4
+#define	MC13783_SW_PLL		5
+#define	MC13783_REGU_VAUDIO	6
+#define	MC13783_REGU_VIOHI	7
+#define	MC13783_REGU_VIOLO	8
+#define	MC13783_REGU_VDIG	9
+#define	MC13783_REGU_VGEN	10
+#define	MC13783_REGU_VRFDIG	11
+#define	MC13783_REGU_VRFREF	12
+#define	MC13783_REGU_VRFCP	13
+#define	MC13783_REGU_VSIM	14
+#define	MC13783_REGU_VESIM	15
+#define	MC13783_REGU_VCAM	16
+#define	MC13783_REGU_VRFBG	17
+#define	MC13783_REGU_VVIB	18
+#define	MC13783_REGU_VRF1	19
+#define	MC13783_REGU_VRF2	20
+#define	MC13783_REGU_VMMC1	21
+#define	MC13783_REGU_VMMC2	22
+#define	MC13783_REGU_GPO1	23
+#define	MC13783_REGU_GPO2	24
+#define	MC13783_REGU_GPO3	25
+#define	MC13783_REGU_GPO4	26
+#define	MC13783_REGU_V1		27
+#define	MC13783_REGU_V2		28
+#define	MC13783_REGU_V3		29
+#define	MC13783_REGU_V4		30
+
+#endif /* __INCLUDE_LINUX_MFD_MC13783_H */
+
-- 
cgit v1.2.3-71-gd317


From ebf0bd366ed8161e6fbc919705d878ccbfd51624 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@verdurent.com>
Date: Mon, 31 Aug 2009 18:32:18 +0200
Subject: mfd: Add support for TWL4030/5030 dynamic power switching

The TWL4030/5030 family of multifunction devices allows board-specific
control of the the various regulators, clock and reset lines through
'scripts' that are loaded into its memory. This allows for Dynamic Power
Switching (DPS).

Implement board-independent core support for DPS that is then used by
board-specific code to load custom DPS scripts.

Signed-off-by: Amit Kucheria <amit.kucheria@verdurent.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/Kconfig         |  13 ++
 drivers/mfd/Makefile        |   1 +
 drivers/mfd/twl4030-core.c  |  10 +
 drivers/mfd/twl4030-power.c | 466 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c/twl4030.h |  94 ++++++++-
 5 files changed, 574 insertions(+), 10 deletions(-)
 create mode 100644 drivers/mfd/twl4030-power.c

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 4fd4f913c36b..570be139f9df 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -108,6 +108,19 @@ config TWL4030_CORE
 	  high speed USB OTG transceiver, an audio codec (on most
 	  versions) and many other features.
 
+config TWL4030_POWER
+	bool "Support power resources on TWL4030 family chips"
+	depends on TWL4030_CORE && ARM
+	help
+	  Say yes here if you want to use the power resources on the
+	  TWL4030 family chips.  Most of these resources are regulators,
+	  which have a separate driver; some are control signals, such
+	  as clock request handshaking.
+
+	  This driver uses board-specific data to initialize the resources
+	  and load scripts controling which resources are switched off/on
+	  or reset when a sleep, wakeup or warm reset event occurs.
+
 config MFD_TMIO
 	bool
 	default n
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 6a5d8cb545fc..f3b277b90d40 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_TPS65010)		+= tps65010.o
 obj-$(CONFIG_MENELAUS)		+= menelaus.o
 
 obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o twl4030-irq.o
+obj-$(CONFIG_TWL4030_POWER)    += twl4030-power.o
 
 obj-$(CONFIG_MFD_MC13783)	+= mc13783-core.o
 
diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c
index 1fd2620819d0..e424cf6d8e9e 100644
--- a/drivers/mfd/twl4030-core.c
+++ b/drivers/mfd/twl4030-core.c
@@ -89,6 +89,12 @@
 #define twl_has_madc()	false
 #endif
 
+#ifdef CONFIG_TWL4030_POWER
+#define twl_has_power()        true
+#else
+#define twl_has_power()        false
+#endif
+
 #if defined(CONFIG_RTC_DRV_TWL4030) || defined(CONFIG_RTC_DRV_TWL4030_MODULE)
 #define twl_has_rtc()	true
 #else
@@ -801,6 +807,10 @@ twl4030_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	/* setup clock framework */
 	clocks_init(&client->dev);
 
+	/* load power event scripts */
+	if (twl_has_power() && pdata->power)
+		twl4030_power_init(pdata->power);
+
 	/* Maybe init the T2 Interrupt subsystem */
 	if (client->irq
 			&& pdata->irq_base
diff --git a/drivers/mfd/twl4030-power.c b/drivers/mfd/twl4030-power.c
new file mode 100644
index 000000000000..e7688b041264
--- /dev/null
+++ b/drivers/mfd/twl4030-power.c
@@ -0,0 +1,466 @@
+/*
+ * linux/drivers/i2c/chips/twl4030-power.c
+ *
+ * Handle TWL4030 Power initialization
+ *
+ * Copyright (C) 2008 Nokia Corporation
+ * Copyright (C) 2006 Texas Instruments, Inc
+ *
+ * Written by 	Kalle Jokiniemi
+ *		Peter De Schrijver <peter.de-schrijver@nokia.com>
+ * Several fixes by Amit Kucheria <amit.kucheria@verdurent.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file "COPYING" in the main directory of this
+ * archive for more details.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/pm.h>
+#include <linux/i2c/twl4030.h>
+#include <linux/platform_device.h>
+
+#include <asm/mach-types.h>
+
+static u8 twl4030_start_script_address = 0x2b;
+
+#define PWR_P1_SW_EVENTS	0x10
+#define PWR_DEVOFF	(1<<0)
+
+#define PHY_TO_OFF_PM_MASTER(p)		(p - 0x36)
+#define PHY_TO_OFF_PM_RECEIVER(p)	(p - 0x5b)
+
+/* resource - hfclk */
+#define R_HFCLKOUT_DEV_GRP 	PHY_TO_OFF_PM_RECEIVER(0xe6)
+
+/* PM events */
+#define R_P1_SW_EVENTS		PHY_TO_OFF_PM_MASTER(0x46)
+#define R_P2_SW_EVENTS		PHY_TO_OFF_PM_MASTER(0x47)
+#define R_P3_SW_EVENTS		PHY_TO_OFF_PM_MASTER(0x48)
+#define R_CFG_P1_TRANSITION	PHY_TO_OFF_PM_MASTER(0x36)
+#define R_CFG_P2_TRANSITION	PHY_TO_OFF_PM_MASTER(0x37)
+#define R_CFG_P3_TRANSITION	PHY_TO_OFF_PM_MASTER(0x38)
+
+#define LVL_WAKEUP	0x08
+
+#define ENABLE_WARMRESET (1<<4)
+
+#define END_OF_SCRIPT		0x3f
+
+#define R_SEQ_ADD_A2S		PHY_TO_OFF_PM_MASTER(0x55)
+#define R_SEQ_ADD_S2A12		PHY_TO_OFF_PM_MASTER(0x56)
+#define	R_SEQ_ADD_S2A3		PHY_TO_OFF_PM_MASTER(0x57)
+#define	R_SEQ_ADD_WARM		PHY_TO_OFF_PM_MASTER(0x58)
+#define R_MEMORY_ADDRESS	PHY_TO_OFF_PM_MASTER(0x59)
+#define R_MEMORY_DATA		PHY_TO_OFF_PM_MASTER(0x5a)
+
+#define R_PROTECT_KEY		0x0E
+#define KEY_1			0xC0
+#define KEY_2			0x0C
+
+/* resource configuration registers */
+
+#define DEVGROUP_OFFSET		0
+#define TYPE_OFFSET		1
+
+/* Bit positions */
+#define DEVGROUP_SHIFT		5
+#define DEVGROUP_MASK		(7 << DEVGROUP_SHIFT)
+#define TYPE_SHIFT		0
+#define TYPE_MASK		(7 << TYPE_SHIFT)
+#define TYPE2_SHIFT		3
+#define TYPE2_MASK		(3 << TYPE2_SHIFT)
+
+static u8 res_config_addrs[] = {
+	[RES_VAUX1]	= 0x17,
+	[RES_VAUX2]	= 0x1b,
+	[RES_VAUX3]	= 0x1f,
+	[RES_VAUX4]	= 0x23,
+	[RES_VMMC1]	= 0x27,
+	[RES_VMMC2]	= 0x2b,
+	[RES_VPLL1]	= 0x2f,
+	[RES_VPLL2]	= 0x33,
+	[RES_VSIM]	= 0x37,
+	[RES_VDAC]	= 0x3b,
+	[RES_VINTANA1]	= 0x3f,
+	[RES_VINTANA2]	= 0x43,
+	[RES_VINTDIG]	= 0x47,
+	[RES_VIO]	= 0x4b,
+	[RES_VDD1]	= 0x55,
+	[RES_VDD2]	= 0x63,
+	[RES_VUSB_1V5]	= 0x71,
+	[RES_VUSB_1V8]	= 0x74,
+	[RES_VUSB_3V1]	= 0x77,
+	[RES_VUSBCP]	= 0x7a,
+	[RES_REGEN]	= 0x7f,
+	[RES_NRES_PWRON] = 0x82,
+	[RES_CLKEN]	= 0x85,
+	[RES_SYSEN]	= 0x88,
+	[RES_HFCLKOUT]	= 0x8b,
+	[RES_32KCLKOUT]	= 0x8e,
+	[RES_RESET]	= 0x91,
+	[RES_Main_Ref]	= 0x94,
+};
+
+static int __init twl4030_write_script_byte(u8 address, u8 byte)
+{
+	int err;
+
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, address,
+				R_MEMORY_ADDRESS);
+	if (err)
+		goto out;
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, byte,
+				R_MEMORY_DATA);
+out:
+	return err;
+}
+
+static int __init twl4030_write_script_ins(u8 address, u16 pmb_message,
+					   u8 delay, u8 next)
+{
+	int err;
+
+	address *= 4;
+	err = twl4030_write_script_byte(address++, pmb_message >> 8);
+	if (err)
+		goto out;
+	err = twl4030_write_script_byte(address++, pmb_message & 0xff);
+	if (err)
+		goto out;
+	err = twl4030_write_script_byte(address++, delay);
+	if (err)
+		goto out;
+	err = twl4030_write_script_byte(address++, next);
+out:
+	return err;
+}
+
+static int __init twl4030_write_script(u8 address, struct twl4030_ins *script,
+				       int len)
+{
+	int err;
+
+	for (; len; len--, address++, script++) {
+		if (len == 1) {
+			err = twl4030_write_script_ins(address,
+						script->pmb_message,
+						script->delay,
+						END_OF_SCRIPT);
+			if (err)
+				break;
+		} else {
+			err = twl4030_write_script_ins(address,
+						script->pmb_message,
+						script->delay,
+						address + 1);
+			if (err)
+				break;
+		}
+	}
+	return err;
+}
+
+static int __init twl4030_config_wakeup3_sequence(u8 address)
+{
+	int err;
+	u8 data;
+
+	/* Set SLEEP to ACTIVE SEQ address for P3 */
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, address,
+				R_SEQ_ADD_S2A3);
+	if (err)
+		goto out;
+
+	/* P3 LVL_WAKEUP should be on LEVEL */
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &data,
+				R_P3_SW_EVENTS);
+	if (err)
+		goto out;
+	data |= LVL_WAKEUP;
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, data,
+				R_P3_SW_EVENTS);
+out:
+	if (err)
+		pr_err("TWL4030 wakeup sequence for P3 config error\n");
+	return err;
+}
+
+static int __init twl4030_config_wakeup12_sequence(u8 address)
+{
+	int err = 0;
+	u8 data;
+
+	/* Set SLEEP to ACTIVE SEQ address for P1 and P2 */
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, address,
+				R_SEQ_ADD_S2A12);
+	if (err)
+		goto out;
+
+	/* P1/P2 LVL_WAKEUP should be on LEVEL */
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &data,
+				R_P1_SW_EVENTS);
+	if (err)
+		goto out;
+
+	data |= LVL_WAKEUP;
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, data,
+				R_P1_SW_EVENTS);
+	if (err)
+		goto out;
+
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &data,
+				R_P2_SW_EVENTS);
+	if (err)
+		goto out;
+
+	data |= LVL_WAKEUP;
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, data,
+				R_P2_SW_EVENTS);
+	if (err)
+		goto out;
+
+	if (machine_is_omap_3430sdp() || machine_is_omap_ldp()) {
+		/* Disabling AC charger effect on sleep-active transitions */
+		err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &data,
+					R_CFG_P1_TRANSITION);
+		if (err)
+			goto out;
+		data &= ~(1<<1);
+		err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, data ,
+					R_CFG_P1_TRANSITION);
+		if (err)
+			goto out;
+	}
+
+out:
+	if (err)
+		pr_err("TWL4030 wakeup sequence for P1 and P2" \
+			"config error\n");
+	return err;
+}
+
+static int __init twl4030_config_sleep_sequence(u8 address)
+{
+	int err;
+
+	/* Set ACTIVE to SLEEP SEQ address in T2 memory*/
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, address,
+				R_SEQ_ADD_A2S);
+
+	if (err)
+		pr_err("TWL4030 sleep sequence config error\n");
+
+	return err;
+}
+
+static int __init twl4030_config_warmreset_sequence(u8 address)
+{
+	int err;
+	u8 rd_data;
+
+	/* Set WARM RESET SEQ address for P1 */
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, address,
+				R_SEQ_ADD_WARM);
+	if (err)
+		goto out;
+
+	/* P1/P2/P3 enable WARMRESET */
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &rd_data,
+				R_P1_SW_EVENTS);
+	if (err)
+		goto out;
+
+	rd_data |= ENABLE_WARMRESET;
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, rd_data,
+				R_P1_SW_EVENTS);
+	if (err)
+		goto out;
+
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &rd_data,
+				R_P2_SW_EVENTS);
+	if (err)
+		goto out;
+
+	rd_data |= ENABLE_WARMRESET;
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, rd_data,
+				R_P2_SW_EVENTS);
+	if (err)
+		goto out;
+
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &rd_data,
+				R_P3_SW_EVENTS);
+	if (err)
+		goto out;
+
+	rd_data |= ENABLE_WARMRESET;
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, rd_data,
+				R_P3_SW_EVENTS);
+out:
+	if (err)
+		pr_err("TWL4030 warmreset seq config error\n");
+	return err;
+}
+
+static int __init twl4030_configure_resource(struct twl4030_resconfig *rconfig)
+{
+	int rconfig_addr;
+	int err;
+	u8 type;
+	u8 grp;
+
+	if (rconfig->resource > TOTAL_RESOURCES) {
+		pr_err("TWL4030 Resource %d does not exist\n",
+			rconfig->resource);
+		return -EINVAL;
+	}
+
+	rconfig_addr = res_config_addrs[rconfig->resource];
+
+	/* Set resource group */
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_RECEIVER, &grp,
+				rconfig_addr + DEVGROUP_OFFSET);
+	if (err) {
+		pr_err("TWL4030 Resource %d group could not be read\n",
+			rconfig->resource);
+		return err;
+	}
+
+	if (rconfig->devgroup >= 0) {
+		grp &= ~DEVGROUP_MASK;
+		grp |= rconfig->devgroup << DEVGROUP_SHIFT;
+		err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_RECEIVER,
+					grp, rconfig_addr + DEVGROUP_OFFSET);
+		if (err < 0) {
+			pr_err("TWL4030 failed to program devgroup\n");
+			return err;
+		}
+	}
+
+	/* Set resource types */
+	err = twl4030_i2c_read_u8(TWL4030_MODULE_PM_RECEIVER, &type,
+				rconfig_addr + TYPE_OFFSET);
+	if (err < 0) {
+		pr_err("TWL4030 Resource %d type could not be read\n",
+			rconfig->resource);
+		return err;
+	}
+
+	if (rconfig->type >= 0) {
+		type &= ~TYPE_MASK;
+		type |= rconfig->type << TYPE_SHIFT;
+	}
+
+	if (rconfig->type2 >= 0) {
+		type &= ~TYPE2_MASK;
+		type |= rconfig->type2 << TYPE2_SHIFT;
+	}
+
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_RECEIVER,
+				type, rconfig_addr + TYPE_OFFSET);
+	if (err < 0) {
+		pr_err("TWL4030 failed to program resource type\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static int __init load_twl4030_script(struct twl4030_script *tscript,
+	       u8 address)
+{
+	int err;
+
+	/* Make sure the script isn't going beyond last valid address (0x3f) */
+	if ((address + tscript->size) > END_OF_SCRIPT) {
+		pr_err("TWL4030 scripts too big error\n");
+		return -EINVAL;
+	}
+
+	err = twl4030_write_script(address, tscript->script, tscript->size);
+	if (err)
+		goto out;
+
+	if (tscript->flags & TWL4030_WRST_SCRIPT) {
+		err = twl4030_config_warmreset_sequence(address);
+		if (err)
+			goto out;
+	}
+	if (tscript->flags & TWL4030_WAKEUP12_SCRIPT) {
+		err = twl4030_config_wakeup12_sequence(address);
+		if (err)
+			goto out;
+	}
+	if (tscript->flags & TWL4030_WAKEUP3_SCRIPT) {
+		err = twl4030_config_wakeup3_sequence(address);
+		if (err)
+			goto out;
+	}
+	if (tscript->flags & TWL4030_SLEEP_SCRIPT)
+		err = twl4030_config_sleep_sequence(address);
+out:
+	return err;
+}
+
+void __init twl4030_power_init(struct twl4030_power_data *twl4030_scripts)
+{
+	int err = 0;
+	int i;
+	struct twl4030_resconfig *resconfig;
+	u8 address = twl4030_start_script_address;
+
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, KEY_1,
+				R_PROTECT_KEY);
+	if (err)
+		goto unlock;
+
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, KEY_2,
+				R_PROTECT_KEY);
+	if (err)
+		goto unlock;
+
+	for (i = 0; i < twl4030_scripts->num; i++) {
+		err = load_twl4030_script(twl4030_scripts->scripts[i], address);
+		if (err)
+			goto load;
+		address += twl4030_scripts->scripts[i]->size;
+	}
+
+	resconfig = twl4030_scripts->resource_config;
+	if (resconfig) {
+		while (resconfig->resource) {
+			err = twl4030_configure_resource(resconfig);
+			if (err)
+				goto resource;
+			resconfig++;
+
+		}
+	}
+
+	err = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, 0, R_PROTECT_KEY);
+	if (err)
+		pr_err("TWL4030 Unable to relock registers\n");
+	return;
+
+unlock:
+	if (err)
+		pr_err("TWL4030 Unable to unlock registers\n");
+	return;
+load:
+	if (err)
+		pr_err("TWL4030 failed to load scripts\n");
+	return;
+resource:
+	if (err)
+		pr_err("TWL4030 failed to configure resource\n");
+	return;
+}
diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
index 3fd21d7cb6bf..2d02dfd7076c 100644
--- a/include/linux/i2c/twl4030.h
+++ b/include/linux/i2c/twl4030.h
@@ -223,19 +223,28 @@ int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes);
 
 /* Power bus message definitions */
 
-#define DEV_GRP_NULL		0x0
-#define DEV_GRP_P1		0x1
-#define DEV_GRP_P2		0x2
-#define DEV_GRP_P3		0x4
+/* The TWL4030/5030 splits its power-management resources (the various
+ * regulators, clock and reset lines) into 3 processor groups - P1, P2 and
+ * P3. These groups can then be configured to transition between sleep, wait-on
+ * and active states by sending messages to the power bus.  See Section 5.4.2
+ * Power Resources of TWL4030 TRM
+ */
 
-#define RES_GRP_RES		0x0
-#define RES_GRP_PP		0x1
-#define RES_GRP_RC		0x2
+/* Processor groups */
+#define DEV_GRP_NULL		0x0
+#define DEV_GRP_P1		0x1	/* P1: all OMAP devices */
+#define DEV_GRP_P2		0x2	/* P2: all Modem devices */
+#define DEV_GRP_P3		0x4	/* P3: all peripheral devices */
+
+/* Resource groups */
+#define RES_GRP_RES		0x0	/* Reserved */
+#define RES_GRP_PP		0x1	/* Power providers */
+#define RES_GRP_RC		0x2	/* Reset and control */
 #define RES_GRP_PP_RC		0x3
-#define RES_GRP_PR		0x4
+#define RES_GRP_PR		0x4	/* Power references */
 #define RES_GRP_PP_PR		0x5
 #define RES_GRP_RC_PR		0x6
-#define RES_GRP_ALL		0x7
+#define RES_GRP_ALL		0x7	/* All resource groups */
 
 #define RES_TYPE2_R0		0x0
 
@@ -246,6 +255,41 @@ int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes);
 #define RES_STATE_SLEEP		0x8
 #define RES_STATE_OFF		0x0
 
+/* Power resources */
+
+/* Power providers */
+#define RES_VAUX1               1
+#define RES_VAUX2               2
+#define RES_VAUX3               3
+#define RES_VAUX4               4
+#define RES_VMMC1               5
+#define RES_VMMC2               6
+#define RES_VPLL1               7
+#define RES_VPLL2               8
+#define RES_VSIM                9
+#define RES_VDAC                10
+#define RES_VINTANA1            11
+#define RES_VINTANA2            12
+#define RES_VINTDIG             13
+#define RES_VIO                 14
+#define RES_VDD1                15
+#define RES_VDD2                16
+#define RES_VUSB_1V5            17
+#define RES_VUSB_1V8            18
+#define RES_VUSB_3V1            19
+#define RES_VUSBCP              20
+#define RES_REGEN               21
+/* Reset and control */
+#define RES_NRES_PWRON          22
+#define RES_CLKEN               23
+#define RES_SYSEN               24
+#define RES_HFCLKOUT            25
+#define RES_32KCLKOUT           26
+#define RES_RESET               27
+/* Power Reference */
+#define RES_Main_Ref            28
+
+#define TOTAL_RESOURCES		28
 /*
  * Power Bus Message Format ... these can be sent individually by Linux,
  * but are usually part of downloaded scripts that are run when various
@@ -327,6 +371,36 @@ struct twl4030_usb_data {
 	enum twl4030_usb_mode	usb_mode;
 };
 
+struct twl4030_ins {
+	u16 pmb_message;
+	u8 delay;
+};
+
+struct twl4030_script {
+	struct twl4030_ins *script;
+	unsigned size;
+	u8 flags;
+#define TWL4030_WRST_SCRIPT	(1<<0)
+#define TWL4030_WAKEUP12_SCRIPT	(1<<1)
+#define TWL4030_WAKEUP3_SCRIPT	(1<<2)
+#define TWL4030_SLEEP_SCRIPT	(1<<3)
+};
+
+struct twl4030_resconfig {
+	u8 resource;
+	u8 devgroup;	/* Processor group that Power resource belongs to */
+	u8 type;	/* Power resource addressed, 6 / broadcast message */
+	u8 type2;	/* Power resource addressed, 3 / broadcast message */
+};
+
+struct twl4030_power_data {
+	struct twl4030_script **scripts;
+	unsigned num;
+	struct twl4030_resconfig *resource_config;
+};
+
+extern void twl4030_power_init(struct twl4030_power_data *triton2_scripts);
+
 struct twl4030_platform_data {
 	unsigned				irq_base, irq_end;
 	struct twl4030_bci_platform_data	*bci;
@@ -334,6 +408,7 @@ struct twl4030_platform_data {
 	struct twl4030_madc_platform_data	*madc;
 	struct twl4030_keypad_data		*keypad;
 	struct twl4030_usb_data			*usb;
+	struct twl4030_power_data		*power;
 
 	/* LDO regulators */
 	struct regulator_init_data		*vdac;
@@ -364,7 +439,6 @@ int twl4030_sih_setup(int module);
 #define TWL4030_VAUX3_DEV_GRP		0x1F
 #define TWL4030_VAUX3_DEDICATED		0x22
 
-
 #if defined(CONFIG_TWL4030_BCI_BATTERY) || \
 	defined(CONFIG_TWL4030_BCI_BATTERY_MODULE)
 	extern int twl4030charger_usb_en(int enable);
-- 
cgit v1.2.3-71-gd317


From 8aba721b23917bc6d374ad42bf80bde5058710e2 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Thu, 27 Aug 2009 20:49:08 +0200
Subject: mfd: Fix ab3100-otp build failure

ab3100.h should include linux/workqueue.h for otp to build properly.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/ab3100-core.c  | 1 -
 include/linux/mfd/ab3100.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
index 1d8ac1a1e304..a848df77514a 100644
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -14,7 +14,6 @@
 #include <linux/platform_device.h>
 #include <linux/device.h>
 #include <linux/interrupt.h>
-#include <linux/workqueue.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
diff --git a/include/linux/mfd/ab3100.h b/include/linux/mfd/ab3100.h
index 56343b8013b5..3ead5cfb638c 100644
--- a/include/linux/mfd/ab3100.h
+++ b/include/linux/mfd/ab3100.h
@@ -6,6 +6,7 @@
  */
 
 #include <linux/device.h>
+#include <linux/workqueue.h>
 
 #ifndef MFD_AB3100_H
 #define MFD_AB3100_H
-- 
cgit v1.2.3-71-gd317


From d619bc143e311a738113dbbe7792bd032403939f Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Wed, 9 Sep 2009 11:31:00 +0200
Subject: regulator: AB3100 support

This adds support for the regulators found in the AB3100
Mixed-Signal IC.

It further also defines platform data for the ST-Ericsson
U300 platform and extends the AB3100 MFD driver so that
platform/board data with regulation constraints and an init
function can be passed down all the way from the board to
the regulators.

Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/ab3100-core.c  |   4 +
 drivers/regulator/Kconfig  |   9 +
 drivers/regulator/Makefile |   1 +
 drivers/regulator/ab3100.c | 694 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/ab3100.h |  28 ++
 5 files changed, 736 insertions(+)
 create mode 100644 drivers/regulator/ab3100.c

(limited to 'include/linux')

diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
index a848df77514a..c533f86ff5ea 100644
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -837,6 +837,8 @@ static int __init ab3100_probe(struct i2c_client *client,
 			const struct i2c_device_id *id)
 {
 	struct ab3100 *ab3100;
+	struct ab3100_platform_data *ab3100_plf_data =
+		client->dev.platform_data;
 	int err;
 	int i;
 
@@ -920,6 +922,8 @@ static int __init ab3100_probe(struct i2c_client *client,
 	for (i = 0; i < ARRAY_SIZE(ab3100_platform_devs); i++) {
 		ab3100_platform_devs[i]->dev.parent =
 			&client->dev;
+		ab3100_platform_devs[i]->dev.platform_data =
+			ab3100_plf_data;
 		platform_set_drvdata(ab3100_platform_devs[i], ab3100);
 	}
 
diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index c505714b0a98..2dc42bbf6fe9 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -138,5 +138,14 @@ config REGULATOR_MC13783
 	  Say y here to support the regulators found on the Freescale MC13783
 	  PMIC.
 
+config REGULATOR_AB3100
+	tristate "ST-Ericsson AB3100 Regulator functions"
+	depends on AB3100_CORE
+	default y if AB3100_CORE
+	help
+	 These regulators correspond to functionality in the
+	 AB3100 analog baseband dealing with power regulators
+	 for the system.
+
 endif
 
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 5034830a395e..768b3316d6eb 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -21,5 +21,6 @@ obj-$(CONFIG_REGULATOR_DA903X)	+= da903x.o
 obj-$(CONFIG_REGULATOR_PCF50633) += pcf50633-regulator.o
 obj-$(CONFIG_REGULATOR_PCAP) += pcap-regulator.o
 obj-$(CONFIG_REGULATOR_MC13783) += mc13783.o
+obj-$(CONFIG_REGULATOR_AB3100) += ab3100.o
 
 ccflags-$(CONFIG_REGULATOR_DEBUG) += -DDEBUG
diff --git a/drivers/regulator/ab3100.c b/drivers/regulator/ab3100.c
new file mode 100644
index 000000000000..156979d1f41e
--- /dev/null
+++ b/drivers/regulator/ab3100.c
@@ -0,0 +1,694 @@
+/*
+ * drivers/regulator/ab3100.c
+ *
+ * Copyright (C) 2008-2009 ST-Ericsson AB
+ * License terms: GNU General Public License (GPL) version 2
+ * Low-level control of the AB3100 IC Low Dropout (LDO)
+ * regulators, external regulator and buck converter
+ * Author: Mattias Wallin <mattias.wallin@stericsson.com>
+ * Author: Linus Walleij <linus.walleij@stericsson.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/driver.h>
+#include <linux/mfd/ab3100.h>
+
+/* LDO registers and some handy masking definitions for AB3100 */
+#define AB3100_LDO_A		0x40
+#define AB3100_LDO_C		0x41
+#define AB3100_LDO_D		0x42
+#define AB3100_LDO_E		0x43
+#define AB3100_LDO_E_SLEEP	0x44
+#define AB3100_LDO_F		0x45
+#define AB3100_LDO_G		0x46
+#define AB3100_LDO_H		0x47
+#define AB3100_LDO_H_SLEEP_MODE	0
+#define AB3100_LDO_H_SLEEP_EN	2
+#define AB3100_LDO_ON		4
+#define AB3100_LDO_H_VSEL_AC	5
+#define AB3100_LDO_K		0x48
+#define AB3100_LDO_EXT		0x49
+#define AB3100_BUCK		0x4A
+#define AB3100_BUCK_SLEEP	0x4B
+#define AB3100_REG_ON_MASK	0x10
+
+/**
+ * struct ab3100_regulator
+ * A struct passed around the individual regulator functions
+ * @platform_device: platform device holding this regulator
+ * @ab3100: handle to the AB3100 parent chip
+ * @plfdata: AB3100 platform data passed in at probe time
+ * @regreg: regulator register number in the AB3100
+ * @fixed_voltage: a fixed voltage for this regulator, if this
+ *          0 the voltages array is used instead.
+ * @typ_voltages: an array of available typical voltages for
+ *          this regulator
+ * @voltages_len: length of the array of available voltages
+ */
+struct ab3100_regulator {
+	struct regulator_dev *rdev;
+	struct ab3100 *ab3100;
+	struct ab3100_platform_data *plfdata;
+	u8 regreg;
+	int fixed_voltage;
+	int const *typ_voltages;
+	u8 voltages_len;
+};
+
+/* The order in which registers are initialized */
+static const u8 ab3100_reg_init_order[AB3100_NUM_REGULATORS+2] = {
+	AB3100_LDO_A,
+	AB3100_LDO_C,
+	AB3100_LDO_E,
+	AB3100_LDO_E_SLEEP,
+	AB3100_LDO_F,
+	AB3100_LDO_G,
+	AB3100_LDO_H,
+	AB3100_LDO_K,
+	AB3100_LDO_EXT,
+	AB3100_BUCK,
+	AB3100_BUCK_SLEEP,
+	AB3100_LDO_D,
+};
+
+/* Preset (hardware defined) voltages for these regulators */
+#define LDO_A_VOLTAGE 2750000
+#define LDO_C_VOLTAGE 2650000
+#define LDO_D_VOLTAGE 2650000
+
+static const int const ldo_e_buck_typ_voltages[] = {
+	1800000,
+	1400000,
+	1300000,
+	1200000,
+	1100000,
+	1050000,
+	900000,
+};
+
+static const int const ldo_f_typ_voltages[] = {
+	1800000,
+	1400000,
+	1300000,
+	1200000,
+	1100000,
+	1050000,
+	2500000,
+	2650000,
+};
+
+static const int const ldo_g_typ_voltages[] = {
+	2850000,
+	2750000,
+	1800000,
+	1500000,
+};
+
+static const int const ldo_h_typ_voltages[] = {
+	2750000,
+	1800000,
+	1500000,
+	1200000,
+};
+
+static const int const ldo_k_typ_voltages[] = {
+	2750000,
+	1800000,
+};
+
+
+/* The regulator devices */
+static struct ab3100_regulator
+ab3100_regulators[AB3100_NUM_REGULATORS] = {
+	{
+		.regreg = AB3100_LDO_A,
+		.fixed_voltage = LDO_A_VOLTAGE,
+	},
+	{
+		.regreg = AB3100_LDO_C,
+		.fixed_voltage = LDO_C_VOLTAGE,
+	},
+	{
+		.regreg = AB3100_LDO_D,
+		.fixed_voltage = LDO_D_VOLTAGE,
+	},
+	{
+		.regreg = AB3100_LDO_E,
+		.typ_voltages = ldo_e_buck_typ_voltages,
+		.voltages_len = ARRAY_SIZE(ldo_e_buck_typ_voltages),
+	},
+	{
+		.regreg = AB3100_LDO_F,
+		.typ_voltages = ldo_f_typ_voltages,
+		.voltages_len = ARRAY_SIZE(ldo_f_typ_voltages),
+	},
+	{
+		.regreg = AB3100_LDO_G,
+		.typ_voltages = ldo_g_typ_voltages,
+		.voltages_len = ARRAY_SIZE(ldo_g_typ_voltages),
+	},
+	{
+		.regreg = AB3100_LDO_H,
+		.typ_voltages = ldo_h_typ_voltages,
+		.voltages_len = ARRAY_SIZE(ldo_h_typ_voltages),
+	},
+	{
+		.regreg = AB3100_LDO_K,
+		.typ_voltages = ldo_k_typ_voltages,
+		.voltages_len = ARRAY_SIZE(ldo_k_typ_voltages),
+	},
+	{
+		.regreg = AB3100_LDO_EXT,
+		/* No voltages for the external regulator */
+	},
+	{
+		.regreg = AB3100_BUCK,
+		.typ_voltages = ldo_e_buck_typ_voltages,
+		.voltages_len = ARRAY_SIZE(ldo_e_buck_typ_voltages),
+	},
+};
+
+/*
+ * General functions for enable, disable and is_enabled used for
+ * LDO: A,C,E,F,G,H,K,EXT and BUCK
+ */
+static int ab3100_enable_regulator(struct regulator_dev *reg)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+	int err;
+	u8 regval;
+
+	err = ab3100_get_register_interruptible(abreg->ab3100, abreg->regreg,
+						&regval);
+	if (err) {
+		dev_warn(&reg->dev, "failed to get regid %d value\n",
+			 abreg->regreg);
+		return err;
+	}
+
+	/* The regulator is already on, no reason to go further */
+	if (regval & AB3100_REG_ON_MASK)
+		return 0;
+
+	regval |= AB3100_REG_ON_MASK;
+
+	err = ab3100_set_register_interruptible(abreg->ab3100, abreg->regreg,
+						regval);
+	if (err) {
+		dev_warn(&reg->dev, "failed to set regid %d value\n",
+			 abreg->regreg);
+		return err;
+	}
+
+	/* Per-regulator power on delay from spec */
+	switch (abreg->regreg) {
+	case AB3100_LDO_A: /* Fallthrough */
+	case AB3100_LDO_C: /* Fallthrough */
+	case AB3100_LDO_D: /* Fallthrough */
+	case AB3100_LDO_E: /* Fallthrough */
+	case AB3100_LDO_H: /* Fallthrough */
+	case AB3100_LDO_K:
+		udelay(200);
+		break;
+	case AB3100_LDO_F:
+		udelay(600);
+		break;
+	case AB3100_LDO_G:
+		udelay(400);
+		break;
+	case AB3100_BUCK:
+		mdelay(1);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int ab3100_disable_regulator(struct regulator_dev *reg)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+	int err;
+	u8 regval;
+
+	/*
+	 * LDO D is a special regulator. When it is disabled, the entire
+	 * system is shut down. So this is handled specially.
+	 */
+	if (abreg->regreg == AB3100_LDO_D) {
+		int i;
+
+		dev_info(&reg->dev, "disabling LDO D - shut down system\n");
+		/*
+		 * Set regulators to default values, ignore any errors,
+		 * we're going DOWN
+		 */
+		for (i = 0; i < ARRAY_SIZE(ab3100_reg_init_order); i++) {
+			(void) ab3100_set_register_interruptible(abreg->ab3100,
+					ab3100_reg_init_order[i],
+					abreg->plfdata->reg_initvals[i]);
+		}
+
+		/* Setting LDO D to 0x00 cuts the power to the SoC */
+		return ab3100_set_register_interruptible(abreg->ab3100,
+							 AB3100_LDO_D, 0x00U);
+
+	}
+
+	/*
+	 * All other regulators are handled here
+	 */
+	err = ab3100_get_register_interruptible(abreg->ab3100, abreg->regreg,
+						&regval);
+	if (err) {
+		dev_err(&reg->dev, "unable to get register 0x%x\n",
+			abreg->regreg);
+		return err;
+	}
+	regval &= ~AB3100_REG_ON_MASK;
+	return ab3100_set_register_interruptible(abreg->ab3100, abreg->regreg,
+						 regval);
+}
+
+static int ab3100_is_enabled_regulator(struct regulator_dev *reg)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+	u8 regval;
+	int err;
+
+	err = ab3100_get_register_interruptible(abreg->ab3100, abreg->regreg,
+						&regval);
+	if (err) {
+		dev_err(&reg->dev, "unable to get register 0x%x\n",
+			abreg->regreg);
+		return err;
+	}
+
+	return regval & AB3100_REG_ON_MASK;
+}
+
+static int ab3100_list_voltage_regulator(struct regulator_dev *reg,
+					 unsigned selector)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+
+	if (selector > abreg->voltages_len)
+		return -EINVAL;
+	return abreg->typ_voltages[selector];
+}
+
+static int ab3100_get_voltage_regulator(struct regulator_dev *reg)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+	u8 regval;
+	int err;
+
+	/* Return the voltage for fixed regulators immediately */
+	if (abreg->fixed_voltage)
+		return abreg->fixed_voltage;
+
+	/*
+	 * For variable types, read out setting and index into
+	 * supplied voltage list.
+	 */
+	err = ab3100_get_register_interruptible(abreg->ab3100,
+						abreg->regreg, &regval);
+	if (err) {
+		dev_warn(&reg->dev,
+			 "failed to get regulator value in register %02x\n",
+			 abreg->regreg);
+		return err;
+	}
+
+	/* The 3 highest bits index voltages */
+	regval &= 0xE0;
+	regval >>= 5;
+
+	if (regval > abreg->voltages_len) {
+		dev_err(&reg->dev,
+			"regulator register %02x contains an illegal voltage setting\n",
+			abreg->regreg);
+		return -EINVAL;
+	}
+
+	return abreg->typ_voltages[regval];
+}
+
+static int ab3100_get_best_voltage_index(struct regulator_dev *reg,
+				   int min_uV, int max_uV)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+	int i;
+	int bestmatch;
+	int bestindex;
+
+	/*
+	 * Locate the minimum voltage fitting the criteria on
+	 * this regulator. The switchable voltages are not
+	 * in strict falling order so we need to check them
+	 * all for the best match.
+	 */
+	bestmatch = INT_MAX;
+	bestindex = -1;
+	for (i = 0; i < abreg->voltages_len; i++) {
+		if (abreg->typ_voltages[i] <= max_uV &&
+		    abreg->typ_voltages[i] >= min_uV &&
+		    abreg->typ_voltages[i] < bestmatch) {
+			bestmatch = abreg->typ_voltages[i];
+			bestindex = i;
+		}
+	}
+
+	if (bestindex < 0) {
+		dev_warn(&reg->dev, "requested %d<=x<=%d uV, out of range!\n",
+			 min_uV, max_uV);
+		return -EINVAL;
+	}
+	return bestindex;
+}
+
+static int ab3100_set_voltage_regulator(struct regulator_dev *reg,
+					int min_uV, int max_uV)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+	u8 regval;
+	int err;
+	int bestindex;
+
+	bestindex = ab3100_get_best_voltage_index(reg, min_uV, max_uV);
+	if (bestindex < 0)
+		return bestindex;
+
+	err = ab3100_get_register_interruptible(abreg->ab3100,
+						abreg->regreg, &regval);
+	if (err) {
+		dev_warn(&reg->dev,
+			 "failed to get regulator register %02x\n",
+			 abreg->regreg);
+		return err;
+	}
+
+	/* The highest three bits control the variable regulators */
+	regval &= ~0xE0;
+	regval |= (bestindex << 5);
+
+	err = ab3100_set_register_interruptible(abreg->ab3100,
+						abreg->regreg, regval);
+	if (err)
+		dev_warn(&reg->dev, "failed to set regulator register %02x\n",
+			abreg->regreg);
+
+	return err;
+}
+
+static int ab3100_set_suspend_voltage_regulator(struct regulator_dev *reg,
+						int uV)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+	u8 regval;
+	int err;
+	int bestindex;
+	u8 targetreg;
+
+	if (abreg->regreg == AB3100_LDO_E)
+		targetreg = AB3100_LDO_E_SLEEP;
+	else if (abreg->regreg == AB3100_BUCK)
+		targetreg = AB3100_BUCK_SLEEP;
+	else
+		return -EINVAL;
+
+	/* LDO E and BUCK have special suspend voltages you can set */
+	bestindex = ab3100_get_best_voltage_index(reg, uV, uV);
+
+	err = ab3100_get_register_interruptible(abreg->ab3100,
+						targetreg, &regval);
+	if (err) {
+		dev_warn(&reg->dev,
+			 "failed to get regulator register %02x\n",
+			 targetreg);
+		return err;
+	}
+
+	/* The highest three bits control the variable regulators */
+	regval &= ~0xE0;
+	regval |= (bestindex << 5);
+
+	err = ab3100_set_register_interruptible(abreg->ab3100,
+						targetreg, regval);
+	if (err)
+		dev_warn(&reg->dev, "failed to set regulator register %02x\n",
+			abreg->regreg);
+
+	return err;
+}
+
+/*
+ * The external regulator can just define a fixed voltage.
+ */
+static int ab3100_get_voltage_regulator_external(struct regulator_dev *reg)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+
+	return abreg->plfdata->external_voltage;
+}
+
+static struct regulator_ops regulator_ops_fixed = {
+	.enable      = ab3100_enable_regulator,
+	.disable     = ab3100_disable_regulator,
+	.is_enabled  = ab3100_is_enabled_regulator,
+	.get_voltage = ab3100_get_voltage_regulator,
+};
+
+static struct regulator_ops regulator_ops_variable = {
+	.enable      = ab3100_enable_regulator,
+	.disable     = ab3100_disable_regulator,
+	.is_enabled  = ab3100_is_enabled_regulator,
+	.get_voltage = ab3100_get_voltage_regulator,
+	.set_voltage = ab3100_set_voltage_regulator,
+	.list_voltage = ab3100_list_voltage_regulator,
+};
+
+static struct regulator_ops regulator_ops_variable_sleepable = {
+	.enable      = ab3100_enable_regulator,
+	.disable     = ab3100_disable_regulator,
+	.is_enabled  = ab3100_is_enabled_regulator,
+	.get_voltage = ab3100_get_voltage_regulator,
+	.set_voltage = ab3100_set_voltage_regulator,
+	.set_suspend_voltage = ab3100_set_suspend_voltage_regulator,
+	.list_voltage = ab3100_list_voltage_regulator,
+};
+
+/*
+ * LDO EXT is an external regulator so it is really
+ * not possible to set any voltage locally here, AB3100
+ * is an on/off switch plain an simple. The external
+ * voltage is defined in the board set-up if any.
+ */
+static struct regulator_ops regulator_ops_external = {
+	.enable      = ab3100_enable_regulator,
+	.disable     = ab3100_disable_regulator,
+	.is_enabled  = ab3100_is_enabled_regulator,
+	.get_voltage = ab3100_get_voltage_regulator_external,
+};
+
+static struct regulator_desc
+ab3100_regulator_desc[AB3100_NUM_REGULATORS] = {
+	{
+		.name = "LDO_A",
+		.id   = AB3100_LDO_A,
+		.ops  = &regulator_ops_fixed,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_C",
+		.id   = AB3100_LDO_C,
+		.ops  = &regulator_ops_fixed,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_D",
+		.id   = AB3100_LDO_D,
+		.ops  = &regulator_ops_fixed,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_E",
+		.id   = AB3100_LDO_E,
+		.ops  = &regulator_ops_variable_sleepable,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_F",
+		.id   = AB3100_LDO_F,
+		.ops  = &regulator_ops_variable,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_G",
+		.id   = AB3100_LDO_G,
+		.ops  = &regulator_ops_variable,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_H",
+		.id   = AB3100_LDO_H,
+		.ops  = &regulator_ops_variable,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_K",
+		.id   = AB3100_LDO_K,
+		.ops  = &regulator_ops_variable,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "LDO_EXT",
+		.id   = AB3100_LDO_EXT,
+		.ops  = &regulator_ops_external,
+		.type = REGULATOR_VOLTAGE,
+	},
+	{
+		.name = "BUCK",
+		.id   = AB3100_BUCK,
+		.ops  = &regulator_ops_variable_sleepable,
+		.type = REGULATOR_VOLTAGE,
+	},
+};
+
+/*
+ * NOTE: the following functions are regulators pluralis - it is the
+ * binding to the AB3100 core driver and the parent platform device
+ * for all the different regulators.
+ */
+
+static int __init ab3100_regulators_probe(struct platform_device *pdev)
+{
+	struct ab3100_platform_data *plfdata = pdev->dev.platform_data;
+	struct ab3100 *ab3100 = platform_get_drvdata(pdev);
+	int err = 0;
+	u8 data;
+	int i;
+
+	/* Check chip state */
+	err = ab3100_get_register_interruptible(ab3100,
+						AB3100_LDO_D, &data);
+	if (err) {
+		dev_err(&pdev->dev, "could not read initial status of LDO_D\n");
+		return err;
+	}
+	if (data & 0x10)
+		dev_notice(&pdev->dev,
+			   "chip is already in active mode (Warm start)\n");
+	else
+		dev_notice(&pdev->dev,
+			   "chip is in inactive mode (Cold start)\n");
+
+	/* Set up regulators */
+	for (i = 0; i < ARRAY_SIZE(ab3100_reg_init_order); i++) {
+		err = ab3100_set_register_interruptible(ab3100,
+					ab3100_reg_init_order[i],
+					plfdata->reg_initvals[i]);
+		if (err) {
+			dev_err(&pdev->dev, "regulator initialization failed with error %d\n",
+				err);
+			return err;
+		}
+	}
+
+	if (err) {
+		dev_err(&pdev->dev,
+			"LDO D regulator initialization failed with error %d\n",
+			err);
+		return err;
+	}
+
+	/* Register the regulators */
+	for (i = 0; i < AB3100_NUM_REGULATORS; i++) {
+		struct ab3100_regulator *reg = &ab3100_regulators[i];
+		struct regulator_dev *rdev;
+
+		/*
+		 * Initialize per-regulator struct.
+		 * Inherit platform data, this comes down from the
+		 * i2c boarddata, from the machine. So if you want to
+		 * see what it looks like for a certain machine, go
+		 * into the machine I2C setup.
+		 */
+		reg->ab3100 = ab3100;
+		reg->plfdata = plfdata;
+
+		/*
+		 * Register the regulator, pass around
+		 * the ab3100_regulator struct
+		 */
+		rdev = regulator_register(&ab3100_regulator_desc[i],
+					  &pdev->dev,
+					  &plfdata->reg_constraints[i],
+					  reg);
+
+		if (IS_ERR(rdev)) {
+			err = PTR_ERR(rdev);
+			dev_err(&pdev->dev,
+				"%s: failed to register regulator %s err %d\n",
+				__func__, ab3100_regulator_desc[i].name,
+				err);
+			i--;
+			/* remove the already registered regulators */
+			while (i > 0) {
+				regulator_unregister(ab3100_regulators[i].rdev);
+				i--;
+			}
+			return err;
+		}
+
+		/* Then set a pointer back to the registered regulator */
+		reg->rdev = rdev;
+	}
+
+	return 0;
+}
+
+static int __exit ab3100_regulators_remove(struct platform_device *pdev)
+{
+	int i;
+
+	for (i = 0; i < AB3100_NUM_REGULATORS; i++) {
+		struct ab3100_regulator *reg = &ab3100_regulators[i];
+
+		regulator_unregister(reg->rdev);
+	}
+	return 0;
+}
+
+static struct platform_driver ab3100_regulators_driver = {
+	.driver = {
+		.name  = "ab3100-regulators",
+		.owner = THIS_MODULE,
+	},
+	.probe = ab3100_regulators_probe,
+	.remove = __exit_p(ab3100_regulators_remove),
+};
+
+static __init int ab3100_regulators_init(void)
+{
+	return platform_driver_register(&ab3100_regulators_driver);
+}
+
+static __exit void ab3100_regulators_exit(void)
+{
+	platform_driver_register(&ab3100_regulators_driver);
+}
+
+subsys_initcall(ab3100_regulators_init);
+module_exit(ab3100_regulators_exit);
+
+MODULE_AUTHOR("Mattias Wallin <mattias.wallin@stericsson.com>");
+MODULE_DESCRIPTION("AB3100 Regulator driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:ab3100-regulators");
diff --git a/include/linux/mfd/ab3100.h b/include/linux/mfd/ab3100.h
index 3ead5cfb638c..e9aa4c9d749d 100644
--- a/include/linux/mfd/ab3100.h
+++ b/include/linux/mfd/ab3100.h
@@ -7,6 +7,7 @@
 
 #include <linux/device.h>
 #include <linux/workqueue.h>
+#include <linux/regulator/machine.h>
 
 #ifndef MFD_AB3100_H
 #define MFD_AB3100_H
@@ -57,6 +58,14 @@
 #define AB3100_STR_BATT_REMOVAL				(0x40)
 #define AB3100_STR_VBUS					(0x80)
 
+/*
+ * AB3100 contains 8 regulators, one external regulator controller
+ * and a buck converter, further the LDO E and buck converter can
+ * have separate settings if they are in sleep mode, this is
+ * modeled as a separate regulator.
+ */
+#define AB3100_NUM_REGULATORS				10
+
 /**
  * struct ab3100
  * @access_mutex: lock out concurrent accesses to the AB3100 registers
@@ -87,6 +96,25 @@ struct ab3100 {
 	bool startup_events_read;
 };
 
+/**
+ * struct ab3100_platform_data
+ * Data supplied to initialize board connections to the AB3100
+ * @reg_constraints: regulator constraints for target board
+ *     the order of these constraints are: LDO A, C, D, E,
+ *     F, G, H, K, EXT and BUCK.
+ * @reg_initvals: initial values for the regulator registers
+ *     plus two sleep settings for LDO E and the BUCK converter.
+ *     exactly AB3100_NUM_REGULATORS+2 values must be sent in.
+ *     Order: LDO A, C, E, E sleep, F, G, H, K, EXT, BUCK,
+ *     BUCK sleep, LDO D. (LDO D need to be initialized last.)
+ * @external_voltage: voltage level of the external regulator.
+ */
+struct ab3100_platform_data {
+	struct regulator_init_data reg_constraints[AB3100_NUM_REGULATORS];
+	u8 reg_initvals[AB3100_NUM_REGULATORS+2];
+	int external_voltage;
+};
+
 int ab3100_set_register_interruptible(struct ab3100 *ab3100, u8 reg, u8 regval);
 int ab3100_get_register_interruptible(struct ab3100 *ab3100, u8 reg, u8 *regval);
 int ab3100_get_register_page_interruptible(struct ab3100 *ab3100,
-- 
cgit v1.2.3-71-gd317


From ad4b78bbcbab66998b05d422ac6106b645796e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Sep 2009 12:31:31 +0200
Subject: sched: Add new wakeup preemption mode: WAKEUP_RUNNING

Create a new wakeup preemption mode, preempt towards tasks that run
shorter on avg. It sets next buddy to be sure we actually run the task
we preempted for.

Test results:

 root@twins:~# while :; do :; done &
 [1] 6537
 root@twins:~# while :; do :; done &
 [2] 6538
 root@twins:~# while :; do :; done &
 [3] 6539
 root@twins:~# while :; do :; done &
 [4] 6540

 root@twins:/home/peter# ./latt -c4 sleep 4
 Entries: 48 (clients=4)

 Averages:
 ------------------------------
        Max          4750 usec
        Avg           497 usec
        Stdev         737 usec

 root@twins:/home/peter# echo WAKEUP_RUNNING > /debug/sched_features

 root@twins:/home/peter# ./latt -c4 sleep 4
 Entries: 48 (clients=4)

 Averages:
 ------------------------------
        Max            14 usec
        Avg             5 usec
        Stdev           3 usec

Disabled by default - needs more testing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>
---
 include/linux/sched.h   |  2 ++
 kernel/sched.c          | 17 ++++++++++-------
 kernel/sched_debug.c    |  1 +
 kernel/sched_fair.c     | 14 +++++++++++---
 kernel/sched_features.h |  5 +++++
 5 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4a39bb2b4a4..8af3d249170e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1113,6 +1113,8 @@ struct sched_entity {
 	u64			start_runtime;
 	u64			avg_wakeup;
 
+	u64			avg_running;
+
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_max;
diff --git a/kernel/sched.c b/kernel/sched.c
index 969dfaef2465..3bb4ea2ee6f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2458,6 +2458,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.avg_overlap		= 0;
 	p->se.start_runtime		= 0;
 	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;
+	p->se.avg_running		= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start			= 0;
@@ -5310,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
+static void put_prev_task(struct rq *rq, struct task_struct *p)
 {
-	if (prev->state == TASK_RUNNING) {
-		u64 runtime = prev->se.sum_exec_runtime;
+	u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
 
-		runtime -= prev->se.prev_sum_exec_runtime;
-		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+	update_avg(&p->se.avg_running, runtime);
 
+	if (p->state == TASK_RUNNING) {
 		/*
 		 * In order to avoid avg_overlap growing stale when we are
 		 * indeed overlapping and hence not getting put to sleep, grow
@@ -5327,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 		 * correlates to the amount of cache footprint a task can
 		 * build up.
 		 */
-		update_avg(&prev->se.avg_overlap, runtime);
+		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+		update_avg(&p->se.avg_overlap, runtime);
+	} else {
+		update_avg(&p->se.avg_running, 0);
 	}
-	prev->sched_class->put_prev_task(rq, prev);
+	p->sched_class->put_prev_task(rq, p);
 }
 
 /*
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ddbd0891267..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.sum_exec_runtime);
 	PN(se.avg_overlap);
 	PN(se.avg_wakeup);
+	PN(se.avg_running);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c741cd9d38de..3e6f78c66876 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1605,9 +1605,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 	}
 
-	if (!sched_feat(WAKEUP_PREEMPT))
-		return;
-
 	if ((sched_feat(WAKEUP_SYNC) && sync) ||
 	    (sched_feat(WAKEUP_OVERLAP) &&
 	     (se->avg_overlap < sysctl_sched_migration_cost &&
@@ -1616,6 +1613,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 	}
 
+	if (sched_feat(WAKEUP_RUNNING)) {
+		if (pse->avg_running < se->avg_running) {
+			set_next_buddy(pse);
+			resched_task(curr);
+			return;
+		}
+	}
+
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
+
 	find_matching_se(&se, &pse);
 
 	BUG_ON(!pse);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -53,6 +53,11 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
  */
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 
+/*
+ * Wakeup preemption towards tasks that run short
+ */
+SCHED_FEAT(WAKEUP_RUNNING, 0)
+
 /*
  * Use the SYNC wakeup hint, pipes and the likes use this to indicate
  * the remote end is likely to consume the data we just wrote, and
-- 
cgit v1.2.3-71-gd317


From c4c259bcc27c4242b012106afdba183622b1735f Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Tue, 15 Sep 2009 16:27:45 +0200
Subject: HID: consolidate connect and disconnect into core code

HID core registers input, hidraw and hiddev devices, but leaves
unregistering it up to the individual driver, which is not really nice.
Let's move all the logic to the core.

Reported-by: Marcel Holtmann <marcel@holtmann.org>
Reported-by: Brian Rogers <brian@xyzw.org>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c        | 11 +++++++++++
 drivers/hid/usbhid/hid-core.c | 16 +++++-----------
 include/linux/hid.h           |  3 +++
 net/bluetooth/hidp/core.c     |  7 -------
 4 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index ca9bb26c2076..be34d32906bd 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1237,6 +1237,17 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask)
 }
 EXPORT_SYMBOL_GPL(hid_connect);
 
+void hid_disconnect(struct hid_device *hdev)
+{
+	if (hdev->claimed & HID_CLAIMED_INPUT)
+		hidinput_disconnect(hdev);
+	if (hdev->claimed & HID_CLAIMED_HIDDEV)
+		hdev->hiddev_disconnect(hdev);
+	if (hdev->claimed & HID_CLAIMED_HIDRAW)
+		hidraw_disconnect(hdev);
+}
+EXPORT_SYMBOL_GPL(hid_disconnect);
+
 /* a list of devices for which there is a specialized driver on HID bus */
 static const struct hid_device_id hid_blacklist[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_WCP32PU) },
diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index 1b0e07a67d6d..03bd703255a3 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -1041,13 +1041,6 @@ static void usbhid_stop(struct hid_device *hid)
 
 	hid_cancel_delayed_stuff(usbhid);
 
-	if (hid->claimed & HID_CLAIMED_INPUT)
-		hidinput_disconnect(hid);
-	if (hid->claimed & HID_CLAIMED_HIDDEV)
-		hiddev_disconnect(hid);
-	if (hid->claimed & HID_CLAIMED_HIDRAW)
-		hidraw_disconnect(hid);
-
 	hid->claimed = 0;
 
 	usb_free_urb(usbhid->urbin);
@@ -1085,7 +1078,7 @@ static struct hid_ll_driver usb_hid_driver = {
 	.hidinput_input_event = usb_hidinput_input_event,
 };
 
-static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id)
+static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *id)
 {
 	struct usb_host_interface *interface = intf->cur_altsetting;
 	struct usb_device *dev = interface_to_usbdev(intf);
@@ -1117,6 +1110,7 @@ static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id)
 	hid->ff_init = hid_pidff_init;
 #ifdef CONFIG_USB_HIDDEV
 	hid->hiddev_connect = hiddev_connect;
+	hid->hiddev_disconnect = hiddev_disconnect;
 	hid->hiddev_hid_event = hiddev_hid_event;
 	hid->hiddev_report_event = hiddev_report_event;
 #endif
@@ -1177,7 +1171,7 @@ err:
 	return ret;
 }
 
-static void hid_disconnect(struct usb_interface *intf)
+static void usbhid_disconnect(struct usb_interface *intf)
 {
 	struct hid_device *hid = usb_get_intfdata(intf);
 	struct usbhid_device *usbhid;
@@ -1359,8 +1353,8 @@ MODULE_DEVICE_TABLE (usb, hid_usb_ids);
 
 static struct usb_driver hid_driver = {
 	.name =		"usbhid",
-	.probe =	hid_probe,
-	.disconnect =	hid_disconnect,
+	.probe =	usbhid_probe,
+	.disconnect =	usbhid_disconnect,
 #ifdef CONFIG_PM
 	.suspend =	hid_suspend,
 	.resume =	hid_resume,
diff --git a/include/linux/hid.h b/include/linux/hid.h
index a0ebdace7baa..10f628416740 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -494,6 +494,7 @@ struct hid_device {							/* device report descriptor */
 
 	/* hiddev event handler */
 	int (*hiddev_connect)(struct hid_device *, unsigned int);
+	void (*hiddev_disconnect)(struct hid_device *);
 	void (*hiddev_hid_event) (struct hid_device *, struct hid_field *field,
 				  struct hid_usage *, __s32);
 	void (*hiddev_report_event) (struct hid_device *, struct hid_report *);
@@ -691,6 +692,7 @@ struct hid_device *hid_allocate_device(void);
 int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size);
 int hid_check_keys_pressed(struct hid_device *hid);
 int hid_connect(struct hid_device *hid, unsigned int connect_mask);
+void hid_disconnect(struct hid_device *hid);
 
 /**
  * hid_map_usage - map usage input bits
@@ -800,6 +802,7 @@ static inline int __must_check hid_hw_start(struct hid_device *hdev,
  */
 static inline void hid_hw_stop(struct hid_device *hdev)
 {
+	hid_disconnect(hdev);
 	hdev->ll_driver->stop(hdev);
 }
 
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 09bedeb5579c..49d8495d69be 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -577,11 +577,6 @@ static int hidp_session(void *arg)
 	}
 
 	if (session->hid) {
-		if (session->hid->claimed & HID_CLAIMED_INPUT)
-			hidinput_disconnect(session->hid);
-		if (session->hid->claimed & HID_CLAIMED_HIDRAW)
-			hidraw_disconnect(session->hid);
-
 		hid_destroy_device(session->hid);
 		session->hid = NULL;
 	}
@@ -747,8 +742,6 @@ static void hidp_stop(struct hid_device *hid)
 	skb_queue_purge(&session->ctrl_transmit);
 	skb_queue_purge(&session->intr_transmit);
 
-	if (hid->claimed & HID_CLAIMED_INPUT)
-		hidinput_disconnect(hid);
 	hid->claimed = 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From 37bce07077b0c335d8747f1ddb27ad585434a47e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 16 Sep 2009 19:07:32 +0100
Subject: mfd: Convert WM8350 to use request_threaded_irq()

Instead of hand rolling our own variant.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm8350-core.c       | 24 ++++++------------------
 include/linux/mfd/wm8350/core.h |  1 -
 2 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c
index 9d662a576a41..ba27c9dc1ad3 100644
--- a/drivers/mfd/wm8350-core.c
+++ b/drivers/mfd/wm8350-core.c
@@ -353,15 +353,15 @@ static void wm8350_irq_call_handler(struct wm8350 *wm8350, int irq)
 }
 
 /*
- * wm8350_irq_worker actually handles the interrupts.  Since all
+ * This is a threaded IRQ handler so can access I2C/SPI.  Since all
  * interrupts are clear on read the IRQ line will be reasserted and
  * the physical IRQ will be handled again if another interrupt is
  * asserted while we run - in the normal course of events this is a
  * rare occurrence so we save I2C/SPI reads.
  */
-static void wm8350_irq_worker(struct work_struct *work)
+static irqreturn_t wm8350_irq(int irq, void *data)
 {
-	struct wm8350 *wm8350 = container_of(work, struct wm8350, irq_work);
+	struct wm8350 *wm8350 = data;
 	u16 level_one, status1, status2, comp;
 
 	/* TODO: Use block reads to improve performance? */
@@ -552,16 +552,6 @@ static void wm8350_irq_worker(struct work_struct *work)
 		}
 	}
 
-	enable_irq(wm8350->chip_irq);
-}
-
-static irqreturn_t wm8350_irq(int irq, void *data)
-{
-	struct wm8350 *wm8350 = data;
-
-	disable_irq_nosync(irq);
-	schedule_work(&wm8350->irq_work);
-
 	return IRQ_HANDLED;
 }
 
@@ -1428,9 +1418,8 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq,
 
 	mutex_init(&wm8350->auxadc_mutex);
 	mutex_init(&wm8350->irq_mutex);
-	INIT_WORK(&wm8350->irq_work, wm8350_irq_worker);
 	if (irq) {
-		int flags = 0;
+		int flags = IRQF_ONESHOT;
 
 		if (pdata && pdata->irq_high) {
 			flags |= IRQF_TRIGGER_HIGH;
@@ -1444,8 +1433,8 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq,
 					  WM8350_IRQ_POL);
 		}
 
-		ret = request_irq(irq, wm8350_irq, flags,
-				  "wm8350", wm8350);
+		ret = request_threaded_irq(irq, NULL, wm8350_irq, flags,
+					   "wm8350", wm8350);
 		if (ret != 0) {
 			dev_err(wm8350->dev, "Failed to request IRQ: %d\n",
 				ret);
@@ -1505,7 +1494,6 @@ void wm8350_device_exit(struct wm8350 *wm8350)
 	platform_device_unregister(wm8350->codec.pdev);
 
 	free_irq(wm8350->chip_irq, wm8350);
-	flush_work(&wm8350->irq_work);
 	kfree(wm8350->reg_cache);
 }
 EXPORT_SYMBOL_GPL(wm8350_device_exit);
diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h
index 969b0b55615c..1d595de6a055 100644
--- a/include/linux/mfd/wm8350/core.h
+++ b/include/linux/mfd/wm8350/core.h
@@ -626,7 +626,6 @@ struct wm8350 {
 	struct mutex auxadc_mutex;
 
 	/* Interrupt handling */
-	struct work_struct irq_work;
 	struct mutex irq_mutex; /* IRQ table mutex */
 	struct wm8350_irq irq[WM8350_NUM_IRQ];
 	int chip_irq;
-- 
cgit v1.2.3-71-gd317


From c0826574ddc0df486ecfc2d655e08904c6513209 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 17 Sep 2009 17:03:06 +1000
Subject: nfsd: return success for non-NFS4 nfs4_state_start

Today's linux-next build (sparc64_defconfig) failed like this:

In file included from arch/sparc/kernel/sys_sparc32.c:32:
include/linux/nfsd/nfsd.h: In function 'nfs4_state_start':
include/linux/nfsd/nfsd.h:177: error: no return statement in function returning non-void

Caused by commit 29ab23cc5d351658d01a4327d55e9106a73fd04f ("nfsd4: allow
nfs4 state startup to fail").  Please, if you add code that depends on a
CONFIG option, build with that option enabled and disabled.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/nfsd/nfsd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 24fdf89cea83..03bbe9039104 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -174,7 +174,7 @@ int nfs4_reset_recoverydir(char *recdir);
 #else
 static inline int nfs4_state_init(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
-static inline int nfs4_state_start(void) { }
+static inline int nfs4_state_start(void) { return 0; }
 static inline void nfs4_state_shutdown(void) { }
 static inline time_t nfs4_lease_time(void) { return 0; }
 static inline void nfs4_reset_lease(time_t leasetime) { }
-- 
cgit v1.2.3-71-gd317


From e2d4304b7d2b85c45de89ec420037d6b9261a12d Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Thu, 17 Sep 2009 09:40:31 -0700
Subject: PCI: fix VGA arbiter header file

Remove reference to vgaarb.c and replace it with a comment about the
arbiter itself.

Reported-by: Tiago Vignatti <tiago.vignatti@nokia.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/vgaarb.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index e81c64af80c1..a1fa1da8dc1a 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -1,5 +1,6 @@
 /*
- * vgaarb.c
+ * The VGA aribiter manages VGA space routing and VGA resource decode to
+ * allow multiple VGA devices to be used in a system in a safe way.
  *
  * (C) Copyright 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
  * (C) Copyright 2007 Paulo R. Zanoni <przanoni@gmail.com>
-- 
cgit v1.2.3-71-gd317


From 2667de81f3256c944b06abdf2c56c2f192fcb724 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 17 Sep 2009 19:01:10 +0200
Subject: perf_counter: Allow for a wakeup watermark

Currently we wake the mmap() consumer once every PAGE_SIZE of data
and/or once event wakeup_events when specified.

For high speed sampling this results in too many wakeups wrt. the
buffer size, hence change this.

We move the default wakeup limit to 1/4-th the buffer size, and
provide for means to manually specify this limit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 10 ++++++++--
 kernel/perf_counter.c        | 32 +++++++++++++++++++-------------
 2 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 972f90d7a32f..6c1ef72ea501 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -199,10 +199,14 @@ struct perf_counter_attr {
 				inherit_stat   :  1, /* per task counts       */
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
+				watermark      :  1, /* wakeup_watermark      */
 
-				__reserved_1   : 50;
+				__reserved_1   : 49;
 
-	__u32			wakeup_events;	/* wakeup every n events */
+	union {
+		__u32		wakeup_events;	  /* wakeup every n events */
+		__u32		wakeup_watermark; /* bytes before wakeup   */
+	};
 	__u32			__reserved_2;
 
 	__u64			__reserved_3;
@@ -521,6 +525,8 @@ struct perf_mmap_data {
 	atomic_t			wakeup;		/* needs a wakeup    */
 	atomic_t			lost;		/* nr records lost   */
 
+	long				watermark;	/* wakeup watermark  */
+
 	struct perf_counter_mmap_page   *user_page;
 	void				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fe0d1adde804..29b73b6e8146 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
 	data->nr_pages = nr_pages;
 	atomic_set(&data->lock, -1);
 
+	if (counter->attr.watermark) {
+		data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+				      counter->attr.wakeup_watermark);
+	}
+	if (!data->watermark)
+		data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
 	rcu_assign_pointer(counter->data, data);
 
 	return 0;
@@ -2517,23 +2524,15 @@ struct perf_output_handle {
 	unsigned long		flags;
 };
 
-static bool perf_output_space(struct perf_mmap_data *data,
-			      unsigned int offset, unsigned int head)
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+			      unsigned long offset, unsigned long head)
 {
-	unsigned long tail;
 	unsigned long mask;
 
 	if (!data->writable)
 		return true;
 
 	mask = (data->nr_pages << PAGE_SHIFT) - 1;
-	/*
-	 * Userspace could choose to issue a mb() before updating the tail
-	 * pointer. So that all reads will be completed before the write is
-	 * issued.
-	 */
-	tail = ACCESS_ONCE(data->user_page->data_tail);
-	smp_rmb();
 
 	offset = (offset - tail) & mask;
 	head   = (head   - tail) & mask;
@@ -2679,7 +2678,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 {
 	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
-	unsigned int offset, head;
+	unsigned long tail, offset, head;
 	int have_lost;
 	struct {
 		struct perf_event_header header;
@@ -2717,16 +2716,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_lock(handle);
 
 	do {
+		/*
+		 * Userspace could choose to issue a mb() before updating the
+		 * tail pointer. So that all reads will be completed before the
+		 * write is issued.
+		 */
+		tail = ACCESS_ONCE(data->user_page->data_tail);
+		smp_rmb();
 		offset = head = atomic_long_read(&data->head);
 		head += size;
-		if (unlikely(!perf_output_space(data, offset, head)))
+		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset	= offset;
 	handle->head	= head;
 
-	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+	if (head - tail > data->watermark)
 		atomic_set(&data->wakeup, 1);
 
 	if (have_lost) {
-- 
cgit v1.2.3-71-gd317


From d15d6e6cc340566d53d953ffdec2c9e96816fa52 Mon Sep 17 00:00:00 2001
From: "John(Jung-Ik) Lee" <jilee@google.com>
Date: Mon, 14 Sep 2009 21:32:33 -0700
Subject: libata: Add pata_atp867x driver for Artop/Acard ATP867X controllers

This is a new pata driver for ARTOP 867X 64bit 4-channel UDMA133 ATA ctrls.
Based on the Atp867 data sheet rev 1.2, Acard, and in part on early ide codes
from Eric Uhrhane <ericu@google.com>.

Signed-off-by: John(Jung-Ik) Lee <jilee@google.com>
Reviewed-by:  Grant Grundler <grundler@google.com>
Reviewed-by:  Gwendal Gringo <gwendal@google.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/Kconfig        |   9 +
 drivers/ata/Makefile       |   1 +
 drivers/ata/pata_atp867x.c | 548 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h    |   2 +
 4 files changed, 560 insertions(+)
 create mode 100644 drivers/ata/pata_atp867x.c

(limited to 'include/linux')

diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index ab2fa4eeb364..f2df6e2a224c 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -255,6 +255,15 @@ config PATA_ARTOP
 
 	  If unsure, say N.
 
+config PATA_ATP867X
+	tristate "ARTOP/Acard ATP867X PATA support"
+	depends on PCI
+	help
+	  This option enables support for ARTOP/Acard ATP867X PATA
+	  controllers.
+
+	  If unsure, say N.
+
 config PATA_AT32
 	tristate "Atmel AVR32 PATA support (Experimental)"
 	depends on AVR32 && PLATFORM_AT32AP && EXPERIMENTAL
diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
index 463eb52236aa..01e126f343b3 100644
--- a/drivers/ata/Makefile
+++ b/drivers/ata/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_SATA_FSL)		+= sata_fsl.o
 obj-$(CONFIG_PATA_ALI)		+= pata_ali.o
 obj-$(CONFIG_PATA_AMD)		+= pata_amd.o
 obj-$(CONFIG_PATA_ARTOP)	+= pata_artop.o
+obj-$(CONFIG_PATA_ATP867X)	+= pata_atp867x.o
 obj-$(CONFIG_PATA_AT32)		+= pata_at32.o
 obj-$(CONFIG_PATA_ATIIXP)	+= pata_atiixp.o
 obj-$(CONFIG_PATA_CMD640_PCI)	+= pata_cmd640.o
diff --git a/drivers/ata/pata_atp867x.c b/drivers/ata/pata_atp867x.c
new file mode 100644
index 000000000000..7990de925d2e
--- /dev/null
+++ b/drivers/ata/pata_atp867x.c
@@ -0,0 +1,548 @@
+/*
+ * pata_atp867x.c - ARTOP 867X 64bit 4-channel UDMA133 ATA controller driver
+ *
+ *	(C) 2009 Google Inc. John(Jung-Ik) Lee <jilee@google.com>
+ *
+ * Per Atp867 data sheet rev 1.2, Acard.
+ * Based in part on early ide code from
+ *	2003-2004 by Eric Uhrhane, Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * TODO:
+ *   1. RAID features [comparison, XOR, striping, mirroring, etc.]
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <scsi/scsi_host.h>
+#include <linux/libata.h>
+
+#define	DRV_NAME	"pata_atp867x"
+#define	DRV_VERSION	"0.7.5"
+
+/*
+ * IO Registers
+ * Note that all runtime hot priv ports are cached in ap private_data
+ */
+
+enum {
+	ATP867X_IO_CHANNEL_OFFSET	= 0x10,
+
+	/*
+	 * IO Register Bitfields
+	 */
+
+	ATP867X_IO_PIOSPD_ACTIVE_SHIFT	= 4,
+	ATP867X_IO_PIOSPD_RECOVER_SHIFT	= 0,
+
+	ATP867X_IO_DMAMODE_MSTR_SHIFT	= 0,
+	ATP867X_IO_DMAMODE_MSTR_MASK	= 0x07,
+	ATP867X_IO_DMAMODE_SLAVE_SHIFT	= 4,
+	ATP867X_IO_DMAMODE_SLAVE_MASK	= 0x70,
+
+	ATP867X_IO_DMAMODE_UDMA_6	= 0x07,
+	ATP867X_IO_DMAMODE_UDMA_5	= 0x06,
+	ATP867X_IO_DMAMODE_UDMA_4	= 0x05,
+	ATP867X_IO_DMAMODE_UDMA_3	= 0x04,
+	ATP867X_IO_DMAMODE_UDMA_2	= 0x03,
+	ATP867X_IO_DMAMODE_UDMA_1	= 0x02,
+	ATP867X_IO_DMAMODE_UDMA_0	= 0x01,
+	ATP867X_IO_DMAMODE_DISABLE	= 0x00,
+
+	ATP867X_IO_SYS_INFO_66MHZ	= 0x04,
+	ATP867X_IO_SYS_INFO_SLOW_UDMA5	= 0x02,
+	ATP867X_IO_SYS_MASK_RESERVED	= (~0xf1),
+
+	ATP867X_IO_PORTSPD_VAL		= 0x1143,
+	ATP867X_PREREAD_VAL		= 0x0200,
+
+	ATP867X_NUM_PORTS		= 4,
+	ATP867X_BAR_IOBASE		= 0,
+	ATP867X_BAR_ROMBASE		= 6,
+};
+
+#define ATP867X_IOBASE(ap)		((ap)->host->iomap[0])
+#define ATP867X_SYS_INFO(ap)		(0x3F + ATP867X_IOBASE(ap))
+
+#define ATP867X_IO_PORTBASE(ap, port)	(0x00 + ATP867X_IOBASE(ap) + \
+					(port) * ATP867X_IO_CHANNEL_OFFSET)
+#define ATP867X_IO_DMABASE(ap, port)	(0x40 + \
+					ATP867X_IO_PORTBASE((ap), (port)))
+
+#define ATP867X_IO_STATUS(ap, port)	(0x07 + \
+					ATP867X_IO_PORTBASE((ap), (port)))
+#define ATP867X_IO_ALTSTATUS(ap, port)	(0x0E + \
+					ATP867X_IO_PORTBASE((ap), (port)))
+
+/*
+ * hot priv ports
+ */
+#define ATP867X_IO_MSTRPIOSPD(ap, port)	(0x08 + \
+					ATP867X_IO_DMABASE((ap), (port)))
+#define ATP867X_IO_SLAVPIOSPD(ap, port)	(0x09 + \
+					ATP867X_IO_DMABASE((ap), (port)))
+#define ATP867X_IO_8BPIOSPD(ap, port)	(0x0A + \
+					ATP867X_IO_DMABASE((ap), (port)))
+#define ATP867X_IO_DMAMODE(ap, port)	(0x0B + \
+					ATP867X_IO_DMABASE((ap), (port)))
+
+#define ATP867X_IO_PORTSPD(ap, port)	(0x4A + \
+					ATP867X_IO_PORTBASE((ap), (port)))
+#define ATP867X_IO_PREREAD(ap, port)	(0x4C + \
+					ATP867X_IO_PORTBASE((ap), (port)))
+
+struct atp867x_priv {
+	void __iomem *dma_mode;
+	void __iomem *mstr_piospd;
+	void __iomem *slave_piospd;
+	void __iomem *eightb_piospd;
+	int		pci66mhz;
+};
+
+static inline u8 atp867x_speed_to_mode(u8 speed)
+{
+	return speed - XFER_UDMA_0 + 1;
+}
+
+static void atp867x_set_dmamode(struct ata_port *ap, struct ata_device *adev)
+{
+	struct pci_dev *pdev	= to_pci_dev(ap->host->dev);
+	struct atp867x_priv *dp = ap->private_data;
+	u8 speed = adev->dma_mode;
+	u8 b;
+	u8 mode;
+
+	mode = atp867x_speed_to_mode(speed);
+
+	/*
+	 * Doc 6.6.9: decrease the udma mode value by 1 for safer UDMA speed
+	 * on 66MHz bus
+	 *   rev-A: UDMA_1~4 (5, 6 no change)
+	 *   rev-B: all UDMA modes
+	 *   UDMA_0 stays not to disable UDMA
+	 */
+	if (dp->pci66mhz && mode > ATP867X_IO_DMAMODE_UDMA_0  &&
+	   (pdev->device == PCI_DEVICE_ID_ARTOP_ATP867B ||
+	    mode < ATP867X_IO_DMAMODE_UDMA_5))
+		mode--;
+
+	b = ioread8(dp->dma_mode);
+	if (adev->devno & 1) {
+		b = (b & ~ATP867X_IO_DMAMODE_SLAVE_MASK) |
+			(mode << ATP867X_IO_DMAMODE_SLAVE_SHIFT);
+	} else {
+		b = (b & ~ATP867X_IO_DMAMODE_MSTR_MASK) |
+			(mode << ATP867X_IO_DMAMODE_MSTR_SHIFT);
+	}
+	iowrite8(b, dp->dma_mode);
+}
+
+static int atp867x_get_active_clocks_shifted(unsigned int clk)
+{
+	unsigned char clocks = clk;
+
+	switch (clocks) {
+	case 0:
+		clocks = 1;
+		break;
+	case 1 ... 7:
+		break;
+	case 8 ... 12:
+		clocks = 7;
+		break;
+	default:
+		printk(KERN_WARNING "ATP867X: active %dclk is invalid. "
+			"Using default 8clk.\n", clk);
+		clocks = 0;	/* 8 clk */
+		break;
+	}
+	return clocks << ATP867X_IO_PIOSPD_ACTIVE_SHIFT;
+}
+
+static int atp867x_get_recover_clocks_shifted(unsigned int clk)
+{
+	unsigned char clocks = clk;
+
+	switch (clocks) {
+	case 0:
+		clocks = 1;
+		break;
+	case 1 ... 11:
+		break;
+	case 12:
+		clocks = 0;
+		break;
+	case 13: case 14:
+		--clocks;
+		break;
+	case 15:
+		break;
+	default:
+		printk(KERN_WARNING "ATP867X: recover %dclk is invalid. "
+			"Using default 15clk.\n", clk);
+		clocks = 0;	/* 12 clk */
+		break;
+	}
+	return clocks << ATP867X_IO_PIOSPD_RECOVER_SHIFT;
+}
+
+static void atp867x_set_piomode(struct ata_port *ap, struct ata_device *adev)
+{
+	struct ata_device *peer = ata_dev_pair(adev);
+	struct atp867x_priv *dp = ap->private_data;
+	u8 speed = adev->pio_mode;
+	struct ata_timing t, p;
+	int T, UT;
+	u8 b;
+
+	T = 1000000000 / 33333;
+	UT = T / 4;
+
+	ata_timing_compute(adev, speed, &t, T, UT);
+	if (peer && peer->pio_mode) {
+		ata_timing_compute(peer, peer->pio_mode, &p, T, UT);
+		ata_timing_merge(&p, &t, &t, ATA_TIMING_8BIT);
+	}
+
+	b = ioread8(dp->dma_mode);
+	if (adev->devno & 1)
+		b = (b & ~ATP867X_IO_DMAMODE_SLAVE_MASK);
+	else
+		b = (b & ~ATP867X_IO_DMAMODE_MSTR_MASK);
+	iowrite8(b, dp->dma_mode);
+
+	b = atp867x_get_active_clocks_shifted(t.active) |
+		atp867x_get_recover_clocks_shifted(t.recover);
+	if (dp->pci66mhz)
+		b += 0x10;
+
+	if (adev->devno & 1)
+		iowrite8(b, dp->slave_piospd);
+	else
+		iowrite8(b, dp->mstr_piospd);
+
+	/*
+	 * use the same value for comand timing as for PIO timimg
+	 */
+	iowrite8(b, dp->eightb_piospd);
+}
+
+static int atp867x_cable_detect(struct ata_port *ap)
+{
+	return ATA_CBL_PATA40_SHORT;
+}
+
+static struct scsi_host_template atp867x_sht = {
+	ATA_BMDMA_SHT(DRV_NAME),
+};
+
+static struct ata_port_operations atp867x_ops = {
+	.inherits		= &ata_bmdma_port_ops,
+	.cable_detect		= atp867x_cable_detect,
+	.set_piomode		= atp867x_set_piomode,
+	.set_dmamode		= atp867x_set_dmamode,
+};
+
+
+#ifdef	ATP867X_DEBUG
+static void atp867x_check_res(struct pci_dev *pdev)
+{
+	int i;
+	unsigned long start, len;
+
+	/* Check the PCI resources for this channel are enabled */
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+		start = pci_resource_start(pdev, i);
+		len   = pci_resource_len(pdev, i);
+		printk(KERN_DEBUG "ATP867X: resource start:len=%lx:%lx\n",
+			start, len);
+	}
+}
+
+static void atp867x_check_ports(struct ata_port *ap, int port)
+{
+	struct ata_ioports *ioaddr = &ap->ioaddr;
+	struct atp867x_priv *dp = ap->private_data;
+
+	printk(KERN_DEBUG "ATP867X: port[%d] addresses\n"
+		"  cmd_addr	=0x%llx, 0x%llx\n"
+		"  ctl_addr	=0x%llx, 0x%llx\n"
+		"  bmdma_addr	=0x%llx, 0x%llx\n"
+		"  data_addr	=0x%llx\n"
+		"  error_addr	=0x%llx\n"
+		"  feature_addr	=0x%llx\n"
+		"  nsect_addr	=0x%llx\n"
+		"  lbal_addr	=0x%llx\n"
+		"  lbam_addr	=0x%llx\n"
+		"  lbah_addr	=0x%llx\n"
+		"  device_addr	=0x%llx\n"
+		"  status_addr	=0x%llx\n"
+		"  command_addr	=0x%llx\n"
+		"  dp->dma_mode	=0x%llx\n"
+		"  dp->mstr_piospd	=0x%llx\n"
+		"  dp->slave_piospd	=0x%llx\n"
+		"  dp->eightb_piospd	=0x%llx\n"
+		"  dp->pci66mhz		=0x%lx\n",
+		port,
+		(unsigned long long)ioaddr->cmd_addr,
+		(unsigned long long)ATP867X_IO_PORTBASE(ap, port),
+		(unsigned long long)ioaddr->ctl_addr,
+		(unsigned long long)ATP867X_IO_ALTSTATUS(ap, port),
+		(unsigned long long)ioaddr->bmdma_addr,
+		(unsigned long long)ATP867X_IO_DMABASE(ap, port),
+		(unsigned long long)ioaddr->data_addr,
+		(unsigned long long)ioaddr->error_addr,
+		(unsigned long long)ioaddr->feature_addr,
+		(unsigned long long)ioaddr->nsect_addr,
+		(unsigned long long)ioaddr->lbal_addr,
+		(unsigned long long)ioaddr->lbam_addr,
+		(unsigned long long)ioaddr->lbah_addr,
+		(unsigned long long)ioaddr->device_addr,
+		(unsigned long long)ioaddr->status_addr,
+		(unsigned long long)ioaddr->command_addr,
+		(unsigned long long)dp->dma_mode,
+		(unsigned long long)dp->mstr_piospd,
+		(unsigned long long)dp->slave_piospd,
+		(unsigned long long)dp->eightb_piospd,
+		(unsigned long)dp->pci66mhz);
+}
+#endif
+
+static int atp867x_set_priv(struct ata_port *ap)
+{
+	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+	struct atp867x_priv *dp;
+	int port = ap->port_no;
+
+	dp = ap->private_data =
+		devm_kzalloc(&pdev->dev, sizeof(*dp), GFP_KERNEL);
+	if (dp == NULL)
+		return -ENOMEM;
+
+	dp->dma_mode	 = ATP867X_IO_DMAMODE(ap, port);
+	dp->mstr_piospd	 = ATP867X_IO_MSTRPIOSPD(ap, port);
+	dp->slave_piospd = ATP867X_IO_SLAVPIOSPD(ap, port);
+	dp->eightb_piospd = ATP867X_IO_8BPIOSPD(ap, port);
+
+	dp->pci66mhz =
+		ioread8(ATP867X_SYS_INFO(ap)) & ATP867X_IO_SYS_INFO_66MHZ;
+
+	return 0;
+}
+
+static void atp867x_fixup(struct ata_host *host)
+{
+	struct pci_dev *pdev = to_pci_dev(host->dev);
+	struct ata_port *ap = host->ports[0];
+	int i;
+	u8 v;
+
+	/*
+	 * Broken BIOS might not set latency high enough
+	 */
+	pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &v);
+	if (v < 0x80) {
+		v = 0x80;
+		pci_write_config_byte(pdev, PCI_LATENCY_TIMER, v);
+		printk(KERN_DEBUG "ATP867X: set latency timer of device %s"
+			" to %d\n", pci_name(pdev), v);
+	}
+
+	/*
+	 * init 8bit io ports speed(0aaarrrr) to 43h and
+	 * init udma modes of master/slave to 0/0(11h)
+	 */
+	for (i = 0; i < ATP867X_NUM_PORTS; i++)
+		iowrite16(ATP867X_IO_PORTSPD_VAL, ATP867X_IO_PORTSPD(ap, i));
+
+	/*
+	 * init PreREAD counts
+	 */
+	for (i = 0; i < ATP867X_NUM_PORTS; i++)
+		iowrite16(ATP867X_PREREAD_VAL, ATP867X_IO_PREREAD(ap, i));
+
+	v = ioread8(ATP867X_IOBASE(ap) + 0x28);
+	v &= 0xcf;	/* Enable INTA#: bit4=0 means enable */
+	v |= 0xc0;	/* Enable PCI burst, MRM & not immediate interrupts */
+	iowrite8(v, ATP867X_IOBASE(ap) + 0x28);
+
+	/*
+	 * Turn off the over clocked udma5 mode, only for Rev-B
+	 */
+	v = ioread8(ATP867X_SYS_INFO(ap));
+	v &= ATP867X_IO_SYS_MASK_RESERVED;
+	if (pdev->device == PCI_DEVICE_ID_ARTOP_ATP867B)
+		v |= ATP867X_IO_SYS_INFO_SLOW_UDMA5;
+	iowrite8(v, ATP867X_SYS_INFO(ap));
+}
+
+static int atp867x_ata_pci_sff_init_host(struct ata_host *host)
+{
+	struct device *gdev = host->dev;
+	struct pci_dev *pdev = to_pci_dev(gdev);
+	unsigned int mask = 0;
+	int i, rc;
+
+	/*
+	 * do not map rombase
+	 */
+	rc = pcim_iomap_regions(pdev, 1 << ATP867X_BAR_IOBASE, DRV_NAME);
+	if (rc == -EBUSY)
+		pcim_pin_device(pdev);
+	if (rc)
+		return rc;
+	host->iomap = pcim_iomap_table(pdev);
+
+#ifdef	ATP867X_DEBUG
+	atp867x_check_res(pdev);
+
+	for (i = 0; i < PCI_ROM_RESOURCE; i++)
+		printk(KERN_DEBUG "ATP867X: iomap[%d]=0x%llx\n", i,
+			(unsigned long long)(host->iomap[i]));
+#endif
+
+	/*
+	 * request, iomap BARs and init port addresses accordingly
+	 */
+	for (i = 0; i < host->n_ports; i++) {
+		struct ata_port *ap = host->ports[i];
+		struct ata_ioports *ioaddr = &ap->ioaddr;
+
+		ioaddr->cmd_addr = ATP867X_IO_PORTBASE(ap, i);
+		ioaddr->ctl_addr = ioaddr->altstatus_addr
+				 = ATP867X_IO_ALTSTATUS(ap, i);
+		ioaddr->bmdma_addr = ATP867X_IO_DMABASE(ap, i);
+
+		ata_sff_std_ports(ioaddr);
+		rc = atp867x_set_priv(ap);
+		if (rc)
+			return rc;
+
+#ifdef	ATP867X_DEBUG
+		atp867x_check_ports(ap, i);
+#endif
+		ata_port_desc(ap, "cmd 0x%lx ctl 0x%lx",
+			(unsigned long)ioaddr->cmd_addr,
+			(unsigned long)ioaddr->ctl_addr);
+		ata_port_desc(ap, "bmdma 0x%lx",
+			(unsigned long)ioaddr->bmdma_addr);
+
+		mask |= 1 << i;
+	}
+
+	if (!mask) {
+		dev_printk(KERN_ERR, gdev, "no available native port\n");
+		return -ENODEV;
+	}
+
+	atp867x_fixup(host);
+
+	rc = pci_set_dma_mask(pdev, ATA_DMA_MASK);
+	if (rc)
+		return rc;
+
+	rc = pci_set_consistent_dma_mask(pdev, ATA_DMA_MASK);
+	return rc;
+}
+
+static int atp867x_init_one(struct pci_dev *pdev,
+	const struct pci_device_id *id)
+{
+	static int printed_version;
+	static const struct ata_port_info info_867x = {
+		.flags		= ATA_FLAG_SLAVE_POSS,
+		.pio_mask	= ATA_PIO4,
+		.mwdma_mask	= ATA_MWDMA2,
+		.udma_mask 	= ATA_UDMA6,
+		.port_ops	= &atp867x_ops,
+	};
+
+	struct ata_host *host;
+	const struct ata_port_info *ppi[] = { &info_867x, NULL };
+	int rc;
+
+	if (!printed_version++)
+		dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n");
+
+	rc = pcim_enable_device(pdev);
+	if (rc)
+		return rc;
+
+	printk(KERN_INFO "ATP867X: ATP867 ATA UDMA133 controller (rev %02X)",
+		pdev->device);
+
+	host = ata_host_alloc_pinfo(&pdev->dev, ppi, ATP867X_NUM_PORTS);
+	if (!host) {
+		dev_printk(KERN_ERR, &pdev->dev,
+			"failed to allocate ATA host\n");
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	rc = atp867x_ata_pci_sff_init_host(host);
+	if (rc) {
+		dev_printk(KERN_ERR, &pdev->dev, "failed to init host\n");
+		goto err_out;
+	}
+
+	pci_set_master(pdev);
+
+	rc = ata_host_activate(host, pdev->irq, ata_sff_interrupt,
+				IRQF_SHARED, &atp867x_sht);
+	if (rc)
+		dev_printk(KERN_ERR, &pdev->dev, "failed to activate host\n");
+
+err_out:
+	return rc;
+}
+
+static struct pci_device_id atp867x_pci_tbl[] = {
+	{ PCI_VDEVICE(ARTOP, PCI_DEVICE_ID_ARTOP_ATP867A),	0 },
+	{ PCI_VDEVICE(ARTOP, PCI_DEVICE_ID_ARTOP_ATP867B),	0 },
+	{ },
+};
+
+static struct pci_driver atp867x_driver = {
+	.name 		= DRV_NAME,
+	.id_table 	= atp867x_pci_tbl,
+	.probe 		= atp867x_init_one,
+	.remove		= ata_pci_remove_one,
+};
+
+static int __init atp867x_init(void)
+{
+	return pci_register_driver(&atp867x_driver);
+}
+
+static void __exit atp867x_exit(void)
+{
+	pci_unregister_driver(&atp867x_driver);
+}
+
+MODULE_AUTHOR("John(Jung-Ik) Lee, Google Inc.");
+MODULE_DESCRIPTION("low level driver for Artop/Acard 867x ATA controller");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(pci, atp867x_pci_tbl);
+MODULE_VERSION(DRV_VERSION);
+
+module_init(atp867x_init);
+module_exit(atp867x_exit);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 8975add8668f..3b6b788fe2b5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1529,6 +1529,8 @@
 #define PCI_DEVICE_ID_ARTOP_ATP860R	0x0007
 #define PCI_DEVICE_ID_ARTOP_ATP865	0x0008
 #define PCI_DEVICE_ID_ARTOP_ATP865R	0x0009
+#define PCI_DEVICE_ID_ARTOP_ATP867A	0x000A
+#define PCI_DEVICE_ID_ARTOP_ATP867B	0x000B
 #define PCI_DEVICE_ID_ARTOP_AEC7610	0x8002
 #define PCI_DEVICE_ID_ARTOP_AEC7612UW	0x8010
 #define PCI_DEVICE_ID_ARTOP_AEC7612U	0x8020
-- 
cgit v1.2.3-71-gd317


From c3422bea5f09b0e85704f51f2b01271630b8940b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 13 Sep 2009 09:15:10 -0700
Subject: rcu: Simplify rcu_read_unlock_special() quiescent-state accounting

The earlier approach required two scheduling-clock ticks to note an
preemptable-RCU quiescent state in the situation in which the
scheduling-clock interrupt is unlucky enough to always interrupt an
RCU read-side critical section.

With this change, the quiescent state is instead noted by the
outermost rcu_read_unlock() immediately following the first
scheduling-clock tick, or, alternatively, by the first subsequent
context switch.  Therefore, this change also speeds up grace
periods.

Suggested-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <12528585111945-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  1 -
 kernel/rcutree.c        | 15 ++++++--------
 kernel/rcutree_plugin.h | 54 ++++++++++++++++++++++++-------------------------
 3 files changed, 32 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3d74bd04d18..c62a9f84d614 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1740,7 +1740,6 @@ extern cputime_t task_gtime(struct task_struct *p);
 
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
-#define RCU_READ_UNLOCK_GOT_QS  (1 << 2) /* CPU has responded to RCU core. */
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e9a4ae94647f..6c99553e9f15 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -107,27 +107,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
  */
 void rcu_sched_qs(int cpu)
 {
-	unsigned long flags;
 	struct rcu_data *rdp;
 
-	local_irq_save(flags);
 	rdp = &per_cpu(rcu_sched_data, cpu);
-	rdp->passed_quiesc = 1;
 	rdp->passed_quiesc_completed = rdp->completed;
-	rcu_preempt_qs(cpu);
-	local_irq_restore(flags);
+	barrier();
+	rdp->passed_quiesc = 1;
+	rcu_preempt_note_context_switch(cpu);
 }
 
 void rcu_bh_qs(int cpu)
 {
-	unsigned long flags;
 	struct rcu_data *rdp;
 
-	local_irq_save(flags);
 	rdp = &per_cpu(rcu_bh_data, cpu);
-	rdp->passed_quiesc = 1;
 	rdp->passed_quiesc_completed = rdp->completed;
-	local_irq_restore(flags);
+	barrier();
+	rdp->passed_quiesc = 1;
 }
 
 #ifdef CONFIG_NO_HZ
@@ -615,6 +611,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 
 	/* Advance to a new grace period and initialize state. */
 	rsp->gpnum++;
+	WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
 	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
 	record_gp_stall_check_time(rsp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index b8e4b0384f00..c9616e48379b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -64,34 +64,42 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
  * not in a quiescent state.  There might be any number of tasks blocked
  * while in an RCU read-side critical section.
  */
-static void rcu_preempt_qs_record(int cpu)
+static void rcu_preempt_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-	rdp->passed_quiesc = 1;
 	rdp->passed_quiesc_completed = rdp->completed;
+	barrier();
+	rdp->passed_quiesc = 1;
 }
 
 /*
- * We have entered the scheduler or are between softirqs in ksoftirqd.
- * If we are in an RCU read-side critical section, we need to reflect
- * that in the state of the rcu_node structure corresponding to this CPU.
- * Caller must disable hardirqs.
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from.  If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the appropriate entry
+ * of the blocked_tasks[] array.  The task will dequeue itself when
+ * it exits the outermost enclosing RCU read-side critical section.
+ * Therefore, the current grace period cannot be permitted to complete
+ * until the blocked_tasks[] entry indexed by the low-order bit of
+ * rnp->gpnum empties.
+ *
+ * Caller must disable preemption.
  */
-static void rcu_preempt_qs(int cpu)
+static void rcu_preempt_note_context_switch(int cpu)
 {
 	struct task_struct *t = current;
+	unsigned long flags;
 	int phase;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
 	if (t->rcu_read_lock_nesting &&
 	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
-		WARN_ON_ONCE(cpu != smp_processor_id());
 
 		/* Possibly blocking in an RCU read-side critical section. */
 		rdp = rcu_preempt_state.rda[cpu];
 		rnp = rdp->mynode;
-		spin_lock(&rnp->lock);
+		spin_lock_irqsave(&rnp->lock, flags);
 		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
 		t->rcu_blocked_node = rnp;
 
@@ -112,7 +120,7 @@ static void rcu_preempt_qs(int cpu)
 		phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
 		list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
 		smp_mb();  /* Ensure later ctxt swtch seen after above. */
-		spin_unlock(&rnp->lock);
+		spin_unlock_irqrestore(&rnp->lock, flags);
 	}
 
 	/*
@@ -124,9 +132,8 @@ static void rcu_preempt_qs(int cpu)
 	 * grace period, then the fact that the task has been enqueued
 	 * means that we continue to block the current grace period.
 	 */
-	rcu_preempt_qs_record(cpu);
-	t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
-					RCU_READ_UNLOCK_GOT_QS);
+	rcu_preempt_qs(cpu);
+	t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 }
 
 /*
@@ -162,7 +169,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
 	special = t->rcu_read_unlock_special;
 	if (special & RCU_READ_UNLOCK_NEED_QS) {
 		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
-		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
+		rcu_preempt_qs(smp_processor_id());
 	}
 
 	/* Hardware IRQ handlers cannot block. */
@@ -199,9 +206,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		 */
 		if (!empty && rnp->qsmask == 0 &&
 		    list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
-			t->rcu_read_unlock_special &=
-				~(RCU_READ_UNLOCK_NEED_QS |
-				  RCU_READ_UNLOCK_GOT_QS);
+			t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 			if (rnp->parent == NULL) {
 				/* Only one rcu_node in the tree. */
 				cpu_quiet_msk_finish(&rcu_preempt_state, flags);
@@ -352,19 +357,12 @@ static void rcu_preempt_check_callbacks(int cpu)
 	struct task_struct *t = current;
 
 	if (t->rcu_read_lock_nesting == 0) {
-		t->rcu_read_unlock_special &=
-			~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
-		rcu_preempt_qs_record(cpu);
+		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+		rcu_preempt_qs(cpu);
 		return;
 	}
 	if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
-		if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
-			rcu_preempt_qs_record(cpu);
-			t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
-		} else if (!(t->rcu_read_unlock_special &
-			     RCU_READ_UNLOCK_NEED_QS)) {
-			t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
-		}
+		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
 	}
 }
 
@@ -451,7 +449,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
  * Because preemptable RCU does not exist, we never have to check for
  * CPUs being in quiescent states.
  */
-static void rcu_preempt_qs(int cpu)
+static void rcu_preempt_note_context_switch(int cpu)
 {
 }
 
-- 
cgit v1.2.3-71-gd317


From 16e3081191837a6a04733de5cd5d1d1b303140d4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 13 Sep 2009 09:15:11 -0700
Subject: rcu: Fix synchronize_rcu() for TREE_PREEMPT_RCU

The redirection of synchronize_sched() to synchronize_rcu() was
appropriate for TREE_RCU, but not for TREE_PREEMPT_RCU.

Fix this by creating an underlying synchronize_sched().  TREE_RCU
then redirects synchronize_rcu() to synchronize_sched(), while
TREE_PREEMPT_RCU has its own version of synchronize_rcu().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <12528585111916-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcupdate.h | 23 +++++------------------
 include/linux/rcutree.h  |  4 ++--
 kernel/rcupdate.c        | 44 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 50 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 95e0615f4d75..39dce83c4865 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,8 +52,13 @@ struct rcu_head {
 };
 
 /* Exported common interfaces */
+#ifdef CONFIG_TREE_PREEMPT_RCU
 extern void synchronize_rcu(void);
+#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#define synchronize_rcu synchronize_sched
+#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 extern void synchronize_rcu_bh(void);
+extern void synchronize_sched(void);
 extern void rcu_barrier(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
@@ -261,24 +266,6 @@ struct rcu_synchronize {
 
 extern void wakeme_after_rcu(struct rcu_head  *head);
 
-/**
- * synchronize_sched - block until all CPUs have exited any non-preemptive
- * kernel code sequences.
- *
- * This means that all preempt_disable code sequences, including NMI and
- * hardware-interrupt handlers, in progress on entry will have completed
- * before this primitive returns.  However, this does not guarantee that
- * softirq handlers will have completed, since in some kernels, these
- * handlers can run in process context, and can block.
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API.  In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
- */
-#define synchronize_sched() __synchronize_sched()
-
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index a89307717825..00d08c0cbcc1 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -53,6 +53,8 @@ static inline void __rcu_read_unlock(void)
 	preempt_enable();
 }
 
+#define __synchronize_sched() synchronize_rcu()
+
 static inline void exit_rcu(void)
 {
 }
@@ -68,8 +70,6 @@ static inline void __rcu_read_unlock_bh(void)
 	local_bh_enable();
 }
 
-#define __synchronize_sched() synchronize_rcu()
-
 extern void call_rcu_sched(struct rcu_head *head,
 			   void (*func)(struct rcu_head *rcu));
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index bd5d5c8e5140..28d2f24e7871 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head  *head)
 	complete(&rcu->completion);
 }
 
+#ifdef CONFIG_TREE_PREEMPT_RCU
+
 /**
  * synchronize_rcu - wait until a grace period has elapsed.
  *
@@ -87,7 +89,7 @@ void synchronize_rcu(void)
 {
 	struct rcu_synchronize rcu;
 
-	if (rcu_blocking_is_gp())
+	if (!rcu_scheduler_active)
 		return;
 
 	init_completion(&rcu.completion);
@@ -98,6 +100,46 @@ void synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+/**
+ * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-sched
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-sched read-side critical sections have completed.   These read-side
+ * critical sections are delimited by rcu_read_lock_sched() and
+ * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
+ * local_irq_disable(), and so on may be used in place of
+ * rcu_read_lock_sched().
+ *
+ * This means that all preempt_disable code sequences, including NMI and
+ * hardware-interrupt handlers, in progress on entry will have completed
+ * before this primitive returns.  However, this does not guarantee that
+ * softirq handlers will have completed, since in some kernels, these
+ * handlers can run in process context, and can block.
+ *
+ * This primitive provides the guarantees made by the (now removed)
+ * synchronize_kernel() API.  In contrast, synchronize_rcu() only
+ * guarantees that rcu_read_lock() sections will have completed.
+ * In "classic RCU", these two guarantees happen to be one and
+ * the same, but can differ in realtime RCU implementations.
+ */
+void synchronize_sched(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_sched(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
 /**
  * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
  *
-- 
cgit v1.2.3-71-gd317


From e5e25cf47b0bdd1f7e9b8bb6368ee48e16de0c87 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 18 Sep 2009 00:54:43 +0200
Subject: tracing: Factorize the events profile accounting

Factorize the events enabling accounting in a common tracing core
helper. This reduces the size of the profile_enable() and
profile_disable() callbacks for each trace events.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  4 ++--
 include/linux/syscalls.h           | 24 ++++++++----------------
 include/trace/ftrace.h             | 28 ++++++++--------------------
 kernel/trace/trace_event_profile.c | 20 ++++++++++++++++++--
 4 files changed, 36 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bd099ba82ccc..bc103d7b1ca8 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -130,8 +130,8 @@ struct ftrace_event_call {
 	void			*data;
 
 	atomic_t		profile_count;
-	int			(*profile_enable)(struct ftrace_event_call *);
-	void			(*profile_disable)(struct ftrace_event_call *);
+	int			(*profile_enable)(void);
+	void			(*profile_disable)(void);
 };
 
 #define MAX_FILTER_PRED		32
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a8e37821cc60..7d9803cbb20f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -100,33 +100,25 @@ struct perf_counter_attr;
 
 #ifdef CONFIG_EVENT_PROFILE
 #define TRACE_SYS_ENTER_PROFILE(sname)					       \
-static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call)  \
+static int prof_sysenter_enable_##sname(void)				       \
 {									       \
-	int ret = 0;							       \
-	if (!atomic_inc_return(&event_enter_##sname.profile_count))	       \
-		ret = reg_prof_syscall_enter("sys"#sname);		       \
-	return ret;							       \
+	return reg_prof_syscall_enter("sys"#sname);			       \
 }									       \
 									       \
-static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\
+static void prof_sysenter_disable_##sname(void)				       \
 {									       \
-	if (atomic_add_negative(-1, &event_enter_##sname.profile_count))       \
-		unreg_prof_syscall_enter("sys"#sname);			       \
+	unreg_prof_syscall_enter("sys"#sname);				       \
 }
 
 #define TRACE_SYS_EXIT_PROFILE(sname)					       \
-static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call)   \
+static int prof_sysexit_enable_##sname(void)				       \
 {									       \
-	int ret = 0;							       \
-	if (!atomic_inc_return(&event_exit_##sname.profile_count))	       \
-		ret = reg_prof_syscall_exit("sys"#sname);		       \
-	return ret;							       \
+	return reg_prof_syscall_exit("sys"#sname);			       \
 }									       \
 									       \
-static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
+static void prof_sysexit_disable_##sname(void)				       \
 {                                                                              \
-	if (atomic_add_negative(-1, &event_exit_##sname.profile_count))	       \
-		unreg_prof_syscall_exit("sys"#sname);			       \
+	unreg_prof_syscall_exit("sys"#sname);				       \
 }
 
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)				       \
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 72a3b437b829..a822087857e9 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -382,20 +382,14 @@ static inline int ftrace_get_offsets_##call(				\
  *
  * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
- * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
+ * static int ftrace_profile_enable_<call>(void)
  * {
- * 	int ret = 0;
- *
- * 	if (!atomic_inc_return(&event_call->profile_count))
- * 		ret = register_trace_<call>(ftrace_profile_<call>);
- *
- * 	return ret;
+ * 	return register_trace_<call>(ftrace_profile_<call>);
  * }
  *
- * static void ftrace_profile_disable_<call>(struct ftrace_event_call *event_call)
+ * static void ftrace_profile_disable_<call>(void)
  * {
- * 	if (atomic_add_negative(-1, &event->call->profile_count))
- * 		unregister_trace_<call>(ftrace_profile_<call>);
+ * 	unregister_trace_<call>(ftrace_profile_<call>);
  * }
  *
  */
@@ -405,20 +399,14 @@ static inline int ftrace_get_offsets_##call(				\
 									\
 static void ftrace_profile_##call(proto);				\
 									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
+static int ftrace_profile_enable_##call(void)				\
 {									\
-	int ret = 0;							\
-									\
-	if (!atomic_inc_return(&event_call->profile_count))		\
-		ret = register_trace_##call(ftrace_profile_##call);	\
-									\
-	return ret;							\
+	return register_trace_##call(ftrace_profile_##call);		\
 }									\
 									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
+static void ftrace_profile_disable_##call(void)				\
 {									\
-	if (atomic_add_negative(-1, &event_call->profile_count))	\
-		unregister_trace_##call(ftrace_profile_##call);		\
+	unregister_trace_##call(ftrace_profile_##call);			\
 }
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 55a25c933d15..df4a74efd50c 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,6 +8,14 @@
 #include <linux/module.h>
 #include "trace.h"
 
+static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+{
+	if (atomic_inc_return(&event->profile_count))
+		return 0;
+
+	return event->profile_enable();
+}
+
 int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
@@ -17,7 +25,7 @@ int ftrace_profile_enable(int event_id)
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id && event->profile_enable &&
 		    try_module_get(event->mod)) {
-			ret = event->profile_enable(event);
+			ret = ftrace_profile_enable_event(event);
 			break;
 		}
 	}
@@ -26,6 +34,14 @@ int ftrace_profile_enable(int event_id)
 	return ret;
 }
 
+static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+{
+	if (!atomic_add_negative(-1, &event->profile_count))
+		return;
+
+	event->profile_disable();
+}
+
 void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
@@ -33,7 +49,7 @@ void ftrace_profile_disable(int event_id)
 	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id) {
-			event->profile_disable(event);
+			ftrace_profile_disable_event(event);
 			module_put(event->mod);
 			break;
 		}
-- 
cgit v1.2.3-71-gd317


From 20ab4425a77a1f34028cc6ce57053c22c184ba5f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 18 Sep 2009 06:10:28 +0200
Subject: tracing: Allocate the ftrace event profile buffer dynamically

Currently the trace event profile buffer is allocated in the stack. But
this may be too much for the stack, as the events can have large
statically defined field size and can also grow with dynamic arrays.

Allocate two per cpu buffer for all profiled events. The first cpu
buffer is used to host every non-nmi context traces. It is protected
by disabling the interrupts while writing and committing the trace.

The second buffer is reserved for nmi. So that there is no race between
them and the first buffer.

The whole write/commit section is rcu protected because we release
these buffers while deactivating the last profiling trace event.

v2: Move the buffers from trace_event to be global, as pointed by
    Steven Rostedt.

v3: Fix the syscall events to handle the profiling buffer races
    by disabling interrupts, now that the buffers are globals.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  6 +++
 include/trace/ftrace.h             | 83 +++++++++++++++++++++-----------
 kernel/trace/trace_event_profile.c | 61 +++++++++++++++++++++++-
 kernel/trace/trace_syscalls.c      | 97 ++++++++++++++++++++++++++++++--------
 4 files changed, 199 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bc103d7b1ca8..4ec5e67e18cf 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -4,6 +4,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/trace_seq.h>
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 struct trace_array;
 struct tracer;
@@ -134,6 +135,11 @@ struct ftrace_event_call {
 	void			(*profile_disable)(void);
 };
 
+#define FTRACE_MAX_PROFILE_SIZE	2048
+
+extern char			*trace_profile_buf;
+extern char			*trace_profile_buf_nmi;
+
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a822087857e9..a0361cb69769 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -648,11 +648,12 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *	struct ftrace_raw_##call *entry;
  *	u64 __addr = 0, __count = 1;
  *	unsigned long irq_flags;
+ *	struct trace_entry *ent;
  *	int __entry_size;
  *	int __data_size;
+ *	int __cpu
  *	int pc;
  *
- *	local_save_flags(irq_flags);
  *	pc = preempt_count();
  *
  *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
@@ -663,25 +664,34 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *			     sizeof(u64));
  *	__entry_size -= sizeof(u32);
  *
- *	do {
- *		char raw_data[__entry_size]; <- allocate our sample in the stack
- *		struct trace_entry *ent;
+ *	// Protect the non nmi buffer
+ *	// This also protects the rcu read side
+ *	local_irq_save(irq_flags);
+ *	__cpu = smp_processor_id();
+ *
+ *	if (in_nmi())
+ *		raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *	else
+ *		raw_data = rcu_dereference(trace_profile_buf);
+ *
+ *	if (!raw_data)
+ *		goto end;
  *
- *		zero dead bytes from alignment to avoid stack leak to userspace:
+ *	raw_data = per_cpu_ptr(raw_data, __cpu);
  *
- *		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
- *		entry = (struct ftrace_raw_<call> *)raw_data;
- *		ent = &entry->ent;
- *		tracing_generic_entry_update(ent, irq_flags, pc);
- *		ent->type = event_call->id;
+ *	//zero dead bytes from alignment to avoid stack leak to userspace:
+ *	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
+ *	entry = (struct ftrace_raw_<call> *)raw_data;
+ *	ent = &entry->ent;
+ *	tracing_generic_entry_update(ent, irq_flags, pc);
+ *	ent->type = event_call->id;
  *
- *		<tstruct> <- do some jobs with dynamic arrays
+ *	<tstruct> <- do some jobs with dynamic arrays
  *
- *		<assign>  <- affect our values
+ *	<assign>  <- affect our values
  *
- *		perf_tpcounter_event(event_call->id, __addr, __count, entry,
- *			     __entry_size);  <- submit them to perf counter
- *	} while (0);
+ *	perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *		     __entry_size);  <- submit them to perf counter
  *
  * }
  */
@@ -704,11 +714,13 @@ static void ftrace_profile_##call(proto)				\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
+	struct trace_entry *ent;					\
 	int __entry_size;						\
 	int __data_size;						\
+	char *raw_data;							\
+	int __cpu;							\
 	int pc;								\
 									\
-	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
@@ -716,23 +728,38 @@ static void ftrace_profile_##call(proto)				\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	do {								\
-		char raw_data[__entry_size];				\
-		struct trace_entry *ent;				\
+	if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE,		\
+		      "profile buffer not large enough"))		\
+		return;							\
+									\
+	local_irq_save(irq_flags);					\
+	__cpu = smp_processor_id();					\
 									\
-		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;	\
-		entry = (struct ftrace_raw_##call *)raw_data;		\
-		ent = &entry->ent;					\
-		tracing_generic_entry_update(ent, irq_flags, pc);	\
-		ent->type = event_call->id;				\
+	if (in_nmi())							\
+		raw_data = rcu_dereference(trace_profile_buf_nmi);		\
+	else								\
+		raw_data = rcu_dereference(trace_profile_buf);		\
 									\
-		tstruct							\
+	if (!raw_data)							\
+		goto end;						\
 									\
-		{ assign; }						\
+	raw_data = per_cpu_ptr(raw_data, __cpu);			\
 									\
-		perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;		\
+	entry = (struct ftrace_raw_##call *)raw_data;			\
+	ent = &entry->ent;						\
+	tracing_generic_entry_update(ent, irq_flags, pc);		\
+	ent->type = event_call->id;					\
+									\
+	tstruct								\
+									\
+	{ assign; }							\
+									\
+	perf_tpcounter_event(event_call->id, __addr, __count, entry,	\
 			     __entry_size);				\
-	} while (0);							\
+									\
+end:									\
+	local_irq_restore(irq_flags);					\
 									\
 }
 
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index df4a74efd50c..3aaa77c3309b 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,12 +8,52 @@
 #include <linux/module.h>
 #include "trace.h"
 
+/*
+ * We can't use a size but a type in alloc_percpu()
+ * So let's create a dummy type that matches the desired size
+ */
+typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
+
+char		*trace_profile_buf;
+char 		*trace_profile_buf_nmi;
+
+/* Count the events in use (per event id, not per instance) */
+static int	total_profile_count;
+
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
+	char *buf;
+	int ret = -ENOMEM;
+
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
-	return event->profile_enable();
+	if (!total_profile_count++) {
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf;
+
+		rcu_assign_pointer(trace_profile_buf, buf);
+
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf_nmi;
+
+		rcu_assign_pointer(trace_profile_buf_nmi, buf);
+	}
+
+	ret = event->profile_enable();
+	if (!ret)
+		return 0;
+
+	kfree(trace_profile_buf_nmi);
+fail_buf_nmi:
+	kfree(trace_profile_buf);
+fail_buf:
+	total_profile_count--;
+	atomic_dec(&event->profile_count);
+
+	return ret;
 }
 
 int ftrace_profile_enable(int event_id)
@@ -36,10 +76,29 @@ int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
+	char *buf, *nmi_buf;
+
 	if (!atomic_add_negative(-1, &event->profile_count))
 		return;
 
 	event->profile_disable();
+
+	if (!--total_profile_count) {
+		buf = trace_profile_buf;
+		rcu_assign_pointer(trace_profile_buf, NULL);
+
+		nmi_buf = trace_profile_buf_nmi;
+		rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+
+		/*
+		 * Ensure every events in profiling have finished before
+		 * releasing the buffers
+		 */
+		synchronize_sched();
+
+		free_percpu(buf);
+		free_percpu(nmi_buf);
+	}
 }
 
 void ftrace_profile_disable(int event_id)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..7a3550cf2597 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
 
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
-	struct syscall_trace_enter *rec;
 	struct syscall_metadata *sys_data;
+	struct syscall_trace_enter *rec;
+	unsigned long flags;
+	char *raw_data;
 	int syscall_nr;
 	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	do {
-		char raw_data[size];
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		      "profile buffer not large enough"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
 
-		/* zero the dead bytes from align to not leak stack to user */
-		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
 
-		rec = (struct syscall_trace_enter *) raw_data;
-		tracing_generic_entry_update(&rec->ent, 0, 0);
-		rec->ent.type = sys_data->enter_id;
-		rec->nr = syscall_nr;
-		syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-				       (unsigned long *)&rec->args);
-		perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
-	} while(0);
+	raw_data = per_cpu_ptr(raw_data, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	rec = (struct syscall_trace_enter *) raw_data;
+	tracing_generic_entry_update(&rec->ent, 0, 0);
+	rec->ent.type = sys_data->enter_id;
+	rec->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+			       (unsigned long *)&rec->args);
+	perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
 static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
-	struct syscall_trace_exit rec;
+	struct syscall_trace_exit *rec;
+	unsigned long flags;
 	int syscall_nr;
+	char *raw_data;
+	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	tracing_generic_entry_update(&rec.ent, 0, 0);
-	rec.ent.type = sys_data->exit_id;
-	rec.nr = syscall_nr;
-	rec.ret = syscall_get_return_value(current, regs);
+	/* We can probably do that at build time */
+	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
 
-	perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+	/*
+	 * Impossible, but be paranoid with the future
+	 * How to put this check outside runtime?
+	 */
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		"exit event has grown above profile buffer size"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
+
+	raw_data = per_cpu_ptr(raw_data, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	rec = (struct syscall_trace_exit *)raw_data;
+
+	tracing_generic_entry_update(&rec->ent, 0, 0);
+	rec->ent.type = sys_data->exit_id;
+	rec->nr = syscall_nr;
+	rec->ret = syscall_get_return_value(current, regs);
+
+	perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_exit(char *name)
-- 
cgit v1.2.3-71-gd317


From 181d683d752c432635eda0f182ee71548c1f1820 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 16 Sep 2009 01:06:43 -0700
Subject: Input: libps2 - additional locking for i8042 ports

The serio ports on i8042 are not completely isolated; while we provide
enough locking to ensure proper serialization when accessing control
and data registers AUX and KBD ports can still have an effect on each
other on PS/2 protocol level. The most prominent effect is that
issuing a command for the device connected to one port may cause
abort of the command currently executing by the device connected to
another port.

Since i8042 nor serio subsystem are not aware of the details of the
PS/2 protocol (length of the commands and their replies and so on) the
locking should be done on libps2 level by adding special handling when
we see that we are dealing with serio port on i8042.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/mouse/sentelic.c  | 18 ++++++++++--------
 drivers/input/serio/i8042.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 drivers/input/serio/libps2.c    | 28 ++++++++++++++++++++++++----
 drivers/leds/leds-clevo-mail.c  |  8 ++++++++
 drivers/platform/x86/acer-wmi.c |  2 ++
 include/linux/i8042.h           | 30 ++++++++++++++++++++++++++++++
 include/linux/libps2.h          |  2 ++
 7 files changed, 117 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/mouse/sentelic.c b/drivers/input/mouse/sentelic.c
index 84e2fc04d11b..f84cbd97c884 100644
--- a/drivers/input/mouse/sentelic.c
+++ b/drivers/input/mouse/sentelic.c
@@ -92,7 +92,8 @@ static int fsp_reg_read(struct psmouse *psmouse, int reg_addr, int *reg_val)
 	 */
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE);
 	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
-	mutex_lock(&ps2dev->cmd_mutex);
+
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -126,7 +127,7 @@ static int fsp_reg_read(struct psmouse *psmouse, int reg_addr, int *reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE);
 	psmouse_set_state(psmouse, PSMOUSE_ACTIVATED);
 	dev_dbg(&ps2dev->serio->dev, "READ REG: 0x%02x is 0x%02x (rc = %d)\n",
@@ -140,7 +141,7 @@ static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val)
 	unsigned char v;
 	int rc = -1;
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -179,7 +180,7 @@ static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	dev_dbg(&ps2dev->serio->dev, "WRITE REG: 0x%02x to 0x%02x (rc = %d)\n",
 		reg_addr, reg_val, rc);
 	return rc;
@@ -214,7 +215,8 @@ static int fsp_page_reg_read(struct psmouse *psmouse, int *reg_val)
 
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE);
 	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
-	mutex_lock(&ps2dev->cmd_mutex);
+
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -236,7 +238,7 @@ static int fsp_page_reg_read(struct psmouse *psmouse, int *reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE);
 	psmouse_set_state(psmouse, PSMOUSE_ACTIVATED);
 	dev_dbg(&ps2dev->serio->dev, "READ PAGE REG: 0x%02x (rc = %d)\n",
@@ -250,7 +252,7 @@ static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val)
 	unsigned char v;
 	int rc = -1;
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -275,7 +277,7 @@ static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	dev_dbg(&ps2dev->serio->dev, "WRITE PAGE REG: to 0x%02x (rc = %d)\n",
 		reg_val, rc);
 	return rc;
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index eb3ff94af58c..bc56e52b945f 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -87,8 +87,22 @@ static bool i8042_bypass_aux_irq_test;
 
 #include "i8042.h"
 
+/*
+ * i8042_lock protects serialization between i8042_command and
+ * the interrupt handler.
+ */
 static DEFINE_SPINLOCK(i8042_lock);
 
+/*
+ * Writers to AUX and KBD ports as well as users issuing i8042_command
+ * directly should acquire i8042_mutex (by means of calling
+ * i8042_lock_chip() and i8042_unlock_ship() helpers) to ensure that
+ * they do not disturb each other (unfortunately in many i8042
+ * implementations write to one of the ports will immediately abort
+ * command that is being processed by another port).
+ */
+static DEFINE_MUTEX(i8042_mutex);
+
 struct i8042_port {
 	struct serio *serio;
 	int irq;
@@ -113,6 +127,18 @@ static struct platform_device *i8042_platform_device;
 
 static irqreturn_t i8042_interrupt(int irq, void *dev_id);
 
+void i8042_lock_chip(void)
+{
+	mutex_lock(&i8042_mutex);
+}
+EXPORT_SYMBOL(i8042_lock_chip);
+
+void i8042_unlock_chip(void)
+{
+	mutex_unlock(&i8042_mutex);
+}
+EXPORT_SYMBOL(i8042_unlock_chip);
+
 /*
  * The i8042_wait_read() and i8042_wait_write functions wait for the i8042 to
  * be ready for reading values from it / writing values to it.
@@ -1161,6 +1187,21 @@ static void __devexit i8042_unregister_ports(void)
 	}
 }
 
+/*
+ * Checks whether port belongs to i8042 controller.
+ */
+bool i8042_check_port_owner(const struct serio *port)
+{
+	int i;
+
+	for (i = 0; i < I8042_NUM_PORTS; i++)
+		if (i8042_ports[i].serio == port)
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL(i8042_check_port_owner);
+
 static void i8042_free_irqs(void)
 {
 	if (i8042_aux_irq_registered)
diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c
index 3a95b508bf27..769ba65a585a 100644
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -17,6 +17,7 @@
 #include <linux/interrupt.h>
 #include <linux/input.h>
 #include <linux/serio.h>
+#include <linux/i8042.h>
 #include <linux/init.h>
 #include <linux/libps2.h>
 
@@ -54,6 +55,24 @@ int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout)
 }
 EXPORT_SYMBOL(ps2_sendbyte);
 
+void ps2_begin_command(struct ps2dev *ps2dev)
+{
+	mutex_lock(&ps2dev->cmd_mutex);
+
+	if (i8042_check_port_owner(ps2dev->serio))
+		i8042_lock_chip();
+}
+EXPORT_SYMBOL(ps2_begin_command);
+
+void ps2_end_command(struct ps2dev *ps2dev)
+{
+	if (i8042_check_port_owner(ps2dev->serio))
+		i8042_unlock_chip();
+
+	mutex_unlock(&ps2dev->cmd_mutex);
+}
+EXPORT_SYMBOL(ps2_end_command);
+
 /*
  * ps2_drain() waits for device to transmit requested number of bytes
  * and discards them.
@@ -66,7 +85,7 @@ void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout)
 		maxbytes = sizeof(ps2dev->cmdbuf);
 	}
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 
 	serio_pause_rx(ps2dev->serio);
 	ps2dev->flags = PS2_FLAG_CMD;
@@ -76,7 +95,8 @@ void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout)
 	wait_event_timeout(ps2dev->wait,
 			   !(ps2dev->flags & PS2_FLAG_CMD),
 			   msecs_to_jiffies(timeout));
-	mutex_unlock(&ps2dev->cmd_mutex);
+
+	ps2_end_command(ps2dev);
 }
 EXPORT_SYMBOL(ps2_drain);
 
@@ -237,9 +257,9 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 {
 	int rc;
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 	rc = __ps2_command(ps2dev, param, command);
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 
 	return rc;
 }
diff --git a/drivers/leds/leds-clevo-mail.c b/drivers/leds/leds-clevo-mail.c
index 1813c84ea5fc..f2242db54016 100644
--- a/drivers/leds/leds-clevo-mail.c
+++ b/drivers/leds/leds-clevo-mail.c
@@ -93,6 +93,8 @@ static struct dmi_system_id __initdata mail_led_whitelist[] = {
 static void clevo_mail_led_set(struct led_classdev *led_cdev,
 				enum led_brightness value)
 {
+	i8042_lock_chip();
+
 	if (value == LED_OFF)
 		i8042_command(NULL, CLEVO_MAIL_LED_OFF);
 	else if (value <= LED_HALF)
@@ -100,6 +102,8 @@ static void clevo_mail_led_set(struct led_classdev *led_cdev,
 	else
 		i8042_command(NULL, CLEVO_MAIL_LED_BLINK_1HZ);
 
+	i8042_unlock_chip();
+
 }
 
 static int clevo_mail_led_blink(struct led_classdev *led_cdev,
@@ -108,6 +112,8 @@ static int clevo_mail_led_blink(struct led_classdev *led_cdev,
 {
 	int status = -EINVAL;
 
+	i8042_lock_chip();
+
 	if (*delay_on == 0 /* ms */ && *delay_off == 0 /* ms */) {
 		/* Special case: the leds subsystem requested us to
 		 * chose one user friendly blinking of the LED, and
@@ -135,6 +141,8 @@ static int clevo_mail_led_blink(struct led_classdev *led_cdev,
 		       *delay_on, *delay_off);
 	}
 
+	i8042_unlock_chip();
+
 	return status;
 }
 
diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index fb45f5ee8df1..454970d2d701 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -746,7 +746,9 @@ static acpi_status WMID_set_u32(u32 value, u32 cap, struct wmi_interface *iface)
 			return AE_BAD_PARAMETER;
 		if (quirks->mailled == 1) {
 			param = value ? 0x92 : 0x93;
+			i8042_lock_chip();
 			i8042_command(&param, 0x1059);
+			i8042_unlock_chip();
 			return 0;
 		}
 		break;
diff --git a/include/linux/i8042.h b/include/linux/i8042.h
index 7907a72403ee..60c3360ef6ad 100644
--- a/include/linux/i8042.h
+++ b/include/linux/i8042.h
@@ -7,6 +7,7 @@
  * the Free Software Foundation.
  */
 
+#include <linux/types.h>
 
 /*
  * Standard commands.
@@ -30,6 +31,35 @@
 #define I8042_CMD_MUX_PFX	0x0090
 #define I8042_CMD_MUX_SEND	0x1090
 
+struct serio;
+
+#if defined(CONFIG_SERIO_I8042) || defined(CONFIG_SERIO_I8042_MODULE)
+
+void i8042_lock_chip(void);
+void i8042_unlock_chip(void);
 int i8042_command(unsigned char *param, int command);
+bool i8042_check_port_owner(const struct serio *);
+
+#else
+
+void i8042_lock_chip(void)
+{
+}
+
+void i8042_unlock_chip(void)
+{
+}
+
+int i8042_command(unsigned char *param, int command)
+{
+	return -ENOSYS;
+}
+
+bool i8042_check_port_owner(const struct serio *serio)
+{
+	return false;
+}
+
+#endif
 
 #endif
diff --git a/include/linux/libps2.h b/include/linux/libps2.h
index fcf5fbe6a50c..79603a6c356f 100644
--- a/include/linux/libps2.h
+++ b/include/linux/libps2.h
@@ -44,6 +44,8 @@ struct ps2dev {
 void ps2_init(struct ps2dev *ps2dev, struct serio *serio);
 int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout);
 void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout);
+void ps2_begin_command(struct ps2dev *ps2dev);
+void ps2_end_command(struct ps2dev *ps2dev);
 int __ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_handle_ack(struct ps2dev *ps2dev, unsigned char data);
-- 
cgit v1.2.3-71-gd317


From ffd0db97196c1057f09c2ab42dd5b30e94e511d9 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 16 Sep 2009 01:06:43 -0700
Subject: Input: add generic suspend and resume for input devices

Automatically turn off leds and sound effects as part of suspend
process and restore led state, sounds and repeat rate at resume.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/input.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/input.h |  2 +-
 2 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/input.c b/drivers/input/input.c
index 7c237e6ac711..b8ed4294fccd 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/types.h>
 #include <linux/input.h>
 #include <linux/module.h>
 #include <linux/random.h>
@@ -514,7 +515,7 @@ static void input_disconnect_device(struct input_dev *dev)
 	 * that there are no threads in the middle of input_open_device()
 	 */
 	mutex_lock(&dev->mutex);
-	dev->going_away = 1;
+	dev->going_away = true;
 	mutex_unlock(&dev->mutex);
 
 	spin_lock_irq(&dev->event_lock);
@@ -1259,10 +1260,71 @@ static int input_dev_uevent(struct device *device, struct kobj_uevent_env *env)
 	return 0;
 }
 
+#define INPUT_DO_TOGGLE(dev, type, bits, on)			\
+	do {							\
+		int i;						\
+		if (!test_bit(EV_##type, dev->evbit))		\
+			break;					\
+		for (i = 0; i < type##_MAX; i++) {		\
+			if (!test_bit(i, dev->bits##bit) ||	\
+			    !test_bit(i, dev->bits))		\
+				continue;			\
+			dev->event(dev, EV_##type, i, on);	\
+		}						\
+	} while (0)
+
+static void input_dev_reset(struct input_dev *dev, bool activate)
+{
+	if (!dev->event)
+		return;
+
+	INPUT_DO_TOGGLE(dev, LED, led, activate);
+	INPUT_DO_TOGGLE(dev, SND, snd, activate);
+
+	if (activate && test_bit(EV_REP, dev->evbit)) {
+		dev->event(dev, EV_REP, REP_PERIOD, dev->rep[REP_PERIOD]);
+		dev->event(dev, EV_REP, REP_DELAY, dev->rep[REP_DELAY]);
+	}
+}
+
+#ifdef CONFIG_PM
+static int input_dev_suspend(struct device *dev)
+{
+	struct input_dev *input_dev = to_input_dev(dev);
+
+	mutex_lock(&input_dev->mutex);
+	input_dev_reset(input_dev, false);
+	mutex_unlock(&input_dev->mutex);
+
+	return 0;
+}
+
+static int input_dev_resume(struct device *dev)
+{
+	struct input_dev *input_dev = to_input_dev(dev);
+
+	mutex_lock(&input_dev->mutex);
+	input_dev_reset(input_dev, true);
+	mutex_unlock(&input_dev->mutex);
+
+	return 0;
+}
+
+static const struct dev_pm_ops input_dev_pm_ops = {
+	.suspend	= input_dev_suspend,
+	.resume		= input_dev_resume,
+	.poweroff	= input_dev_suspend,
+	.restore	= input_dev_resume,
+};
+#endif /* CONFIG_PM */
+
 static struct device_type input_dev_type = {
 	.groups		= input_dev_attr_groups,
 	.release	= input_dev_release,
 	.uevent		= input_dev_uevent,
+#ifdef CONFIG_PM
+	.pm		= &input_dev_pm_ops,
+#endif
 };
 
 static char *input_nodename(struct device *dev)
diff --git a/include/linux/input.h b/include/linux/input.h
index 8b3bc3e0d146..0ccfc30cd40f 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1123,7 +1123,7 @@ struct input_dev {
 	struct mutex mutex;
 
 	unsigned int users;
-	int going_away;
+	bool going_away;
 
 	struct device dev;
 
-- 
cgit v1.2.3-71-gd317


From 38e783b38148531c0840ac130b97eb8158f84b48 Mon Sep 17 00:00:00 2001
From: Joonyoung Shim <jy0922.shim@samsung.com>
Date: Thu, 17 Sep 2009 22:35:45 -0700
Subject: Input: add touchscreen driver for MELFAS MCS-5000 controller

The MELPAS MCS-5000 is the touchscreen controller. The overview of this
controller can see at the following website:

http://www.melfas.com/product/product01.asp?k_r=eng_

This driver is tested on s3c6410 NCP board and supports only the i2c
interface.

Signed-off-by: Joonyoung Shim <jy0922.shim@samsung.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/Kconfig      |  11 ++
 drivers/input/touchscreen/Makefile     |   1 +
 drivers/input/touchscreen/mcs5000_ts.c | 318 +++++++++++++++++++++++++++++++++
 include/linux/i2c/mcs5000_ts.h         |  24 +++
 4 files changed, 354 insertions(+)
 create mode 100644 drivers/input/touchscreen/mcs5000_ts.c
 create mode 100644 include/linux/i2c/mcs5000_ts.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index 5a252d449e55..6dfb0a2b0a7d 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -169,6 +169,17 @@ config TOUCHSCREEN_WACOM_W8001
 	  To compile this driver as a module, choose M here: the
 	  module will be called wacom_w8001.
 
+config TOUCHSCREEN_MCS5000
+	tristate "MELFAS MCS-5000 touchscreen"
+	depends on I2C
+	help
+	  Say Y here if you have the MELFAS MCS-5000 touchscreen controller
+	  chip in your system.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mcs5000_ts.
 
 config TOUCHSCREEN_MTOUCH
 	tristate "MicroTouch serial touchscreens"
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index 3e1c5e0b952f..a882ca16506b 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_TOUCHSCREEN_EETI)		+= eeti_ts.o
 obj-$(CONFIG_TOUCHSCREEN_ELO)		+= elo.o
 obj-$(CONFIG_TOUCHSCREEN_FUJITSU)	+= fujitsu_ts.o
 obj-$(CONFIG_TOUCHSCREEN_INEXIO)	+= inexio.o
+obj-$(CONFIG_TOUCHSCREEN_MCS5000)	+= mcs5000_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MIGOR)		+= migor_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MTOUCH)	+= mtouch.o
 obj-$(CONFIG_TOUCHSCREEN_MK712)		+= mk712.o
diff --git a/drivers/input/touchscreen/mcs5000_ts.c b/drivers/input/touchscreen/mcs5000_ts.c
new file mode 100644
index 000000000000..4c28b89757f9
--- /dev/null
+++ b/drivers/input/touchscreen/mcs5000_ts.c
@@ -0,0 +1,318 @@
+/*
+ * mcs5000_ts.c - Touchscreen driver for MELFAS MCS-5000 controller
+ *
+ * Copyright (C) 2009 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ * Based on wm97xx-core.c
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/i2c/mcs5000_ts.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/irq.h>
+
+/* Registers */
+#define MCS5000_TS_STATUS		0x00
+#define STATUS_OFFSET			0
+#define STATUS_NO			(0 << STATUS_OFFSET)
+#define STATUS_INIT			(1 << STATUS_OFFSET)
+#define STATUS_SENSING			(2 << STATUS_OFFSET)
+#define STATUS_COORD			(3 << STATUS_OFFSET)
+#define STATUS_GESTURE			(4 << STATUS_OFFSET)
+#define ERROR_OFFSET			4
+#define ERROR_NO			(0 << ERROR_OFFSET)
+#define ERROR_POWER_ON_RESET		(1 << ERROR_OFFSET)
+#define ERROR_INT_RESET			(2 << ERROR_OFFSET)
+#define ERROR_EXT_RESET			(3 << ERROR_OFFSET)
+#define ERROR_INVALID_REG_ADDRESS	(8 << ERROR_OFFSET)
+#define ERROR_INVALID_REG_VALUE		(9 << ERROR_OFFSET)
+
+#define MCS5000_TS_OP_MODE		0x01
+#define RESET_OFFSET			0
+#define RESET_NO			(0 << RESET_OFFSET)
+#define RESET_EXT_SOFT			(1 << RESET_OFFSET)
+#define OP_MODE_OFFSET			1
+#define OP_MODE_SLEEP			(0 << OP_MODE_OFFSET)
+#define OP_MODE_ACTIVE			(1 << OP_MODE_OFFSET)
+#define GESTURE_OFFSET			4
+#define GESTURE_DISABLE			(0 << GESTURE_OFFSET)
+#define GESTURE_ENABLE			(1 << GESTURE_OFFSET)
+#define PROXIMITY_OFFSET		5
+#define PROXIMITY_DISABLE		(0 << PROXIMITY_OFFSET)
+#define PROXIMITY_ENABLE		(1 << PROXIMITY_OFFSET)
+#define SCAN_MODE_OFFSET		6
+#define SCAN_MODE_INTERRUPT		(0 << SCAN_MODE_OFFSET)
+#define SCAN_MODE_POLLING		(1 << SCAN_MODE_OFFSET)
+#define REPORT_RATE_OFFSET		7
+#define REPORT_RATE_40			(0 << REPORT_RATE_OFFSET)
+#define REPORT_RATE_80			(1 << REPORT_RATE_OFFSET)
+
+#define MCS5000_TS_SENS_CTL		0x02
+#define MCS5000_TS_FILTER_CTL		0x03
+#define PRI_FILTER_OFFSET		0
+#define SEC_FILTER_OFFSET		4
+
+#define MCS5000_TS_X_SIZE_UPPER		0x08
+#define MCS5000_TS_X_SIZE_LOWER		0x09
+#define MCS5000_TS_Y_SIZE_UPPER		0x0A
+#define MCS5000_TS_Y_SIZE_LOWER		0x0B
+
+#define MCS5000_TS_INPUT_INFO		0x10
+#define INPUT_TYPE_OFFSET		0
+#define INPUT_TYPE_NONTOUCH		(0 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_SINGLE		(1 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_DUAL			(2 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_PALM			(3 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_PROXIMITY		(7 << INPUT_TYPE_OFFSET)
+#define GESTURE_CODE_OFFSET		3
+#define GESTURE_CODE_NO			(0 << GESTURE_CODE_OFFSET)
+
+#define MCS5000_TS_X_POS_UPPER		0x11
+#define MCS5000_TS_X_POS_LOWER		0x12
+#define MCS5000_TS_Y_POS_UPPER		0x13
+#define MCS5000_TS_Y_POS_LOWER		0x14
+#define MCS5000_TS_Z_POS		0x15
+#define MCS5000_TS_WIDTH		0x16
+#define MCS5000_TS_GESTURE_VAL		0x17
+#define MCS5000_TS_MODULE_REV		0x20
+#define MCS5000_TS_FIRMWARE_VER		0x21
+
+/* Touchscreen absolute values */
+#define MCS5000_MAX_XC			0x3ff
+#define MCS5000_MAX_YC			0x3ff
+
+enum mcs5000_ts_read_offset {
+	READ_INPUT_INFO,
+	READ_X_POS_UPPER,
+	READ_X_POS_LOWER,
+	READ_Y_POS_UPPER,
+	READ_Y_POS_LOWER,
+	READ_BLOCK_SIZE,
+};
+
+/* Each client has this additional data */
+struct mcs5000_ts_data {
+	struct i2c_client *client;
+	struct input_dev *input_dev;
+	const struct mcs5000_ts_platform_data *platform_data;
+};
+
+static irqreturn_t mcs5000_ts_interrupt(int irq, void *dev_id)
+{
+	struct mcs5000_ts_data *data = dev_id;
+	struct i2c_client *client = data->client;
+	u8 buffer[READ_BLOCK_SIZE];
+	int err;
+	int x;
+	int y;
+
+	err = i2c_smbus_read_i2c_block_data(client, MCS5000_TS_INPUT_INFO,
+			READ_BLOCK_SIZE, buffer);
+	if (err < 0) {
+		dev_err(&client->dev, "%s, err[%d]\n", __func__, err);
+		goto out;
+	}
+
+	switch (buffer[READ_INPUT_INFO]) {
+	case INPUT_TYPE_NONTOUCH:
+		input_report_key(data->input_dev, BTN_TOUCH, 0);
+		input_sync(data->input_dev);
+		break;
+
+	case INPUT_TYPE_SINGLE:
+		x = (buffer[READ_X_POS_UPPER] << 8) | buffer[READ_X_POS_LOWER];
+		y = (buffer[READ_Y_POS_UPPER] << 8) | buffer[READ_Y_POS_LOWER];
+
+		input_report_key(data->input_dev, BTN_TOUCH, 1);
+		input_report_abs(data->input_dev, ABS_X, x);
+		input_report_abs(data->input_dev, ABS_Y, y);
+		input_sync(data->input_dev);
+		break;
+
+	case INPUT_TYPE_DUAL:
+		/* TODO */
+		break;
+
+	case INPUT_TYPE_PALM:
+		/* TODO */
+		break;
+
+	case INPUT_TYPE_PROXIMITY:
+		/* TODO */
+		break;
+
+	default:
+		dev_err(&client->dev, "Unknown ts input type %d\n",
+				buffer[READ_INPUT_INFO]);
+		break;
+	}
+
+ out:
+	return IRQ_HANDLED;
+}
+
+static void mcs5000_ts_phys_init(struct mcs5000_ts_data *data)
+{
+	const struct mcs5000_ts_platform_data *platform_data =
+		data->platform_data;
+	struct i2c_client *client = data->client;
+
+	/* Touch reset & sleep mode */
+	i2c_smbus_write_byte_data(client, MCS5000_TS_OP_MODE,
+			RESET_EXT_SOFT | OP_MODE_SLEEP);
+
+	/* Touch size */
+	i2c_smbus_write_byte_data(client, MCS5000_TS_X_SIZE_UPPER,
+			platform_data->x_size >> 8);
+	i2c_smbus_write_byte_data(client, MCS5000_TS_X_SIZE_LOWER,
+			platform_data->x_size & 0xff);
+	i2c_smbus_write_byte_data(client, MCS5000_TS_Y_SIZE_UPPER,
+			platform_data->y_size >> 8);
+	i2c_smbus_write_byte_data(client, MCS5000_TS_Y_SIZE_LOWER,
+			platform_data->y_size & 0xff);
+
+	/* Touch active mode & 80 report rate */
+	i2c_smbus_write_byte_data(data->client, MCS5000_TS_OP_MODE,
+			OP_MODE_ACTIVE | REPORT_RATE_80);
+}
+
+static int __devinit mcs5000_ts_probe(struct i2c_client *client,
+		const struct i2c_device_id *id)
+{
+	struct mcs5000_ts_data *data;
+	struct input_dev *input_dev;
+	int ret;
+
+	if (!client->dev.platform_data)
+		return -EINVAL;
+
+	data = kzalloc(sizeof(struct mcs5000_ts_data), GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!data || !input_dev) {
+		dev_err(&client->dev, "Failed to allocate memory\n");
+		ret = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	data->client = client;
+	data->input_dev = input_dev;
+	data->platform_data = client->dev.platform_data;
+
+	input_dev->name = "MELPAS MCS-5000 Touchscreen";
+	input_dev->id.bustype = BUS_I2C;
+	input_dev->dev.parent = &client->dev;
+
+	__set_bit(EV_ABS, input_dev->evbit);
+	__set_bit(EV_KEY, input_dev->evbit);
+	__set_bit(BTN_TOUCH, input_dev->keybit);
+	input_set_abs_params(input_dev, ABS_X, 0, MCS5000_MAX_XC, 0, 0);
+	input_set_abs_params(input_dev, ABS_Y, 0, MCS5000_MAX_YC, 0, 0);
+
+	input_set_drvdata(input_dev, data);
+
+	if (data->platform_data->cfg_pin)
+		data->platform_data->cfg_pin();
+
+	ret = request_threaded_irq(client->irq, NULL, mcs5000_ts_interrupt,
+			IRQF_TRIGGER_LOW | IRQF_ONESHOT, "mcs5000_ts", data);
+
+	if (ret < 0) {
+		dev_err(&client->dev, "Failed to register interrupt\n");
+		goto err_free_mem;
+	}
+
+	ret = input_register_device(data->input_dev);
+	if (ret < 0)
+		goto err_free_irq;
+
+	mcs5000_ts_phys_init(data);
+	i2c_set_clientdata(client, data);
+
+	return 0;
+
+err_free_irq:
+	free_irq(client->irq, data);
+err_free_mem:
+	input_free_device(input_dev);
+	kfree(data);
+	return ret;
+}
+
+static int __devexit mcs5000_ts_remove(struct i2c_client *client)
+{
+	struct mcs5000_ts_data *data = i2c_get_clientdata(client);
+
+	free_irq(client->irq, data);
+	input_unregister_device(data->input_dev);
+	kfree(data);
+	i2c_set_clientdata(client, NULL);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int mcs5000_ts_suspend(struct i2c_client *client, pm_message_t mesg)
+{
+	/* Touch sleep mode */
+	i2c_smbus_write_byte_data(client, MCS5000_TS_OP_MODE, OP_MODE_SLEEP);
+
+	return 0;
+}
+
+static int mcs5000_ts_resume(struct i2c_client *client)
+{
+	struct mcs5000_ts_data *data = i2c_get_clientdata(client);
+
+	mcs5000_ts_phys_init(data);
+
+	return 0;
+}
+#else
+#define mcs5000_ts_suspend	NULL
+#define mcs5000_ts_resume	NULL
+#endif
+
+static const struct i2c_device_id mcs5000_ts_id[] = {
+	{ "mcs5000_ts", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, mcs5000_ts_id);
+
+static struct i2c_driver mcs5000_ts_driver = {
+	.probe		= mcs5000_ts_probe,
+	.remove		= __devexit_p(mcs5000_ts_remove),
+	.suspend	= mcs5000_ts_suspend,
+	.resume		= mcs5000_ts_resume,
+	.driver = {
+		.name = "mcs5000_ts",
+	},
+	.id_table	= mcs5000_ts_id,
+};
+
+static int __init mcs5000_ts_init(void)
+{
+	return i2c_add_driver(&mcs5000_ts_driver);
+}
+
+static void __exit mcs5000_ts_exit(void)
+{
+	i2c_del_driver(&mcs5000_ts_driver);
+}
+
+module_init(mcs5000_ts_init);
+module_exit(mcs5000_ts_exit);
+
+/* Module information */
+MODULE_AUTHOR("Joonyoung Shim <jy0922.shim@samsung.com>");
+MODULE_DESCRIPTION("Touchscreen driver for MELFAS MCS-5000 controller");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/i2c/mcs5000_ts.h b/include/linux/i2c/mcs5000_ts.h
new file mode 100644
index 000000000000..5a117b5ca15e
--- /dev/null
+++ b/include/linux/i2c/mcs5000_ts.h
@@ -0,0 +1,24 @@
+/*
+ * mcs5000_ts.h
+ *
+ * Copyright (C) 2009 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __LINUX_MCS5000_TS_H
+#define __LINUX_MCS5000_TS_H
+
+/* platform data for the MELFAS MCS-5000 touchscreen driver */
+struct mcs5000_ts_platform_data {
+	void (*cfg_pin)(void);
+	int x_size;
+	int y_size;
+};
+
+#endif	/* __LINUX_MCS5000_TS_H */
-- 
cgit v1.2.3-71-gd317


From 88751dd6ce1fb0627c36c4ab08a40730e5a50d3e Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Thu, 17 Sep 2009 22:39:38 -0700
Subject: Input: add driver for ADP5588 QWERTY I2C Keypad

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/Kconfig        |  10 +
 drivers/input/keyboard/Makefile       |   1 +
 drivers/input/keyboard/adp5588-keys.c | 361 ++++++++++++++++++++++++++++++++++
 include/linux/i2c/adp5588.h           |  92 +++++++++
 4 files changed, 464 insertions(+)
 create mode 100644 drivers/input/keyboard/adp5588-keys.c
 create mode 100644 include/linux/i2c/adp5588.h

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index b14bd3a07140..d615c09a83c6 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -24,6 +24,16 @@ config KEYBOARD_AAED2000
 	  To compile this driver as a module, choose M here: the
 	  module will be called aaed2000_kbd.
 
+config KEYBOARD_ADP5588
+	tristate "ADP5588 I2C QWERTY Keypad and IO Expander"
+	depends on I2C
+	help
+	  Say Y here if you want to use a ADP5588 attached to your
+	  system I2C bus.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called adp5588-keys.
+
 config KEYBOARD_AMIGA
 	tristate "Amiga keyboard"
 	depends on AMIGA
diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
index ab35ac36111e..a5c08cdf8083 100644
--- a/drivers/input/keyboard/Makefile
+++ b/drivers/input/keyboard/Makefile
@@ -5,6 +5,7 @@
 # Each configuration option enables a list of files.
 
 obj-$(CONFIG_KEYBOARD_AAED2000)		+= aaed2000_kbd.o
+obj-$(CONFIG_KEYBOARD_ADP5588)		+= adp5588-keys.o
 obj-$(CONFIG_KEYBOARD_AMIGA)		+= amikbd.o
 obj-$(CONFIG_KEYBOARD_ATARI)		+= atakbd.o
 obj-$(CONFIG_KEYBOARD_ATKBD)		+= atkbd.o
diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c
new file mode 100644
index 000000000000..d48c808d5928
--- /dev/null
+++ b/drivers/input/keyboard/adp5588-keys.c
@@ -0,0 +1,361 @@
+/*
+ * File: drivers/input/keyboard/adp5588_keys.c
+ * Description:  keypad driver for ADP5588 I2C QWERTY Keypad and IO Expander
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Copyright (C) 2008-2009 Analog Devices Inc.
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/workqueue.h>
+#include <linux/errno.h>
+#include <linux/pm.h>
+#include <linux/platform_device.h>
+#include <linux/input.h>
+#include <linux/i2c.h>
+
+#include <linux/i2c/adp5588.h>
+
+ /* Configuration Register1 */
+#define AUTO_INC	(1 << 7)
+#define GPIEM_CFG	(1 << 6)
+#define OVR_FLOW_M	(1 << 5)
+#define INT_CFG		(1 << 4)
+#define OVR_FLOW_IEN	(1 << 3)
+#define K_LCK_IM	(1 << 2)
+#define GPI_IEN		(1 << 1)
+#define KE_IEN		(1 << 0)
+
+/* Interrupt Status Register */
+#define CMP2_INT	(1 << 5)
+#define CMP1_INT	(1 << 4)
+#define OVR_FLOW_INT	(1 << 3)
+#define K_LCK_INT	(1 << 2)
+#define GPI_INT		(1 << 1)
+#define KE_INT		(1 << 0)
+
+/* Key Lock and Event Counter Register */
+#define K_LCK_EN	(1 << 6)
+#define LCK21		0x30
+#define KEC		0xF
+
+/* Key Event Register xy */
+#define KEY_EV_PRESSED		(1 << 7)
+#define KEY_EV_MASK		(0x7F)
+
+#define KP_SEL(x)		(0xFFFF >> (16 - x))	/* 2^x-1 */
+
+#define KEYP_MAX_EVENT		10
+
+/*
+ * Early pre 4.0 Silicon required to delay readout by at least 25ms,
+ * since the Event Counter Register updated 25ms after the interrupt
+ * asserted.
+ */
+#define WA_DELAYED_READOUT_REVID(rev)		((rev) < 4)
+
+struct adp5588_kpad {
+	struct i2c_client *client;
+	struct input_dev *input;
+	struct delayed_work work;
+	unsigned long delay;
+	unsigned short keycode[ADP5588_KEYMAPSIZE];
+};
+
+static int adp5588_read(struct i2c_client *client, u8 reg)
+{
+	int ret = i2c_smbus_read_byte_data(client, reg);
+
+	if (ret < 0)
+		dev_err(&client->dev, "Read Error\n");
+
+	return ret;
+}
+
+static int adp5588_write(struct i2c_client *client, u8 reg, u8 val)
+{
+	return i2c_smbus_write_byte_data(client, reg, val);
+}
+
+static void adp5588_work(struct work_struct *work)
+{
+	struct adp5588_kpad *kpad = container_of(work,
+						struct adp5588_kpad, work.work);
+	struct i2c_client *client = kpad->client;
+	int i, key, status, ev_cnt;
+
+	status = adp5588_read(client, INT_STAT);
+
+	if (status & OVR_FLOW_INT)	/* Unlikely and should never happen */
+		dev_err(&client->dev, "Event Overflow Error\n");
+
+	if (status & KE_INT) {
+		ev_cnt = adp5588_read(client, KEY_LCK_EC_STAT) & KEC;
+		if (ev_cnt) {
+			for (i = 0; i < ev_cnt; i++) {
+				key = adp5588_read(client, Key_EVENTA + i);
+				input_report_key(kpad->input,
+					kpad->keycode[(key & KEY_EV_MASK) - 1],
+					key & KEY_EV_PRESSED);
+			}
+			input_sync(kpad->input);
+		}
+	}
+	adp5588_write(client, INT_STAT, status); /* Status is W1C */
+}
+
+static irqreturn_t adp5588_irq(int irq, void *handle)
+{
+	struct adp5588_kpad *kpad = handle;
+
+	/*
+	 * use keventd context to read the event fifo registers
+	 * Schedule readout at least 25ms after notification for
+	 * REVID < 4
+	 */
+
+	schedule_delayed_work(&kpad->work, kpad->delay);
+
+	return IRQ_HANDLED;
+}
+
+static int __devinit adp5588_setup(struct i2c_client *client)
+{
+	struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
+	int i, ret;
+
+	ret = adp5588_write(client, KP_GPIO1, KP_SEL(pdata->rows));
+	ret |= adp5588_write(client, KP_GPIO2, KP_SEL(pdata->cols) & 0xFF);
+	ret |= adp5588_write(client, KP_GPIO3, KP_SEL(pdata->cols) >> 8);
+
+	if (pdata->en_keylock) {
+		ret |= adp5588_write(client, UNLOCK1, pdata->unlock_key1);
+		ret |= adp5588_write(client, UNLOCK2, pdata->unlock_key2);
+		ret |= adp5588_write(client, KEY_LCK_EC_STAT, K_LCK_EN);
+	}
+
+	for (i = 0; i < KEYP_MAX_EVENT; i++)
+		ret |= adp5588_read(client, Key_EVENTA);
+
+	ret |= adp5588_write(client, INT_STAT, CMP2_INT | CMP1_INT |
+					OVR_FLOW_INT | K_LCK_INT |
+					GPI_INT | KE_INT); /* Status is W1C */
+
+	ret |= adp5588_write(client, CFG, INT_CFG | OVR_FLOW_IEN | KE_IEN);
+
+	if (ret < 0) {
+		dev_err(&client->dev, "Write Error\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int __devinit adp5588_probe(struct i2c_client *client,
+					const struct i2c_device_id *id)
+{
+	struct adp5588_kpad *kpad;
+	struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
+	struct input_dev *input;
+	unsigned int revid;
+	int ret, i;
+	int error;
+
+	if (!i2c_check_functionality(client->adapter,
+					I2C_FUNC_SMBUS_BYTE_DATA)) {
+		dev_err(&client->dev, "SMBUS Byte Data not Supported\n");
+		return -EIO;
+	}
+
+	if (!pdata) {
+		dev_err(&client->dev, "no platform data?\n");
+		return -EINVAL;
+	}
+
+	if (!pdata->rows || !pdata->cols || !pdata->keymap) {
+		dev_err(&client->dev, "no rows, cols or keymap from pdata\n");
+		return -EINVAL;
+	}
+
+	if (pdata->keymapsize != ADP5588_KEYMAPSIZE) {
+		dev_err(&client->dev, "invalid keymapsize\n");
+		return -EINVAL;
+	}
+
+	if (!client->irq) {
+		dev_err(&client->dev, "no IRQ?\n");
+		return -EINVAL;
+	}
+
+	kpad = kzalloc(sizeof(*kpad), GFP_KERNEL);
+	input = input_allocate_device();
+	if (!kpad || !input) {
+		error = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	kpad->client = client;
+	kpad->input = input;
+	INIT_DELAYED_WORK(&kpad->work, adp5588_work);
+
+	ret = adp5588_read(client, DEV_ID);
+	if (ret < 0) {
+		error = ret;
+		goto err_free_mem;
+	}
+
+	revid = (u8) ret & ADP5588_DEVICE_ID_MASK;
+	if (WA_DELAYED_READOUT_REVID(revid))
+		kpad->delay = msecs_to_jiffies(30);
+
+	input->name = client->name;
+	input->phys = "adp5588-keys/input0";
+	input->dev.parent = &client->dev;
+
+	input_set_drvdata(input, kpad);
+
+	input->id.bustype = BUS_I2C;
+	input->id.vendor = 0x0001;
+	input->id.product = 0x0001;
+	input->id.version = revid;
+
+	input->keycodesize = sizeof(kpad->keycode[0]);
+	input->keycodemax = pdata->keymapsize;
+	input->keycode = kpad->keycode;
+
+	memcpy(kpad->keycode, pdata->keymap,
+		pdata->keymapsize * input->keycodesize);
+
+	/* setup input device */
+	__set_bit(EV_KEY, input->evbit);
+
+	if (pdata->repeat)
+		__set_bit(EV_REP, input->evbit);
+
+	for (i = 0; i < input->keycodemax; i++)
+		__set_bit(kpad->keycode[i] & KEY_MAX, input->keybit);
+	__clear_bit(KEY_RESERVED, input->keybit);
+
+	error = input_register_device(input);
+	if (error) {
+		dev_err(&client->dev, "unable to register input device\n");
+		goto err_free_mem;
+	}
+
+	error = request_irq(client->irq, adp5588_irq,
+			    IRQF_TRIGGER_FALLING | IRQF_DISABLED,
+			    client->dev.driver->name, kpad);
+	if (error) {
+		dev_err(&client->dev, "irq %d busy?\n", client->irq);
+		goto err_unreg_dev;
+	}
+
+	error = adp5588_setup(client);
+	if (error)
+		goto err_free_irq;
+
+	device_init_wakeup(&client->dev, 1);
+	i2c_set_clientdata(client, kpad);
+
+	dev_info(&client->dev, "Rev.%d keypad, irq %d\n", revid, client->irq);
+	return 0;
+
+ err_free_irq:
+	free_irq(client->irq, kpad);
+ err_unreg_dev:
+	input_unregister_device(input);
+	input = NULL;
+ err_free_mem:
+	input_free_device(input);
+	kfree(kpad);
+
+	return error;
+}
+
+static int __devexit adp5588_remove(struct i2c_client *client)
+{
+	struct adp5588_kpad *kpad = i2c_get_clientdata(client);
+
+	adp5588_write(client, CFG, 0);
+	free_irq(client->irq, kpad);
+	cancel_delayed_work_sync(&kpad->work);
+	input_unregister_device(kpad->input);
+	i2c_set_clientdata(client, NULL);
+	kfree(kpad);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int adp5588_suspend(struct device *dev)
+{
+	struct adp5588_kpad *kpad = dev_get_drvdata(dev);
+	struct i2c_client *client = kpad->client;
+
+	disable_irq(client->irq);
+	cancel_delayed_work_sync(&kpad->work);
+
+	if (device_may_wakeup(&client->dev))
+		enable_irq_wake(client->irq);
+
+	return 0;
+}
+
+static int adp5588_resume(struct device *dev)
+{
+	struct adp5588_kpad *kpad = dev_get_drvdata(dev);
+	struct i2c_client *client = kpad->client;
+
+	if (device_may_wakeup(&client->dev))
+		disable_irq_wake(client->irq);
+
+	enable_irq(client->irq);
+
+	return 0;
+}
+
+static struct dev_pm_ops adp5588_dev_pm_ops = {
+	.suspend = adp5588_suspend,
+	.resume  = adp5588_resume,
+};
+#endif
+
+static const struct i2c_device_id adp5588_id[] = {
+	{ KBUILD_MODNAME, 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, adp5588_id);
+
+static struct i2c_driver adp5588_driver = {
+	.driver = {
+		.name = KBUILD_MODNAME,
+#ifdef CONFIG_PM
+		.pm   = &adp5588_dev_pm_ops,
+#endif
+	},
+	.probe    = adp5588_probe,
+	.remove   = __devexit_p(adp5588_remove),
+	.id_table = adp5588_id,
+};
+
+static int __init adp5588_init(void)
+{
+	return i2c_add_driver(&adp5588_driver);
+}
+module_init(adp5588_init);
+
+static void __exit adp5588_exit(void)
+{
+	i2c_del_driver(&adp5588_driver);
+}
+module_exit(adp5588_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_DESCRIPTION("ADP5588 Keypad driver");
+MODULE_ALIAS("platform:adp5588-keys");
diff --git a/include/linux/i2c/adp5588.h b/include/linux/i2c/adp5588.h
new file mode 100644
index 000000000000..fc5db826b48e
--- /dev/null
+++ b/include/linux/i2c/adp5588.h
@@ -0,0 +1,92 @@
+/*
+ * Analog Devices ADP5588 I/O Expander and QWERTY Keypad Controller
+ *
+ * Copyright 2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#ifndef _ADP5588_H
+#define _ADP5588_H
+
+#define DEV_ID 0x00		/* Device ID */
+#define CFG 0x01		/* Configuration Register1 */
+#define INT_STAT 0x02		/* Interrupt Status Register */
+#define KEY_LCK_EC_STAT 0x03	/* Key Lock and Event Counter Register */
+#define Key_EVENTA 0x04		/* Key Event Register A */
+#define Key_EVENTB 0x05		/* Key Event Register B */
+#define Key_EVENTC 0x06		/* Key Event Register C */
+#define Key_EVENTD 0x07		/* Key Event Register D */
+#define Key_EVENTE 0x08		/* Key Event Register E */
+#define Key_EVENTF 0x09		/* Key Event Register F */
+#define Key_EVENTG 0x0A		/* Key Event Register G */
+#define Key_EVENTH 0x0B		/* Key Event Register H */
+#define Key_EVENTI 0x0C		/* Key Event Register I */
+#define Key_EVENTJ 0x0D		/* Key Event Register J */
+#define KP_LCK_TMR 0x0E		/* Keypad Lock1 to Lock2 Timer */
+#define UNLOCK1 0x0F		/* Unlock Key1 */
+#define UNLOCK2 0x10		/* Unlock Key2 */
+#define GPIO_INT_STAT1 0x11	/* GPIO Interrupt Status */
+#define GPIO_INT_STAT2 0x12	/* GPIO Interrupt Status */
+#define GPIO_INT_STAT3 0x13	/* GPIO Interrupt Status */
+#define GPIO_DAT_STAT1 0x14	/* GPIO Data Status, Read twice to clear */
+#define GPIO_DAT_STAT2 0x15	/* GPIO Data Status, Read twice to clear */
+#define GPIO_DAT_STAT3 0x16	/* GPIO Data Status, Read twice to clear */
+#define GPIO_DAT_OUT1 0x17	/* GPIO DATA OUT */
+#define GPIO_DAT_OUT2 0x18	/* GPIO DATA OUT */
+#define GPIO_DAT_OUT3 0x19	/* GPIO DATA OUT */
+#define GPIO_INT_EN1 0x1A	/* GPIO Interrupt Enable */
+#define GPIO_INT_EN2 0x1B	/* GPIO Interrupt Enable */
+#define GPIO_INT_EN3 0x1C	/* GPIO Interrupt Enable */
+#define KP_GPIO1 0x1D		/* Keypad or GPIO Selection */
+#define KP_GPIO2 0x1E		/* Keypad or GPIO Selection */
+#define KP_GPIO3 0x1F		/* Keypad or GPIO Selection */
+#define GPI_EM1 0x20		/* GPI Event Mode 1 */
+#define GPI_EM2 0x21		/* GPI Event Mode 2 */
+#define GPI_EM3 0x22		/* GPI Event Mode 3 */
+#define GPIO_DIR1 0x23		/* GPIO Data Direction */
+#define GPIO_DIR2 0x24		/* GPIO Data Direction */
+#define GPIO_DIR3 0x25		/* GPIO Data Direction */
+#define GPIO_INT_LVL1 0x26	/* GPIO Edge/Level Detect */
+#define GPIO_INT_LVL2 0x27	/* GPIO Edge/Level Detect */
+#define GPIO_INT_LVL3 0x28	/* GPIO Edge/Level Detect */
+#define Debounce_DIS1 0x29	/* Debounce Disable */
+#define Debounce_DIS2 0x2A	/* Debounce Disable */
+#define Debounce_DIS3 0x2B	/* Debounce Disable */
+#define GPIO_PULL1 0x2C		/* GPIO Pull Disable */
+#define GPIO_PULL2 0x2D		/* GPIO Pull Disable */
+#define GPIO_PULL3 0x2E		/* GPIO Pull Disable */
+#define CMP_CFG_STAT 0x30	/* Comparator Configuration and Status Register */
+#define CMP_CONFG_SENS1 0x31	/* Sensor1 Comparator Configuration Register */
+#define CMP_CONFG_SENS2 0x32	/* L2 Light Sensor Reference Level, Output Falling for Sensor 1 */
+#define CMP1_LVL2_TRIP 0x33	/* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 1 */
+#define CMP1_LVL2_HYS 0x34	/* L3 Light Sensor Reference Level, Output Falling For Sensor 1 */
+#define CMP1_LVL3_TRIP 0x35	/* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 1 */
+#define CMP1_LVL3_HYS 0x36	/* Sensor 2 Comparator Configuration Register */
+#define CMP2_LVL2_TRIP 0x37	/* L2 Light Sensor Reference Level, Output Falling for Sensor 2 */
+#define CMP2_LVL2_HYS 0x38	/* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 2 */
+#define CMP2_LVL3_TRIP 0x39	/* L3 Light Sensor Reference Level, Output Falling For Sensor 2 */
+#define CMP2_LVL3_HYS 0x3A	/* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 2 */
+#define CMP1_ADC_DAT_R1 0x3B	/* Comparator 1 ADC data Register1 */
+#define CMP1_ADC_DAT_R2 0x3C	/* Comparator 1 ADC data Register2 */
+#define CMP2_ADC_DAT_R1 0x3D	/* Comparator 2 ADC data Register1 */
+#define CMP2_ADC_DAT_R2 0x3E	/* Comparator 2 ADC data Register2 */
+
+#define ADP5588_DEVICE_ID_MASK	0xF
+
+/* Put one of these structures in i2c_board_info platform_data */
+
+#define ADP5588_KEYMAPSIZE	80
+
+struct adp5588_kpad_platform_data {
+	int rows;			/* Number of rows */
+	int cols;			/* Number of columns */
+	const unsigned short *keymap;	/* Pointer to keymap */
+	unsigned short keymapsize;	/* Keymap size */
+	unsigned repeat:1;		/* Enable key repeat */
+	unsigned en_keylock:1;		/* Enable Key Lock feature */
+	unsigned short unlock_key1;	/* Unlock Key 1 */
+	unsigned short unlock_key2;	/* Unlock Key 2 */
+};
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 502a0106b2cc31940f690dc6693fddfd3b97cab5 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 27 Jul 2009 14:46:12 +0100
Subject: [WATCHDOG] Add support for WM831x watchdog

The WM831x series of devices provide a watchdog with configurable
behaviour on timer expiry.

Currently this driver support refreshes via a register or GPIO line and
autonomous refreshes from a hardware source (eg, a clock).

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 drivers/watchdog/Kconfig            |   7 +
 drivers/watchdog/Makefile           |   1 +
 drivers/watchdog/wm831x_wdt.c       | 441 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm831x/watchdog.h |  52 +++++
 4 files changed, 501 insertions(+)
 create mode 100644 drivers/watchdog/wm831x_wdt.c
 create mode 100644 include/linux/mfd/wm831x/watchdog.h

(limited to 'include/linux')

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 3229889c3591..ff3eb8ff6bd7 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -55,6 +55,13 @@ config SOFT_WATCHDOG
 	  To compile this driver as a module, choose M here: the
 	  module will be called softdog.
 
+config WM831X_WATCHDOG
+	tristate "WM831x watchdog"
+	depends on MFD_WM831X
+	help
+	  Support for the watchdog in the WM831x AudioPlus PMICs.  When
+	  the watchdog triggers the system will be reset.
+
 config WM8350_WATCHDOG
 	tristate "WM8350 watchdog"
 	depends on MFD_WM8350
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index e1784a6208a7..348b3b862c99 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -141,5 +141,6 @@ obj-$(CONFIG_WATCHDOG_CP1XXX)		+= cpwd.o
 # XTENSA Architecture
 
 # Architecture Independant
+obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o
 obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o
 obj-$(CONFIG_SOFT_WATCHDOG) += softdog.o
diff --git a/drivers/watchdog/wm831x_wdt.c b/drivers/watchdog/wm831x_wdt.c
new file mode 100644
index 000000000000..775bcd807f31
--- /dev/null
+++ b/drivers/watchdog/wm831x_wdt.c
@@ -0,0 +1,441 @@
+/*
+ * Watchdog driver for the wm831x PMICs
+ *
+ * Copyright (C) 2009 Wolfson Microelectronics
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/platform_device.h>
+#include <linux/watchdog.h>
+#include <linux/uaccess.h>
+#include <linux/gpio.h>
+
+#include <linux/mfd/wm831x/core.h>
+#include <linux/mfd/wm831x/pdata.h>
+#include <linux/mfd/wm831x/watchdog.h>
+
+static int nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, int, 0);
+MODULE_PARM_DESC(nowayout,
+		 "Watchdog cannot be stopped once started (default="
+		 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+static unsigned long wm831x_wdt_users;
+static struct miscdevice wm831x_wdt_miscdev;
+static int wm831x_wdt_expect_close;
+static DEFINE_MUTEX(wdt_mutex);
+static struct wm831x *wm831x;
+static unsigned int update_gpio;
+static unsigned int update_state;
+
+/* We can't use the sub-second values here but they're included
+ * for completeness.  */
+static struct {
+	int time;  /* Seconds */
+	u16 val;   /* WDOG_TO value */
+} wm831x_wdt_cfgs[] = {
+	{  1, 2 },
+	{  2, 3 },
+	{  4, 4 },
+	{  8, 5 },
+	{ 16, 6 },
+	{ 32, 7 },
+	{ 33, 7 },  /* Actually 32.768s so include both, others round down */
+};
+
+static int wm831x_wdt_set_timeout(struct wm831x *wm831x, u16 value)
+{
+	int ret;
+
+	mutex_lock(&wdt_mutex);
+
+	ret = wm831x_reg_unlock(wm831x);
+	if (ret == 0) {
+		ret = wm831x_set_bits(wm831x, WM831X_WATCHDOG,
+				      WM831X_WDOG_TO_MASK, value);
+		wm831x_reg_lock(wm831x);
+	} else {
+		dev_err(wm831x->dev, "Failed to unlock security key: %d\n",
+			ret);
+	}
+
+	mutex_unlock(&wdt_mutex);
+
+	return ret;
+}
+
+static int wm831x_wdt_start(struct wm831x *wm831x)
+{
+	int ret;
+
+	mutex_lock(&wdt_mutex);
+
+	ret = wm831x_reg_unlock(wm831x);
+	if (ret == 0) {
+		ret = wm831x_set_bits(wm831x, WM831X_WATCHDOG,
+				      WM831X_WDOG_ENA, WM831X_WDOG_ENA);
+		wm831x_reg_lock(wm831x);
+	} else {
+		dev_err(wm831x->dev, "Failed to unlock security key: %d\n",
+			ret);
+	}
+
+	mutex_unlock(&wdt_mutex);
+
+	return ret;
+}
+
+static int wm831x_wdt_stop(struct wm831x *wm831x)
+{
+	int ret;
+
+	mutex_lock(&wdt_mutex);
+
+	ret = wm831x_reg_unlock(wm831x);
+	if (ret == 0) {
+		ret = wm831x_set_bits(wm831x, WM831X_WATCHDOG,
+				      WM831X_WDOG_ENA, 0);
+		wm831x_reg_lock(wm831x);
+	} else {
+		dev_err(wm831x->dev, "Failed to unlock security key: %d\n",
+			ret);
+	}
+
+	mutex_unlock(&wdt_mutex);
+
+	return ret;
+}
+
+static int wm831x_wdt_kick(struct wm831x *wm831x)
+{
+	int ret;
+	u16 reg;
+
+	mutex_lock(&wdt_mutex);
+
+	if (update_gpio) {
+		gpio_set_value_cansleep(update_gpio, update_state);
+		update_state = !update_state;
+		ret = 0;
+		goto out;
+	}
+
+
+	reg = wm831x_reg_read(wm831x, WM831X_WATCHDOG);
+
+	if (!(reg & WM831X_WDOG_RST_SRC)) {
+		dev_err(wm831x->dev, "Hardware watchdog update unsupported\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	reg |= WM831X_WDOG_RESET;
+
+	ret = wm831x_reg_unlock(wm831x);
+	if (ret == 0) {
+		ret = wm831x_reg_write(wm831x, WM831X_WATCHDOG, reg);
+		wm831x_reg_lock(wm831x);
+	} else {
+		dev_err(wm831x->dev, "Failed to unlock security key: %d\n",
+			ret);
+	}
+
+out:
+	mutex_unlock(&wdt_mutex);
+
+	return ret;
+}
+
+static int wm831x_wdt_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if (!wm831x)
+		return -ENODEV;
+
+	if (test_and_set_bit(0, &wm831x_wdt_users))
+		return -EBUSY;
+
+	ret = wm831x_wdt_start(wm831x);
+	if (ret != 0)
+		return ret;
+
+	return nonseekable_open(inode, file);
+}
+
+static int wm831x_wdt_release(struct inode *inode, struct file *file)
+{
+	if (wm831x_wdt_expect_close)
+		wm831x_wdt_stop(wm831x);
+	else {
+		dev_warn(wm831x->dev, "Watchdog device closed uncleanly\n");
+		wm831x_wdt_kick(wm831x);
+	}
+
+	clear_bit(0, &wm831x_wdt_users);
+
+	return 0;
+}
+
+static ssize_t wm831x_wdt_write(struct file *file,
+				const char __user *data, size_t count,
+				loff_t *ppos)
+{
+	size_t i;
+
+	if (count) {
+		wm831x_wdt_kick(wm831x);
+
+		if (!nowayout) {
+			/* In case it was set long ago */
+			wm831x_wdt_expect_close = 0;
+
+			/* scan to see whether or not we got the magic
+			   character */
+			for (i = 0; i != count; i++) {
+				char c;
+				if (get_user(c, data + i))
+					return -EFAULT;
+				if (c == 'V')
+					wm831x_wdt_expect_close = 42;
+			}
+		}
+	}
+	return count;
+}
+
+static struct watchdog_info ident = {
+	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE,
+	.identity = "WM831x Watchdog",
+};
+
+static long wm831x_wdt_ioctl(struct file *file, unsigned int cmd,
+			     unsigned long arg)
+{
+	int ret = -ENOTTY, time, i;
+	void __user *argp = (void __user *)arg;
+	int __user *p = argp;
+	u16 reg;
+
+	switch (cmd) {
+	case WDIOC_GETSUPPORT:
+		ret = copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0;
+		break;
+
+	case WDIOC_GETSTATUS:
+	case WDIOC_GETBOOTSTATUS:
+		ret = put_user(0, p);
+		break;
+
+	case WDIOC_SETOPTIONS:
+	{
+		int options;
+
+		if (get_user(options, p))
+			return -EFAULT;
+
+		ret = -EINVAL;
+
+		/* Setting both simultaneously means at least one must fail */
+		if (options == WDIOS_DISABLECARD)
+			ret = wm831x_wdt_start(wm831x);
+
+		if (options == WDIOS_ENABLECARD)
+			ret = wm831x_wdt_stop(wm831x);
+		break;
+	}
+
+	case WDIOC_KEEPALIVE:
+		ret = wm831x_wdt_kick(wm831x);
+		break;
+
+	case WDIOC_SETTIMEOUT:
+		ret = get_user(time, p);
+		if (ret)
+			break;
+
+		if (time == 0) {
+			if (nowayout)
+				ret = -EINVAL;
+			else
+				wm831x_wdt_stop(wm831x);
+			break;
+		}
+
+		for (i = 0; i < ARRAY_SIZE(wm831x_wdt_cfgs); i++)
+			if (wm831x_wdt_cfgs[i].time == time)
+				break;
+		if (i == ARRAY_SIZE(wm831x_wdt_cfgs))
+			ret = -EINVAL;
+		else
+			ret = wm831x_wdt_set_timeout(wm831x,
+						     wm831x_wdt_cfgs[i].val);
+		break;
+
+	case WDIOC_GETTIMEOUT:
+		reg = wm831x_reg_read(wm831x, WM831X_WATCHDOG);
+		reg &= WM831X_WDOG_TO_MASK;
+		for (i = 0; i < ARRAY_SIZE(wm831x_wdt_cfgs); i++)
+			if (wm831x_wdt_cfgs[i].val == reg)
+				break;
+		if (i == ARRAY_SIZE(wm831x_wdt_cfgs)) {
+			dev_warn(wm831x->dev,
+				 "Unknown watchdog configuration: %x\n", reg);
+			ret = -EINVAL;
+		} else
+			ret = put_user(wm831x_wdt_cfgs[i].time, p);
+
+	}
+
+	return ret;
+}
+
+static const struct file_operations wm831x_wdt_fops = {
+	.owner = THIS_MODULE,
+	.llseek = no_llseek,
+	.write = wm831x_wdt_write,
+	.unlocked_ioctl = wm831x_wdt_ioctl,
+	.open = wm831x_wdt_open,
+	.release = wm831x_wdt_release,
+};
+
+static struct miscdevice wm831x_wdt_miscdev = {
+	.minor = WATCHDOG_MINOR,
+	.name = "watchdog",
+	.fops = &wm831x_wdt_fops,
+};
+
+static int __devinit wm831x_wdt_probe(struct platform_device *pdev)
+{
+	struct wm831x_pdata *chip_pdata;
+	struct wm831x_watchdog_pdata *pdata;
+	int reg, ret;
+
+	wm831x = dev_get_drvdata(pdev->dev.parent);
+
+	ret = wm831x_reg_read(wm831x, WM831X_WATCHDOG);
+	if (ret < 0) {
+		dev_err(wm831x->dev, "Failed to read watchdog status: %d\n",
+			ret);
+		goto err;
+	}
+	reg = ret;
+
+	if (reg & WM831X_WDOG_DEBUG)
+		dev_warn(wm831x->dev, "Watchdog is paused\n");
+
+	/* Apply any configuration */
+	if (pdev->dev.parent->platform_data) {
+		chip_pdata = pdev->dev.parent->platform_data;
+		pdata = chip_pdata->watchdog;
+	} else {
+		pdata = NULL;
+	}
+
+	if (pdata) {
+		reg &= ~(WM831X_WDOG_SECACT_MASK | WM831X_WDOG_PRIMACT_MASK |
+			 WM831X_WDOG_RST_SRC);
+
+		reg |= pdata->primary << WM831X_WDOG_PRIMACT_SHIFT;
+		reg |= pdata->secondary << WM831X_WDOG_SECACT_SHIFT;
+		reg |= pdata->software << WM831X_WDOG_RST_SRC_SHIFT;
+
+		if (pdata->update_gpio) {
+			ret = gpio_request(pdata->update_gpio,
+					   "Watchdog update");
+			if (ret < 0) {
+				dev_err(wm831x->dev,
+					"Failed to request update GPIO: %d\n",
+					ret);
+				goto err;
+			}
+
+			ret = gpio_direction_output(pdata->update_gpio, 0);
+			if (ret != 0) {
+				dev_err(wm831x->dev,
+					"gpio_direction_output returned: %d\n",
+					ret);
+				goto err_gpio;
+			}
+
+			update_gpio = pdata->update_gpio;
+
+			/* Make sure the watchdog takes hardware updates */
+			reg |= WM831X_WDOG_RST_SRC;
+		}
+
+		ret = wm831x_reg_unlock(wm831x);
+		if (ret == 0) {
+			ret = wm831x_reg_write(wm831x, WM831X_WATCHDOG, reg);
+			wm831x_reg_lock(wm831x);
+		} else {
+			dev_err(wm831x->dev,
+				"Failed to unlock security key: %d\n", ret);
+			goto err_gpio;
+		}
+	}
+
+	wm831x_wdt_miscdev.parent = &pdev->dev;
+
+	ret = misc_register(&wm831x_wdt_miscdev);
+	if (ret != 0) {
+		dev_err(wm831x->dev, "Failed to register miscdev: %d\n", ret);
+		goto err_gpio;
+	}
+
+	return 0;
+
+err_gpio:
+	if (update_gpio) {
+		gpio_free(update_gpio);
+		update_gpio = 0;
+	}
+err:
+	return ret;
+}
+
+static int __devexit wm831x_wdt_remove(struct platform_device *pdev)
+{
+	if (update_gpio) {
+		gpio_free(update_gpio);
+		update_gpio = 0;
+	}
+
+	misc_deregister(&wm831x_wdt_miscdev);
+
+	return 0;
+}
+
+static struct platform_driver wm831x_wdt_driver = {
+	.probe = wm831x_wdt_probe,
+	.remove = __devexit_p(wm831x_wdt_remove),
+	.driver = {
+		.name = "wm831x-watchdog",
+	},
+};
+
+static int __init wm831x_wdt_init(void)
+{
+	return platform_driver_register(&wm831x_wdt_driver);
+}
+module_init(wm831x_wdt_init);
+
+static void __exit wm831x_wdt_exit(void)
+{
+	platform_driver_unregister(&wm831x_wdt_driver);
+}
+module_exit(wm831x_wdt_exit);
+
+MODULE_AUTHOR("Mark Brown");
+MODULE_DESCRIPTION("WM831x Watchdog");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:wm831x-watchdog");
diff --git a/include/linux/mfd/wm831x/watchdog.h b/include/linux/mfd/wm831x/watchdog.h
new file mode 100644
index 000000000000..97a99b52956f
--- /dev/null
+++ b/include/linux/mfd/wm831x/watchdog.h
@@ -0,0 +1,52 @@
+/*
+ * include/linux/mfd/wm831x/watchdog.h -- Watchdog for WM831x
+ *
+ * Copyright 2009 Wolfson Microelectronics PLC.
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __MFD_WM831X_WATCHDOG_H__
+#define __MFD_WM831X_WATCHDOG_H__
+
+
+/*
+ * R16388 (0x4004) - Watchdog
+ */
+#define WM831X_WDOG_ENA                         0x8000  /* WDOG_ENA */
+#define WM831X_WDOG_ENA_MASK                    0x8000  /* WDOG_ENA */
+#define WM831X_WDOG_ENA_SHIFT                       15  /* WDOG_ENA */
+#define WM831X_WDOG_ENA_WIDTH                        1  /* WDOG_ENA */
+#define WM831X_WDOG_DEBUG                       0x4000  /* WDOG_DEBUG */
+#define WM831X_WDOG_DEBUG_MASK                  0x4000  /* WDOG_DEBUG */
+#define WM831X_WDOG_DEBUG_SHIFT                     14  /* WDOG_DEBUG */
+#define WM831X_WDOG_DEBUG_WIDTH                      1  /* WDOG_DEBUG */
+#define WM831X_WDOG_RST_SRC                     0x2000  /* WDOG_RST_SRC */
+#define WM831X_WDOG_RST_SRC_MASK                0x2000  /* WDOG_RST_SRC */
+#define WM831X_WDOG_RST_SRC_SHIFT                   13  /* WDOG_RST_SRC */
+#define WM831X_WDOG_RST_SRC_WIDTH                    1  /* WDOG_RST_SRC */
+#define WM831X_WDOG_SLPENA                      0x1000  /* WDOG_SLPENA */
+#define WM831X_WDOG_SLPENA_MASK                 0x1000  /* WDOG_SLPENA */
+#define WM831X_WDOG_SLPENA_SHIFT                    12  /* WDOG_SLPENA */
+#define WM831X_WDOG_SLPENA_WIDTH                     1  /* WDOG_SLPENA */
+#define WM831X_WDOG_RESET                       0x0800  /* WDOG_RESET */
+#define WM831X_WDOG_RESET_MASK                  0x0800  /* WDOG_RESET */
+#define WM831X_WDOG_RESET_SHIFT                     11  /* WDOG_RESET */
+#define WM831X_WDOG_RESET_WIDTH                      1  /* WDOG_RESET */
+#define WM831X_WDOG_SECACT_MASK                 0x0300  /* WDOG_SECACT - [9:8] */
+#define WM831X_WDOG_SECACT_SHIFT                     8  /* WDOG_SECACT - [9:8] */
+#define WM831X_WDOG_SECACT_WIDTH                     2  /* WDOG_SECACT - [9:8] */
+#define WM831X_WDOG_PRIMACT_MASK                0x0030  /* WDOG_PRIMACT - [5:4] */
+#define WM831X_WDOG_PRIMACT_SHIFT                    4  /* WDOG_PRIMACT - [5:4] */
+#define WM831X_WDOG_PRIMACT_WIDTH                    2  /* WDOG_PRIMACT - [5:4] */
+#define WM831X_WDOG_TO_MASK                     0x0007  /* WDOG_TO - [2:0] */
+#define WM831X_WDOG_TO_SHIFT                         0  /* WDOG_TO - [2:0] */
+#define WM831X_WDOG_TO_WIDTH                         3  /* WDOG_TO - [2:0] */
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 6952b61de9984073289859073e8195ad0bee8fd5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 18 Sep 2009 23:55:55 +0400
Subject: headers: taskstats_kern.h trim

Remove net/genetlink.h inclusion, now sched.c won't be recompiled
because of some networking changes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h      | 1 -
 include/linux/taskstats_kern.h | 1 -
 kernel/delayacct.c             | 1 +
 mm/memory.c                    | 1 +
 4 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index f352f06fa063..5076fe0c8a96 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -18,7 +18,6 @@
 #define _LINUX_DELAYACCT_H
 
 #include <linux/sched.h>
-#include <linux/taskstats_kern.h>
 
 /*
  * Per-task flags relevant to delay accounting
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 7e9680f4afdd..3398f4553269 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -9,7 +9,6 @@
 
 #include <linux/taskstats.h>
 #include <linux/sched.h>
-#include <net/genetlink.h>
 
 #ifdef CONFIG_TASKSTATS
 extern struct kmem_cache *taskstats_cache;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/taskstats.h>
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce3aba4..e8f63d9961ea 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include <asm/io.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
-- 
cgit v1.2.3-71-gd317


From 5622f295b53fb60dbf9bed3e2c89d182490a8b7f Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 15 Sep 2009 13:00:23 +0200
Subject: x86, perf_counter, bts: Optimize BTS overflow handling

Draining the BTS buffer on a buffer overflow interrupt takes too
long resulting in a kernel lockup when tracing the kernel.

Restructure perf_counter sampling into sample creation and sample
output.

Prepare a single reference sample for BTS sampling and update the
from and to address fields when draining the BTS buffer. Drain the
entire BTS buffer between a single perf_output_begin() /
perf_output_end() pair.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090915130023.A16204@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  60 ++++---
 include/linux/perf_counter.h       |  68 +++++++-
 kernel/perf_counter.c              | 312 ++++++++++++++++++++-----------------
 3 files changed, 266 insertions(+), 174 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f9cd0849bd42..6a0e71b38126 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -36,10 +36,10 @@ static u64 perf_counter_mask __read_mostly;
 #define BTS_RECORD_SIZE		24
 
 /* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 1024)
+#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)
 
 /* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 64)
+#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)
 
 
 /*
@@ -1488,8 +1488,7 @@ void perf_counter_print_debug(void)
 	local_irq_restore(flags);
 }
 
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
-				       struct perf_sample_data *data)
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
 {
 	struct debug_store *ds = cpuc->ds;
 	struct bts_record {
@@ -1498,8 +1497,11 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 		u64	flags;
 	};
 	struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
-	unsigned long orig_ip = data->regs->ip;
 	struct bts_record *at, *top;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct pt_regs regs;
 
 	if (!counter)
 		return;
@@ -1510,19 +1512,38 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
 	top = (struct bts_record *)(unsigned long)ds->bts_index;
 
+	if (top <= at)
+		return;
+
 	ds->bts_index = ds->bts_buffer_base;
 
+
+	data.period	= counter->hw.last_period;
+	data.addr	= 0;
+	regs.ip		= 0;
+
+	/*
+	 * Prepare a generic sample, i.e. fill in the invariant fields.
+	 * We will overwrite the from and to address before we output
+	 * the sample.
+	 */
+	perf_prepare_sample(&header, &data, counter, &regs);
+
+	if (perf_output_begin(&handle, counter,
+			      header.size * (top - at), 1, 1))
+		return;
+
 	for (; at < top; at++) {
-		data->regs->ip	= at->from;
-		data->addr	= at->to;
+		data.ip		= at->from;
+		data.addr	= at->to;
 
-		perf_counter_output(counter, 1, data);
+		perf_output_sample(&handle, &header, &data, counter);
 	}
 
-	data->regs->ip	= orig_ip;
-	data->addr	= 0;
+	perf_output_end(&handle);
 
 	/* There's new data available. */
+	counter->hw.interrupts++;
 	counter->pending_kill = POLL_IN;
 }
 
@@ -1552,13 +1573,9 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	x86_perf_counter_update(counter, hwc, idx);
 
 	/* Drain the remaining BTS records. */
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
-		struct perf_sample_data data;
-		struct pt_regs regs;
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+		intel_pmu_drain_bts_buffer(cpuc);
 
-		data.regs = &regs;
-		intel_pmu_drain_bts_buffer(cpuc, &data);
-	}
 	cpuc->counters[idx] = NULL;
 	clear_bit(idx, cpuc->used_mask);
 
@@ -1619,7 +1636,6 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1644,7 +1660,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			p6_pmu_disable_counter(hwc, idx);
 	}
 
@@ -1665,13 +1681,12 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	int bit, loops;
 	u64 ack, status;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 
 	perf_disable();
-	intel_pmu_drain_bts_buffer(cpuc, &data);
+	intel_pmu_drain_bts_buffer(cpuc);
 	status = intel_pmu_get_status();
 	if (!status) {
 		perf_enable();
@@ -1702,7 +1717,7 @@ again:
 
 		data.period = counter->hw.last_period;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -1729,7 +1744,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1754,7 +1768,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6c1ef72ea501..c7375f97aa19 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -691,6 +691,17 @@ struct perf_cpu_context {
 	int				recursion[4];
 };
 
+struct perf_output_handle {
+	struct perf_counter	*counter;
+	struct perf_mmap_data	*data;
+	unsigned long		head;
+	unsigned long		offset;
+	int			nmi;
+	int			sample;
+	int			locked;
+	unsigned long		flags;
+};
+
 #ifdef CONFIG_PERF_COUNTERS
 
 /*
@@ -722,16 +733,38 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 struct perf_sample_data {
-	struct pt_regs			*regs;
+	u64				type;
+
+	u64				ip;
+	struct {
+		u32	pid;
+		u32	tid;
+	}				tid_entry;
+	u64				time;
 	u64				addr;
+	u64				id;
+	u64				stream_id;
+	struct {
+		u32	cpu;
+		u32	reserved;
+	}				cpu_entry;
 	u64				period;
+	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 };
 
+extern void perf_output_sample(struct perf_output_handle *handle,
+			       struct perf_event_header *header,
+			       struct perf_sample_data *data,
+			       struct perf_counter *counter);
+extern void perf_prepare_sample(struct perf_event_header *header,
+				struct perf_sample_data *data,
+				struct perf_counter *counter,
+				struct pt_regs *regs);
+
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
-				 struct perf_sample_data *data);
-extern void perf_counter_output(struct perf_counter *counter, int nmi,
-				struct perf_sample_data *data);
+				 struct perf_sample_data *data,
+				 struct pt_regs *regs);
 
 /*
  * Return 1 for a software counter, 0 for a hardware counter
@@ -781,6 +814,12 @@ extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
 #define perf_instruction_pointer(regs)	instruction_pointer(regs)
 #endif
 
+extern int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi, int sample);
+extern void perf_output_end(struct perf_output_handle *handle);
+extern void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len);
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -807,7 +846,28 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ }
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
+
+static inline int
+perf_output_begin(struct perf_output_handle *handle, struct perf_counter *c,
+		  unsigned int size, int nmi, int sample)		{ }
+static inline void perf_output_end(struct perf_output_handle *handle)	{ }
+static inline void
+perf_output_copy(struct perf_output_handle *handle,
+		 const void *buf, unsigned int len)			{ }
+static inline void
+perf_output_sample(struct perf_output_handle *handle,
+		   struct perf_event_header *header,
+		   struct perf_sample_data *data,
+		   struct perf_counter *counter)			{ }
+static inline void
+perf_prepare_sample(struct perf_event_header *header,
+		    struct perf_sample_data *data,
+		    struct perf_counter *counter,
+		    struct pt_regs *regs)				{ }
 #endif
 
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b73b6e8146..215845243a69 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2512,18 +2512,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 /*
  * Output
  */
-
-struct perf_output_handle {
-	struct perf_counter	*counter;
-	struct perf_mmap_data	*data;
-	unsigned long		head;
-	unsigned long		offset;
-	int			nmi;
-	int			sample;
-	int			locked;
-	unsigned long		flags;
-};
-
 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
 			      unsigned long offset, unsigned long head)
 {
@@ -2633,8 +2621,8 @@ out:
 	local_irq_restore(handle->flags);
 }
 
-static void perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len)
+void perf_output_copy(struct perf_output_handle *handle,
+		      const void *buf, unsigned int len)
 {
 	unsigned int pages_mask;
 	unsigned int offset;
@@ -2669,12 +2657,9 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
-#define perf_output_put(handle, x) \
-	perf_output_copy((handle), &(x), sizeof(x))
-
-static int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int sample)
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_counter *counter, unsigned int size,
+		      int nmi, int sample)
 {
 	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
@@ -2756,7 +2741,7 @@ out:
 	return -ENOSPC;
 }
 
-static void perf_output_end(struct perf_output_handle *handle)
+void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
 	struct perf_mmap_data *data = handle->data;
@@ -2870,82 +2855,151 @@ static void perf_output_read(struct perf_output_handle *handle,
 		perf_output_read_one(handle, counter);
 }
 
-void perf_counter_output(struct perf_counter *counter, int nmi,
-				struct perf_sample_data *data)
+void perf_output_sample(struct perf_output_handle *handle,
+			struct perf_event_header *header,
+			struct perf_sample_data *data,
+			struct perf_counter *counter)
+{
+	u64 sample_type = data->type;
+
+	perf_output_put(handle, *header);
+
+	if (sample_type & PERF_SAMPLE_IP)
+		perf_output_put(handle, data->ip);
+
+	if (sample_type & PERF_SAMPLE_TID)
+		perf_output_put(handle, data->tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		perf_output_put(handle, data->time);
+
+	if (sample_type & PERF_SAMPLE_ADDR)
+		perf_output_put(handle, data->addr);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(handle, data->id);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		perf_output_put(handle, data->stream_id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		perf_output_put(handle, data->cpu_entry);
+
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		perf_output_put(handle, data->period);
+
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(handle, counter);
+
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+		if (data->callchain) {
+			int size = 1;
+
+			if (data->callchain)
+				size += data->callchain->nr;
+
+			size *= sizeof(u64);
+
+			perf_output_copy(handle, data->callchain, size);
+		} else {
+			u64 nr = 0;
+			perf_output_put(handle, nr);
+		}
+	}
+
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(handle, data->raw->size);
+			perf_output_copy(handle, data->raw->data,
+					 data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(handle, raw);
+		}
+	}
+}
+
+void perf_prepare_sample(struct perf_event_header *header,
+			 struct perf_sample_data *data,
+			 struct perf_counter *counter,
+			 struct pt_regs *regs)
 {
-	int ret;
 	u64 sample_type = counter->attr.sample_type;
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	u64 ip;
-	struct {
-		u32 pid, tid;
-	} tid_entry;
-	struct perf_callchain_entry *callchain = NULL;
-	int callchain_size = 0;
-	u64 time;
-	struct {
-		u32 cpu, reserved;
-	} cpu_entry;
 
-	header.type = PERF_EVENT_SAMPLE;
-	header.size = sizeof(header);
+	data->type = sample_type;
 
-	header.misc = 0;
-	header.misc |= perf_misc_flags(data->regs);
+	header->type = PERF_EVENT_SAMPLE;
+	header->size = sizeof(*header);
+
+	header->misc = 0;
+	header->misc |= perf_misc_flags(regs);
 
 	if (sample_type & PERF_SAMPLE_IP) {
-		ip = perf_instruction_pointer(data->regs);
-		header.size += sizeof(ip);
+		data->ip = perf_instruction_pointer(regs);
+
+		header->size += sizeof(data->ip);
 	}
 
 	if (sample_type & PERF_SAMPLE_TID) {
 		/* namespace issues */
-		tid_entry.pid = perf_counter_pid(counter, current);
-		tid_entry.tid = perf_counter_tid(counter, current);
+		data->tid_entry.pid = perf_counter_pid(counter, current);
+		data->tid_entry.tid = perf_counter_tid(counter, current);
 
-		header.size += sizeof(tid_entry);
+		header->size += sizeof(data->tid_entry);
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME) {
 		/*
 		 * Maybe do better on x86 and provide cpu_clock_nmi()
 		 */
-		time = sched_clock();
+		data->time = sched_clock();
 
-		header.size += sizeof(u64);
+		header->size += sizeof(data->time);
 	}
 
 	if (sample_type & PERF_SAMPLE_ADDR)
-		header.size += sizeof(u64);
+		header->size += sizeof(data->addr);
 
-	if (sample_type & PERF_SAMPLE_ID)
-		header.size += sizeof(u64);
+	if (sample_type & PERF_SAMPLE_ID) {
+		data->id = primary_counter_id(counter);
 
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		header.size += sizeof(u64);
+		header->size += sizeof(data->id);
+	}
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID) {
+		data->stream_id = counter->id;
+
+		header->size += sizeof(data->stream_id);
+	}
 
 	if (sample_type & PERF_SAMPLE_CPU) {
-		header.size += sizeof(cpu_entry);
+		data->cpu_entry.cpu		= raw_smp_processor_id();
+		data->cpu_entry.reserved	= 0;
 
-		cpu_entry.cpu = raw_smp_processor_id();
-		cpu_entry.reserved = 0;
+		header->size += sizeof(data->cpu_entry);
 	}
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-		header.size += sizeof(u64);
+		header->size += sizeof(data->period);
 
 	if (sample_type & PERF_SAMPLE_READ)
-		header.size += perf_counter_read_size(counter);
+		header->size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		callchain = perf_callchain(data->regs);
+		int size = 1;
 
-		if (callchain) {
-			callchain_size = (1 + callchain->nr) * sizeof(u64);
-			header.size += callchain_size;
-		} else
-			header.size += sizeof(u64);
+		data->callchain = perf_callchain(regs);
+
+		if (data->callchain)
+			size += data->callchain->nr;
+
+		header->size += size * sizeof(u64);
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
@@ -2957,69 +3011,23 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 			size += sizeof(u32);
 
 		WARN_ON_ONCE(size & (sizeof(u64)-1));
-		header.size += size;
-	}
-
-	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
-	if (ret)
-		return;
-
-	perf_output_put(&handle, header);
-
-	if (sample_type & PERF_SAMPLE_IP)
-		perf_output_put(&handle, ip);
-
-	if (sample_type & PERF_SAMPLE_TID)
-		perf_output_put(&handle, tid_entry);
-
-	if (sample_type & PERF_SAMPLE_TIME)
-		perf_output_put(&handle, time);
-
-	if (sample_type & PERF_SAMPLE_ADDR)
-		perf_output_put(&handle, data->addr);
-
-	if (sample_type & PERF_SAMPLE_ID) {
-		u64 id = primary_counter_id(counter);
-
-		perf_output_put(&handle, id);
+		header->size += size;
 	}
+}
 
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		perf_output_put(&handle, counter->id);
-
-	if (sample_type & PERF_SAMPLE_CPU)
-		perf_output_put(&handle, cpu_entry);
-
-	if (sample_type & PERF_SAMPLE_PERIOD)
-		perf_output_put(&handle, data->period);
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	struct perf_output_handle handle;
+	struct perf_event_header header;
 
-	if (sample_type & PERF_SAMPLE_READ)
-		perf_output_read(&handle, counter);
+	perf_prepare_sample(&header, data, counter, regs);
 
-	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		if (callchain)
-			perf_output_copy(&handle, callchain, callchain_size);
-		else {
-			u64 nr = 0;
-			perf_output_put(&handle, nr);
-		}
-	}
+	if (perf_output_begin(&handle, counter, header.size, nmi, 1))
+		return;
 
-	if (sample_type & PERF_SAMPLE_RAW) {
-		if (data->raw) {
-			perf_output_put(&handle, data->raw->size);
-			perf_output_copy(&handle, data->raw->data, data->raw->size);
-		} else {
-			struct {
-				u32	size;
-				u32	data;
-			} raw = {
-				.size = sizeof(u32),
-				.data = 0,
-			};
-			perf_output_put(&handle, raw);
-		}
-	}
+	perf_output_sample(&handle, &header, data, counter);
 
 	perf_output_end(&handle);
 }
@@ -3501,7 +3509,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
  */
 
 static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
-				   int throttle, struct perf_sample_data *data)
+				   int throttle, struct perf_sample_data *data,
+				   struct pt_regs *regs)
 {
 	int events = atomic_read(&counter->event_limit);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -3557,14 +3566,15 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
 			perf_counter_disable(counter);
 	}
 
-	perf_counter_output(counter, nmi, data);
+	perf_counter_output(counter, nmi, data, regs);
 	return ret;
 }
 
 int perf_counter_overflow(struct perf_counter *counter, int nmi,
-			  struct perf_sample_data *data)
+			  struct perf_sample_data *data,
+			  struct pt_regs *regs)
 {
-	return __perf_counter_overflow(counter, nmi, 1, data);
+	return __perf_counter_overflow(counter, nmi, 1, data, regs);
 }
 
 /*
@@ -3602,7 +3612,8 @@ again:
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
+				    int nmi, struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int throttle = 0;
@@ -3615,7 +3626,8 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 		return;
 
 	for (; overflow; overflow--) {
-		if (__perf_counter_overflow(counter, nmi, throttle, data)) {
+		if (__perf_counter_overflow(counter, nmi, throttle,
+					    data, regs)) {
 			/*
 			 * We inhibit the overflow from happening when
 			 * hwc->interrupts == MAX_INTERRUPTS.
@@ -3634,7 +3646,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter)
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
+			       int nmi, struct perf_sample_data *data,
+			       struct pt_regs *regs)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 
@@ -3643,11 +3656,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 	if (!hwc->sample_period)
 		return;
 
-	if (!data->regs)
+	if (!regs)
 		return;
 
 	if (!atomic64_add_negative(nr, &hwc->period_left))
-		perf_swcounter_overflow(counter, nmi, data);
+		perf_swcounter_overflow(counter, nmi, data, regs);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3706,7 +3719,8 @@ static int perf_swcounter_match(struct perf_counter *counter,
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_type_id type,
 				     u32 event, u64 nr, int nmi,
-				     struct perf_sample_data *data)
+				     struct perf_sample_data *data,
+				     struct pt_regs *regs)
 {
 	struct perf_counter *counter;
 
@@ -3715,8 +3729,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, type, event, data->regs))
-			perf_swcounter_add(counter, nr, nmi, data);
+		if (perf_swcounter_match(counter, type, event, regs))
+			perf_swcounter_add(counter, nr, nmi, data, regs);
 	}
 	rcu_read_unlock();
 }
@@ -3737,7 +3751,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 
 static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 				    u64 nr, int nmi,
-				    struct perf_sample_data *data)
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3750,7 +3765,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 	barrier();
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-				 nr, nmi, data);
+				 nr, nmi, data, regs);
 	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
@@ -3758,7 +3773,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 	 */
 	ctx = rcu_dereference(current->perf_counter_ctxp);
 	if (ctx)
-		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
 	rcu_read_unlock();
 
 	barrier();
@@ -3772,11 +3787,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data = {
-		.regs = regs,
 		.addr = addr,
 	};
 
-	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
+				&data, regs);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -3813,6 +3828,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
 	enum hrtimer_restart ret = HRTIMER_RESTART;
 	struct perf_sample_data data;
+	struct pt_regs *regs;
 	struct perf_counter *counter;
 	u64 period;
 
@@ -3820,17 +3836,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	counter->pmu->read(counter);
 
 	data.addr = 0;
-	data.regs = get_irq_regs();
+	regs = get_irq_regs();
 	/*
 	 * In case we exclude kernel IPs or are somehow not in interrupt
 	 * context, provide the next best thing, the user IP.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
+	if ((counter->attr.exclude_kernel || !regs) &&
 			!counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+		regs = task_pt_regs(current);
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
+	if (regs) {
+		if (perf_counter_overflow(counter, 0, &data, regs))
 			ret = HRTIMER_NORESTART;
 	}
 
@@ -3966,15 +3982,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
 	};
 
 	struct perf_sample_data data = {
-		.regs = get_irq_regs(),
 		.addr = addr,
 		.raw = &raw,
 	};
 
-	if (!data.regs)
-		data.regs = task_pt_regs(current);
+	struct pt_regs *regs = get_irq_regs();
+
+	if (!regs)
+		regs = task_pt_regs(current);
 
-	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+				&data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
-- 
cgit v1.2.3-71-gd317


From fc5377668c3d808e1d53c4aee152c836f55c3490 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 17 Sep 2009 19:35:28 +0200
Subject: tracing: Remove markers

Now that the last users of markers have migrated to the event
tracer we can kill off the (now orphan) support code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090917173527.GA1699@lst.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/markers.txt                 | 104 ----
 arch/powerpc/platforms/cell/spufs/file.c  |   1 -
 arch/powerpc/platforms/cell/spufs/sched.c |   1 -
 include/linux/kvm_host.h                  |   1 -
 include/linux/marker.h                    | 221 -------
 include/linux/module.h                    |  11 -
 init/Kconfig                              |   7 -
 kernel/Makefile                           |   1 -
 kernel/marker.c                           | 930 ------------------------------
 kernel/module.c                           |  18 -
 kernel/trace/trace_printk.c               |   1 -
 samples/Kconfig                           |   6 -
 samples/Makefile                          |   2 +-
 samples/markers/Makefile                  |   4 -
 samples/markers/marker-example.c          |  53 --
 samples/markers/probe-example.c           |  92 ---
 scripts/Makefile.modpost                  |  12 -
 17 files changed, 1 insertion(+), 1464 deletions(-)
 delete mode 100644 Documentation/markers.txt
 delete mode 100644 include/linux/marker.h
 delete mode 100644 kernel/marker.c
 delete mode 100644 samples/markers/Makefile
 delete mode 100644 samples/markers/marker-example.c
 delete mode 100644 samples/markers/probe-example.c

(limited to 'include/linux')

diff --git a/Documentation/markers.txt b/Documentation/markers.txt
deleted file mode 100644
index d2b3d0e91b26..000000000000
--- a/Documentation/markers.txt
+++ /dev/null
@@ -1,104 +0,0 @@
- 	             Using the Linux Kernel Markers
-
-			    Mathieu Desnoyers
-
-
-This document introduces Linux Kernel Markers and their use. It provides
-examples of how to insert markers in the kernel and connect probe functions to
-them and provides some examples of probe functions.
-
-
-* Purpose of markers
-
-A marker placed in code provides a hook to call a function (probe) that you can
-provide at runtime. A marker can be "on" (a probe is connected to it) or "off"
-(no probe is attached). When a marker is "off" it has no effect, except for
-adding a tiny time penalty (checking a condition for a branch) and space
-penalty (adding a few bytes for the function call at the end of the
-instrumented function and adds a data structure in a separate section).  When a
-marker is "on", the function you provide is called each time the marker is
-executed, in the execution context of the caller. When the function provided
-ends its execution, it returns to the caller (continuing from the marker site).
-
-You can put markers at important locations in the code. Markers are
-lightweight hooks that can pass an arbitrary number of parameters,
-described in a printk-like format string, to the attached probe function.
-
-They can be used for tracing and performance accounting.
-
-
-* Usage
-
-In order to use the macro trace_mark, you should include linux/marker.h.
-
-#include <linux/marker.h>
-
-And,
-
-trace_mark(subsystem_event, "myint %d mystring %s", someint, somestring);
-Where :
-- subsystem_event is an identifier unique to your event
-    - subsystem is the name of your subsystem.
-    - event is the name of the event to mark.
-- "myint %d mystring %s" is the formatted string for the serializer. "myint" and
-  "mystring" are repectively the field names associated with the first and
-  second parameter.
-- someint is an integer.
-- somestring is a char pointer.
-
-Connecting a function (probe) to a marker is done by providing a probe (function
-to call) for the specific marker through marker_probe_register() and can be
-activated by calling marker_arm(). Marker deactivation can be done by calling
-marker_disarm() as many times as marker_arm() has been called. Removing a probe
-is done through marker_probe_unregister(); it will disarm the probe.
-
-marker_synchronize_unregister() must be called between probe unregistration and
-the first occurrence of
-- the end of module exit function,
-  to make sure there is no caller left using the probe;
-- the free of any resource used by the probes,
-  to make sure the probes wont be accessing invalid data.
-This, and the fact that preemption is disabled around the probe call, make sure
-that probe removal and module unload are safe. See the "Probe example" section
-below for a sample probe module.
-
-The marker mechanism supports inserting multiple instances of the same marker.
-Markers can be put in inline functions, inlined static functions, and
-unrolled loops as well as regular functions.
-
-The naming scheme "subsystem_event" is suggested here as a convention intended
-to limit collisions. Marker names are global to the kernel: they are considered
-as being the same whether they are in the core kernel image or in modules.
-Conflicting format strings for markers with the same name will cause the markers
-to be detected to have a different format string not to be armed and will output
-a printk warning which identifies the inconsistency:
-
-"Format mismatch for probe probe_name (format), marker (format)"
-
-Another way to use markers is to simply define the marker without generating any
-function call to actually call into the marker. This is useful in combination
-with tracepoint probes in a scheme like this :
-
-void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk);
-
-DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name,
-	"arg1 %u pid %d");
-
-notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk)
-{
-	struct marker *marker = &GET_MARKER(kernel_irq_entry);
-	/* write data to trace buffers ... */
-}
-
-* Probe / marker example
-
-See the example provided in samples/markers/src
-
-Compile them with your kernel.
-
-Run, as root :
-modprobe marker-example (insmod order is not important)
-modprobe probe-example
-cat /proc/marker-example (returns an expected error)
-rmmod marker-example probe-example
-dmesg
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index ab8aef9bb8ea..8f079b865ad0 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -29,7 +29,6 @@
 #include <linux/poll.h>
 #include <linux/ptrace.h>
 #include <linux/seq_file.h>
-#include <linux/marker.h>
 
 #include <asm/io.h>
 #include <asm/time.h>
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index bb5b77c66d05..4678078fede8 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -39,7 +39,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/marker.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4af56036a6bf..b7bbb5ddd7ae 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -15,7 +15,6 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/preempt.h>
-#include <linux/marker.h>
 #include <linux/msi.h>
 #include <asm/signal.h>
 
diff --git a/include/linux/marker.h b/include/linux/marker.h
deleted file mode 100644
index b85e74ca782f..000000000000
--- a/include/linux/marker.h
+++ /dev/null
@@ -1,221 +0,0 @@
-#ifndef _LINUX_MARKER_H
-#define _LINUX_MARKER_H
-
-/*
- * Code markup for dynamic and static tracing.
- *
- * See Documentation/marker.txt.
- *
- * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
- *
- * This file is released under the GPLv2.
- * See the file COPYING for more details.
- */
-
-#include <stdarg.h>
-#include <linux/types.h>
-
-struct module;
-struct marker;
-
-/**
- * marker_probe_func - Type of a marker probe function
- * @probe_private: probe private data
- * @call_private: call site private data
- * @fmt: format string
- * @args: variable argument list pointer. Use a pointer to overcome C's
- *        inability to pass this around as a pointer in a portable manner in
- *        the callee otherwise.
- *
- * Type of marker probe functions. They receive the mdata and need to parse the
- * format string to recover the variable argument list.
- */
-typedef void marker_probe_func(void *probe_private, void *call_private,
-		const char *fmt, va_list *args);
-
-struct marker_probe_closure {
-	marker_probe_func *func;	/* Callback */
-	void *probe_private;		/* Private probe data */
-};
-
-struct marker {
-	const char *name;	/* Marker name */
-	const char *format;	/* Marker format string, describing the
-				 * variable argument list.
-				 */
-	char state;		/* Marker state. */
-	char ptype;		/* probe type : 0 : single, 1 : multi */
-				/* Probe wrapper */
-	void (*call)(const struct marker *mdata, void *call_private, ...);
-	struct marker_probe_closure single;
-	struct marker_probe_closure *multi;
-	const char *tp_name;	/* Optional tracepoint name */
-	void *tp_cb;		/* Optional tracepoint callback */
-} __attribute__((aligned(8)));
-
-#ifdef CONFIG_MARKERS
-
-#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format)		\
-		static const char __mstrtab_##name[]			\
-		__attribute__((section("__markers_strings")))		\
-		= #name "\0" format;					\
-		static struct marker __mark_##name			\
-		__attribute__((section("__markers"), aligned(8))) =	\
-		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
-		  0, 0, marker_probe_cb, { __mark_empty_function, NULL},\
-		  NULL, tp_name_str, tp_cb }
-
-#define DEFINE_MARKER(name, format)					\
-		_DEFINE_MARKER(name, NULL, NULL, format)
-
-#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format)			\
-		_DEFINE_MARKER(name, #tp_name, tp_cb, format)
-
-/*
- * Note : the empty asm volatile with read constraint is used here instead of a
- * "used" attribute to fix a gcc 4.1.x bug.
- * Make sure the alignment of the structure in the __markers section will
- * not add unwanted padding between the beginning of the section and the
- * structure. Force alignment to the same alignment as the section start.
- *
- * The "generic" argument controls which marker enabling mechanism must be used.
- * If generic is true, a variable read is used.
- * If generic is false, immediate values are used.
- */
-#define __trace_mark(generic, name, call_private, format, args...)	\
-	do {								\
-		DEFINE_MARKER(name, format);				\
-		__mark_check_format(format, ## args);			\
-		if (unlikely(__mark_##name.state)) {			\
-			(*__mark_##name.call)				\
-				(&__mark_##name, call_private, ## args);\
-		}							\
-	} while (0)
-
-#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
-	do {								\
-		void __check_tp_type(void)				\
-		{							\
-			register_trace_##tp_name(tp_cb);		\
-		}							\
-		DEFINE_MARKER_TP(name, tp_name, tp_cb, format);		\
-		__mark_check_format(format, ## args);			\
-		(*__mark_##name.call)(&__mark_##name, call_private,	\
-					## args);			\
-	} while (0)
-
-extern void marker_update_probe_range(struct marker *begin,
-	struct marker *end);
-
-#define GET_MARKER(name)	(__mark_##name)
-
-#else /* !CONFIG_MARKERS */
-#define DEFINE_MARKER(name, tp_name, tp_cb, format)
-#define __trace_mark(generic, name, call_private, format, args...) \
-		__mark_check_format(format, ## args)
-#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
-	do {								\
-		void __check_tp_type(void)				\
-		{							\
-			register_trace_##tp_name(tp_cb);		\
-		}							\
-		__mark_check_format(format, ## args);			\
-	} while (0)
-static inline void marker_update_probe_range(struct marker *begin,
-	struct marker *end)
-{ }
-#define GET_MARKER(name)
-#endif /* CONFIG_MARKERS */
-
-/**
- * trace_mark - Marker using code patching
- * @name: marker name, not quoted.
- * @format: format string
- * @args...: variable argument list
- *
- * Places a marker using optimized code patching technique (imv_read())
- * to be enabled when immediate values are present.
- */
-#define trace_mark(name, format, args...) \
-	__trace_mark(0, name, NULL, format, ## args)
-
-/**
- * _trace_mark - Marker using variable read
- * @name: marker name, not quoted.
- * @format: format string
- * @args...: variable argument list
- *
- * Places a marker using a standard memory read (_imv_read()) to be
- * enabled. Should be used for markers in code paths where instruction
- * modification based enabling is not welcome. (__init and __exit functions,
- * lockdep, some traps, printk).
- */
-#define _trace_mark(name, format, args...) \
-	__trace_mark(1, name, NULL, format, ## args)
-
-/**
- * trace_mark_tp - Marker in a tracepoint callback
- * @name: marker name, not quoted.
- * @tp_name: tracepoint name, not quoted.
- * @tp_cb: tracepoint callback. Should have an associated global symbol so it
- *         is not optimized away by the compiler (should not be static).
- * @format: format string
- * @args...: variable argument list
- *
- * Places a marker in a tracepoint callback.
- */
-#define trace_mark_tp(name, tp_name, tp_cb, format, args...)	\
-	__trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args)
-
-/**
- * MARK_NOARGS - Format string for a marker with no argument.
- */
-#define MARK_NOARGS " "
-
-/* To be used for string format validity checking with gcc */
-static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...)
-{
-}
-
-#define __mark_check_format(format, args...)				\
-	do {								\
-		if (0)							\
-			___mark_check_format(format, ## args);		\
-	} while (0)
-
-extern marker_probe_func __mark_empty_function;
-
-extern void marker_probe_cb(const struct marker *mdata,
-	void *call_private, ...);
-
-/*
- * Connect a probe to a marker.
- * private data pointer must be a valid allocated memory address, or NULL.
- */
-extern int marker_probe_register(const char *name, const char *format,
-				marker_probe_func *probe, void *probe_private);
-
-/*
- * Returns the private data given to marker_probe_register.
- */
-extern int marker_probe_unregister(const char *name,
-	marker_probe_func *probe, void *probe_private);
-/*
- * Unregister a marker by providing the registered private data.
- */
-extern int marker_probe_unregister_private_data(marker_probe_func *probe,
-	void *probe_private);
-
-extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
-	int num);
-
-/*
- * marker_synchronize_unregister must be called between the last marker probe
- * unregistration and the first one of
- * - the end of module exit function
- * - the free of any resource used by the probes
- * to ensure the code and data are valid for any possibly running probes.
- */
-#define marker_synchronize_unregister() synchronize_sched()
-
-#endif
diff --git a/include/linux/module.h b/include/linux/module.h
index f8f92d015efe..1c755b2f937d 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -15,7 +15,6 @@
 #include <linux/stringify.h>
 #include <linux/kobject.h>
 #include <linux/moduleparam.h>
-#include <linux/marker.h>
 #include <linux/tracepoint.h>
 
 #include <asm/local.h>
@@ -327,10 +326,6 @@ struct module
 	/* The command line arguments (may be mangled).  People like
 	   keeping pointers to this stuff */
 	char *args;
-#ifdef CONFIG_MARKERS
-	struct marker *markers;
-	unsigned int num_markers;
-#endif
 #ifdef CONFIG_TRACEPOINTS
 	struct tracepoint *tracepoints;
 	unsigned int num_tracepoints;
@@ -535,8 +530,6 @@ int unregister_module_notifier(struct notifier_block * nb);
 
 extern void print_modules(void);
 
-extern void module_update_markers(void);
-
 extern void module_update_tracepoints(void);
 extern int module_get_iter_tracepoints(struct tracepoint_iter *iter);
 
@@ -651,10 +644,6 @@ static inline void print_modules(void)
 {
 }
 
-static inline void module_update_markers(void)
-{
-}
-
 static inline void module_update_tracepoints(void)
 {
 }
diff --git a/init/Kconfig b/init/Kconfig
index 8e8b76d8a272..4cc0fa13d5eb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1054,13 +1054,6 @@ config PROFILING
 config TRACEPOINTS
 	bool
 
-config MARKERS
-	bool "Activate markers"
-	select TRACEPOINTS
-	help
-	  Place an empty function call at each marker site. Can be
-	  dynamically changed for a probe function.
-
 source "arch/Kconfig"
 
 config SLOW_WORK
diff --git a/kernel/Makefile b/kernel/Makefile
index 3d9c7e27e3f9..7c9b0a585502 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
-obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
-/*
- * Copyright (C) 2007 Mathieu Desnoyers
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- */
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/types.h>
-#include <linux/jhash.h>
-#include <linux/list.h>
-#include <linux/rcupdate.h>
-#include <linux/marker.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-extern struct marker __start___markers[];
-extern struct marker __stop___markers[];
-
-/* Set to 1 to enable marker debug output */
-static const int marker_debug;
-
-/*
- * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
- * and module markers and the hash table.
- */
-static DEFINE_MUTEX(markers_mutex);
-
-/*
- * Marker hash table, containing the active markers.
- * Protected by module_mutex.
- */
-#define MARKER_HASH_BITS 6
-#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
-static struct hlist_head marker_table[MARKER_TABLE_SIZE];
-
-/*
- * Note about RCU :
- * It is used to make sure every handler has finished using its private data
- * between two consecutive operation (add or remove) on a given marker.  It is
- * also used to delay the free of multiple probes array until a quiescent state
- * is reached.
- * marker entries modifications are protected by the markers_mutex.
- */
-struct marker_entry {
-	struct hlist_node hlist;
-	char *format;
-			/* Probe wrapper */
-	void (*call)(const struct marker *mdata, void *call_private, ...);
-	struct marker_probe_closure single;
-	struct marker_probe_closure *multi;
-	int refcount;	/* Number of times armed. 0 if disarmed. */
-	struct rcu_head rcu;
-	void *oldptr;
-	int rcu_pending;
-	unsigned char ptype:1;
-	unsigned char format_allocated:1;
-	char name[0];	/* Contains name'\0'format'\0' */
-};
-
-/**
- * __mark_empty_function - Empty probe callback
- * @probe_private: probe private data
- * @call_private: call site private data
- * @fmt: format string
- * @...: variable argument list
- *
- * Empty callback provided as a probe to the markers. By providing this to a
- * disabled marker, we make sure the  execution flow is always valid even
- * though the function pointer change and the marker enabling are two distinct
- * operations that modifies the execution flow of preemptible code.
- */
-notrace void __mark_empty_function(void *probe_private, void *call_private,
-	const char *fmt, va_list *args)
-{
-}
-EXPORT_SYMBOL_GPL(__mark_empty_function);
-
-/*
- * marker_probe_cb Callback that prepares the variable argument list for probes.
- * @mdata: pointer of type struct marker
- * @call_private: caller site private data
- * @...:  Variable argument list.
- *
- * Since we do not use "typical" pointer based RCU in the 1 argument case, we
- * need to put a full smp_rmb() in this branch. This is why we do not use
- * rcu_dereference() for the pointer read.
- */
-notrace void marker_probe_cb(const struct marker *mdata,
-		void *call_private, ...)
-{
-	va_list args;
-	char ptype;
-
-	/*
-	 * rcu_read_lock_sched does two things : disabling preemption to make
-	 * sure the teardown of the callbacks can be done correctly when they
-	 * are in modules and they insure RCU read coherency.
-	 */
-	rcu_read_lock_sched_notrace();
-	ptype = mdata->ptype;
-	if (likely(!ptype)) {
-		marker_probe_func *func;
-		/* Must read the ptype before ptr. They are not data dependant,
-		 * so we put an explicit smp_rmb() here. */
-		smp_rmb();
-		func = mdata->single.func;
-		/* Must read the ptr before private data. They are not data
-		 * dependant, so we put an explicit smp_rmb() here. */
-		smp_rmb();
-		va_start(args, call_private);
-		func(mdata->single.probe_private, call_private, mdata->format,
-			&args);
-		va_end(args);
-	} else {
-		struct marker_probe_closure *multi;
-		int i;
-		/*
-		 * Read mdata->ptype before mdata->multi.
-		 */
-		smp_rmb();
-		multi = mdata->multi;
-		/*
-		 * multi points to an array, therefore accessing the array
-		 * depends on reading multi. However, even in this case,
-		 * we must insure that the pointer is read _before_ the array
-		 * data. Same as rcu_dereference, but we need a full smp_rmb()
-		 * in the fast path, so put the explicit barrier here.
-		 */
-		smp_read_barrier_depends();
-		for (i = 0; multi[i].func; i++) {
-			va_start(args, call_private);
-			multi[i].func(multi[i].probe_private, call_private,
-				mdata->format, &args);
-			va_end(args);
-		}
-	}
-	rcu_read_unlock_sched_notrace();
-}
-EXPORT_SYMBOL_GPL(marker_probe_cb);
-
-/*
- * marker_probe_cb Callback that does not prepare the variable argument list.
- * @mdata: pointer of type struct marker
- * @call_private: caller site private data
- * @...:  Variable argument list.
- *
- * Should be connected to markers "MARK_NOARGS".
- */
-static notrace void marker_probe_cb_noarg(const struct marker *mdata,
-		void *call_private, ...)
-{
-	va_list args;	/* not initialized */
-	char ptype;
-
-	rcu_read_lock_sched_notrace();
-	ptype = mdata->ptype;
-	if (likely(!ptype)) {
-		marker_probe_func *func;
-		/* Must read the ptype before ptr. They are not data dependant,
-		 * so we put an explicit smp_rmb() here. */
-		smp_rmb();
-		func = mdata->single.func;
-		/* Must read the ptr before private data. They are not data
-		 * dependant, so we put an explicit smp_rmb() here. */
-		smp_rmb();
-		func(mdata->single.probe_private, call_private, mdata->format,
-			&args);
-	} else {
-		struct marker_probe_closure *multi;
-		int i;
-		/*
-		 * Read mdata->ptype before mdata->multi.
-		 */
-		smp_rmb();
-		multi = mdata->multi;
-		/*
-		 * multi points to an array, therefore accessing the array
-		 * depends on reading multi. However, even in this case,
-		 * we must insure that the pointer is read _before_ the array
-		 * data. Same as rcu_dereference, but we need a full smp_rmb()
-		 * in the fast path, so put the explicit barrier here.
-		 */
-		smp_read_barrier_depends();
-		for (i = 0; multi[i].func; i++)
-			multi[i].func(multi[i].probe_private, call_private,
-				mdata->format, &args);
-	}
-	rcu_read_unlock_sched_notrace();
-}
-
-static void free_old_closure(struct rcu_head *head)
-{
-	struct marker_entry *entry = container_of(head,
-		struct marker_entry, rcu);
-	kfree(entry->oldptr);
-	/* Make sure we free the data before setting the pending flag to 0 */
-	smp_wmb();
-	entry->rcu_pending = 0;
-}
-
-static void debug_print_probes(struct marker_entry *entry)
-{
-	int i;
-
-	if (!marker_debug)
-		return;
-
-	if (!entry->ptype) {
-		printk(KERN_DEBUG "Single probe : %p %p\n",
-			entry->single.func,
-			entry->single.probe_private);
-	} else {
-		for (i = 0; entry->multi[i].func; i++)
-			printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
-				entry->multi[i].func,
-				entry->multi[i].probe_private);
-	}
-}
-
-static struct marker_probe_closure *
-marker_entry_add_probe(struct marker_entry *entry,
-		marker_probe_func *probe, void *probe_private)
-{
-	int nr_probes = 0;
-	struct marker_probe_closure *old, *new;
-
-	WARN_ON(!probe);
-
-	debug_print_probes(entry);
-	old = entry->multi;
-	if (!entry->ptype) {
-		if (entry->single.func == probe &&
-				entry->single.probe_private == probe_private)
-			return ERR_PTR(-EBUSY);
-		if (entry->single.func == __mark_empty_function) {
-			/* 0 -> 1 probes */
-			entry->single.func = probe;
-			entry->single.probe_private = probe_private;
-			entry->refcount = 1;
-			entry->ptype = 0;
-			debug_print_probes(entry);
-			return NULL;
-		} else {
-			/* 1 -> 2 probes */
-			nr_probes = 1;
-			old = NULL;
-		}
-	} else {
-		/* (N -> N+1), (N != 0, 1) probes */
-		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
-			if (old[nr_probes].func == probe
-					&& old[nr_probes].probe_private
-						== probe_private)
-				return ERR_PTR(-EBUSY);
-	}
-	/* + 2 : one for new probe, one for NULL func */
-	new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
-			GFP_KERNEL);
-	if (new == NULL)
-		return ERR_PTR(-ENOMEM);
-	if (!old)
-		new[0] = entry->single;
-	else
-		memcpy(new, old,
-			nr_probes * sizeof(struct marker_probe_closure));
-	new[nr_probes].func = probe;
-	new[nr_probes].probe_private = probe_private;
-	entry->refcount = nr_probes + 1;
-	entry->multi = new;
-	entry->ptype = 1;
-	debug_print_probes(entry);
-	return old;
-}
-
-static struct marker_probe_closure *
-marker_entry_remove_probe(struct marker_entry *entry,
-		marker_probe_func *probe, void *probe_private)
-{
-	int nr_probes = 0, nr_del = 0, i;
-	struct marker_probe_closure *old, *new;
-
-	old = entry->multi;
-
-	debug_print_probes(entry);
-	if (!entry->ptype) {
-		/* 0 -> N is an error */
-		WARN_ON(entry->single.func == __mark_empty_function);
-		/* 1 -> 0 probes */
-		WARN_ON(probe && entry->single.func != probe);
-		WARN_ON(entry->single.probe_private != probe_private);
-		entry->single.func = __mark_empty_function;
-		entry->refcount = 0;
-		entry->ptype = 0;
-		debug_print_probes(entry);
-		return NULL;
-	} else {
-		/* (N -> M), (N > 1, M >= 0) probes */
-		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
-			if ((!probe || old[nr_probes].func == probe)
-					&& old[nr_probes].probe_private
-						== probe_private)
-				nr_del++;
-		}
-	}
-
-	if (nr_probes - nr_del == 0) {
-		/* N -> 0, (N > 1) */
-		entry->single.func = __mark_empty_function;
-		entry->refcount = 0;
-		entry->ptype = 0;
-	} else if (nr_probes - nr_del == 1) {
-		/* N -> 1, (N > 1) */
-		for (i = 0; old[i].func; i++)
-			if ((probe && old[i].func != probe) ||
-					old[i].probe_private != probe_private)
-				entry->single = old[i];
-		entry->refcount = 1;
-		entry->ptype = 0;
-	} else {
-		int j = 0;
-		/* N -> M, (N > 1, M > 1) */
-		/* + 1 for NULL */
-		new = kzalloc((nr_probes - nr_del + 1)
-			* sizeof(struct marker_probe_closure), GFP_KERNEL);
-		if (new == NULL)
-			return ERR_PTR(-ENOMEM);
-		for (i = 0; old[i].func; i++)
-			if ((probe && old[i].func != probe) ||
-					old[i].probe_private != probe_private)
-				new[j++] = old[i];
-		entry->refcount = nr_probes - nr_del;
-		entry->ptype = 1;
-		entry->multi = new;
-	}
-	debug_print_probes(entry);
-	return old;
-}
-
-/*
- * Get marker if the marker is present in the marker hash table.
- * Must be called with markers_mutex held.
- * Returns NULL if not present.
- */
-static struct marker_entry *get_marker(const char *name)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct marker_entry *e;
-	u32 hash = jhash(name, strlen(name), 0);
-
-	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-	hlist_for_each_entry(e, node, head, hlist) {
-		if (!strcmp(name, e->name))
-			return e;
-	}
-	return NULL;
-}
-
-/*
- * Add the marker to the marker hash table. Must be called with markers_mutex
- * held.
- */
-static struct marker_entry *add_marker(const char *name, const char *format)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct marker_entry *e;
-	size_t name_len = strlen(name) + 1;
-	size_t format_len = 0;
-	u32 hash = jhash(name, name_len-1, 0);
-
-	if (format)
-		format_len = strlen(format) + 1;
-	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-	hlist_for_each_entry(e, node, head, hlist) {
-		if (!strcmp(name, e->name)) {
-			printk(KERN_NOTICE
-				"Marker %s busy\n", name);
-			return ERR_PTR(-EBUSY);	/* Already there */
-		}
-	}
-	/*
-	 * Using kmalloc here to allocate a variable length element. Could
-	 * cause some memory fragmentation if overused.
-	 */
-	e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
-			GFP_KERNEL);
-	if (!e)
-		return ERR_PTR(-ENOMEM);
-	memcpy(&e->name[0], name, name_len);
-	if (format) {
-		e->format = &e->name[name_len];
-		memcpy(e->format, format, format_len);
-		if (strcmp(e->format, MARK_NOARGS) == 0)
-			e->call = marker_probe_cb_noarg;
-		else
-			e->call = marker_probe_cb;
-		trace_mark(core_marker_format, "name %s format %s",
-				e->name, e->format);
-	} else {
-		e->format = NULL;
-		e->call = marker_probe_cb;
-	}
-	e->single.func = __mark_empty_function;
-	e->single.probe_private = NULL;
-	e->multi = NULL;
-	e->ptype = 0;
-	e->format_allocated = 0;
-	e->refcount = 0;
-	e->rcu_pending = 0;
-	hlist_add_head(&e->hlist, head);
-	return e;
-}
-
-/*
- * Remove the marker from the marker hash table. Must be called with mutex_lock
- * held.
- */
-static int remove_marker(const char *name)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct marker_entry *e;
-	int found = 0;
-	size_t len = strlen(name) + 1;
-	u32 hash = jhash(name, len-1, 0);
-
-	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-	hlist_for_each_entry(e, node, head, hlist) {
-		if (!strcmp(name, e->name)) {
-			found = 1;
-			break;
-		}
-	}
-	if (!found)
-		return -ENOENT;
-	if (e->single.func != __mark_empty_function)
-		return -EBUSY;
-	hlist_del(&e->hlist);
-	if (e->format_allocated)
-		kfree(e->format);
-	/* Make sure the call_rcu has been executed */
-	if (e->rcu_pending)
-		rcu_barrier_sched();
-	kfree(e);
-	return 0;
-}
-
-/*
- * Set the mark_entry format to the format found in the element.
- */
-static int marker_set_format(struct marker_entry *entry, const char *format)
-{
-	entry->format = kstrdup(format, GFP_KERNEL);
-	if (!entry->format)
-		return -ENOMEM;
-	entry->format_allocated = 1;
-
-	trace_mark(core_marker_format, "name %s format %s",
-			entry->name, entry->format);
-	return 0;
-}
-
-/*
- * Sets the probe callback corresponding to one marker.
- */
-static int set_marker(struct marker_entry *entry, struct marker *elem,
-		int active)
-{
-	int ret = 0;
-	WARN_ON(strcmp(entry->name, elem->name) != 0);
-
-	if (entry->format) {
-		if (strcmp(entry->format, elem->format) != 0) {
-			printk(KERN_NOTICE
-				"Format mismatch for probe %s "
-				"(%s), marker (%s)\n",
-				entry->name,
-				entry->format,
-				elem->format);
-			return -EPERM;
-		}
-	} else {
-		ret = marker_set_format(entry, elem->format);
-		if (ret)
-			return ret;
-	}
-
-	/*
-	 * probe_cb setup (statically known) is done here. It is
-	 * asynchronous with the rest of execution, therefore we only
-	 * pass from a "safe" callback (with argument) to an "unsafe"
-	 * callback (does not set arguments).
-	 */
-	elem->call = entry->call;
-	/*
-	 * Sanity check :
-	 * We only update the single probe private data when the ptr is
-	 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
-	 */
-	WARN_ON(elem->single.func != __mark_empty_function
-		&& elem->single.probe_private != entry->single.probe_private
-		&& !elem->ptype);
-	elem->single.probe_private = entry->single.probe_private;
-	/*
-	 * Make sure the private data is valid when we update the
-	 * single probe ptr.
-	 */
-	smp_wmb();
-	elem->single.func = entry->single.func;
-	/*
-	 * We also make sure that the new probe callbacks array is consistent
-	 * before setting a pointer to it.
-	 */
-	rcu_assign_pointer(elem->multi, entry->multi);
-	/*
-	 * Update the function or multi probe array pointer before setting the
-	 * ptype.
-	 */
-	smp_wmb();
-	elem->ptype = entry->ptype;
-
-	if (elem->tp_name && (active ^ elem->state)) {
-		WARN_ON(!elem->tp_cb);
-		/*
-		 * It is ok to directly call the probe registration because type
-		 * checking has been done in the __trace_mark_tp() macro.
-		 */
-
-		if (active) {
-			/*
-			 * try_module_get should always succeed because we hold
-			 * lock_module() to get the tp_cb address.
-			 */
-			ret = try_module_get(__module_text_address(
-				(unsigned long)elem->tp_cb));
-			BUG_ON(!ret);
-			ret = tracepoint_probe_register_noupdate(
-				elem->tp_name,
-				elem->tp_cb);
-		} else {
-			ret = tracepoint_probe_unregister_noupdate(
-				elem->tp_name,
-				elem->tp_cb);
-			/*
-			 * tracepoint_probe_update_all() must be called
-			 * before the module containing tp_cb is unloaded.
-			 */
-			module_put(__module_text_address(
-				(unsigned long)elem->tp_cb));
-		}
-	}
-	elem->state = active;
-
-	return ret;
-}
-
-/*
- * Disable a marker and its probe callback.
- * Note: only waiting an RCU period after setting elem->call to the empty
- * function insures that the original callback is not used anymore. This insured
- * by rcu_read_lock_sched around the call site.
- */
-static void disable_marker(struct marker *elem)
-{
-	int ret;
-
-	/* leave "call" as is. It is known statically. */
-	if (elem->tp_name && elem->state) {
-		WARN_ON(!elem->tp_cb);
-		/*
-		 * It is ok to directly call the probe registration because type
-		 * checking has been done in the __trace_mark_tp() macro.
-		 */
-		ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
-			elem->tp_cb);
-		WARN_ON(ret);
-		/*
-		 * tracepoint_probe_update_all() must be called
-		 * before the module containing tp_cb is unloaded.
-		 */
-		module_put(__module_text_address((unsigned long)elem->tp_cb));
-	}
-	elem->state = 0;
-	elem->single.func = __mark_empty_function;
-	/* Update the function before setting the ptype */
-	smp_wmb();
-	elem->ptype = 0;	/* single probe */
-	/*
-	 * Leave the private data and id there, because removal is racy and
-	 * should be done only after an RCU period. These are never used until
-	 * the next initialization anyway.
-	 */
-}
-
-/**
- * marker_update_probe_range - Update a probe range
- * @begin: beginning of the range
- * @end: end of the range
- *
- * Updates the probe callback corresponding to a range of markers.
- */
-void marker_update_probe_range(struct marker *begin,
-	struct marker *end)
-{
-	struct marker *iter;
-	struct marker_entry *mark_entry;
-
-	mutex_lock(&markers_mutex);
-	for (iter = begin; iter < end; iter++) {
-		mark_entry = get_marker(iter->name);
-		if (mark_entry) {
-			set_marker(mark_entry, iter, !!mark_entry->refcount);
-			/*
-			 * ignore error, continue
-			 */
-		} else {
-			disable_marker(iter);
-		}
-	}
-	mutex_unlock(&markers_mutex);
-}
-
-/*
- * Update probes, removing the faulty probes.
- *
- * Internal callback only changed before the first probe is connected to it.
- * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
- * transitions.  All other transitions will leave the old private data valid.
- * This makes the non-atomicity of the callback/private data updates valid.
- *
- * "special case" updates :
- * 0 -> 1 callback
- * 1 -> 0 callback
- * 1 -> 2 callbacks
- * 2 -> 1 callbacks
- * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
- * Site effect : marker_set_format may delete the marker entry (creating a
- * replacement).
- */
-static void marker_update_probes(void)
-{
-	/* Core kernel markers */
-	marker_update_probe_range(__start___markers, __stop___markers);
-	/* Markers in modules. */
-	module_update_markers();
-	tracepoint_probe_update_all();
-}
-
-/**
- * marker_probe_register -  Connect a probe to a marker
- * @name: marker name
- * @format: format string
- * @probe: probe handler
- * @probe_private: probe private data
- *
- * private data must be a valid allocated memory address, or NULL.
- * Returns 0 if ok, error value on error.
- * The probe address must at least be aligned on the architecture pointer size.
- */
-int marker_probe_register(const char *name, const char *format,
-			marker_probe_func *probe, void *probe_private)
-{
-	struct marker_entry *entry;
-	int ret = 0;
-	struct marker_probe_closure *old;
-
-	mutex_lock(&markers_mutex);
-	entry = get_marker(name);
-	if (!entry) {
-		entry = add_marker(name, format);
-		if (IS_ERR(entry))
-			ret = PTR_ERR(entry);
-	} else if (format) {
-		if (!entry->format)
-			ret = marker_set_format(entry, format);
-		else if (strcmp(entry->format, format))
-			ret = -EPERM;
-	}
-	if (ret)
-		goto end;
-
-	/*
-	 * If we detect that a call_rcu is pending for this marker,
-	 * make sure it's executed now.
-	 */
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	old = marker_entry_add_probe(entry, probe, probe_private);
-	if (IS_ERR(old)) {
-		ret = PTR_ERR(old);
-		goto end;
-	}
-	mutex_unlock(&markers_mutex);
-	marker_update_probes();
-	mutex_lock(&markers_mutex);
-	entry = get_marker(name);
-	if (!entry)
-		goto end;
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	entry->oldptr = old;
-	entry->rcu_pending = 1;
-	/* write rcu_pending before calling the RCU callback */
-	smp_wmb();
-	call_rcu_sched(&entry->rcu, free_old_closure);
-end:
-	mutex_unlock(&markers_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_register);
-
-/**
- * marker_probe_unregister -  Disconnect a probe from a marker
- * @name: marker name
- * @probe: probe function pointer
- * @probe_private: probe private data
- *
- * Returns the private data given to marker_probe_register, or an ERR_PTR().
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
- */
-int marker_probe_unregister(const char *name,
-	marker_probe_func *probe, void *probe_private)
-{
-	struct marker_entry *entry;
-	struct marker_probe_closure *old;
-	int ret = -ENOENT;
-
-	mutex_lock(&markers_mutex);
-	entry = get_marker(name);
-	if (!entry)
-		goto end;
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	old = marker_entry_remove_probe(entry, probe, probe_private);
-	mutex_unlock(&markers_mutex);
-	marker_update_probes();
-	mutex_lock(&markers_mutex);
-	entry = get_marker(name);
-	if (!entry)
-		goto end;
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	entry->oldptr = old;
-	entry->rcu_pending = 1;
-	/* write rcu_pending before calling the RCU callback */
-	smp_wmb();
-	call_rcu_sched(&entry->rcu, free_old_closure);
-	remove_marker(name);	/* Ignore busy error message */
-	ret = 0;
-end:
-	mutex_unlock(&markers_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_unregister);
-
-static struct marker_entry *
-get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
-{
-	struct marker_entry *entry;
-	unsigned int i;
-	struct hlist_head *head;
-	struct hlist_node *node;
-
-	for (i = 0; i < MARKER_TABLE_SIZE; i++) {
-		head = &marker_table[i];
-		hlist_for_each_entry(entry, node, head, hlist) {
-			if (!entry->ptype) {
-				if (entry->single.func == probe
-						&& entry->single.probe_private
-						== probe_private)
-					return entry;
-			} else {
-				struct marker_probe_closure *closure;
-				closure = entry->multi;
-				for (i = 0; closure[i].func; i++) {
-					if (closure[i].func == probe &&
-							closure[i].probe_private
-							== probe_private)
-						return entry;
-				}
-			}
-		}
-	}
-	return NULL;
-}
-
-/**
- * marker_probe_unregister_private_data -  Disconnect a probe from a marker
- * @probe: probe function
- * @probe_private: probe private data
- *
- * Unregister a probe by providing the registered private data.
- * Only removes the first marker found in hash table.
- * Return 0 on success or error value.
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
- */
-int marker_probe_unregister_private_data(marker_probe_func *probe,
-		void *probe_private)
-{
-	struct marker_entry *entry;
-	int ret = 0;
-	struct marker_probe_closure *old;
-
-	mutex_lock(&markers_mutex);
-	entry = get_marker_from_private_data(probe, probe_private);
-	if (!entry) {
-		ret = -ENOENT;
-		goto end;
-	}
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	old = marker_entry_remove_probe(entry, NULL, probe_private);
-	mutex_unlock(&markers_mutex);
-	marker_update_probes();
-	mutex_lock(&markers_mutex);
-	entry = get_marker_from_private_data(probe, probe_private);
-	if (!entry)
-		goto end;
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	entry->oldptr = old;
-	entry->rcu_pending = 1;
-	/* write rcu_pending before calling the RCU callback */
-	smp_wmb();
-	call_rcu_sched(&entry->rcu, free_old_closure);
-	remove_marker(entry->name);	/* Ignore busy error message */
-end:
-	mutex_unlock(&markers_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
-
-/**
- * marker_get_private_data - Get a marker's probe private data
- * @name: marker name
- * @probe: probe to match
- * @num: get the nth matching probe's private data
- *
- * Returns the nth private data pointer (starting from 0) matching, or an
- * ERR_PTR.
- * Returns the private data pointer, or an ERR_PTR.
- * The private data pointer should _only_ be dereferenced if the caller is the
- * owner of the data, or its content could vanish. This is mostly used to
- * confirm that a caller is the owner of a registered probe.
- */
-void *marker_get_private_data(const char *name, marker_probe_func *probe,
-		int num)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct marker_entry *e;
-	size_t name_len = strlen(name) + 1;
-	u32 hash = jhash(name, name_len-1, 0);
-	int i;
-
-	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-	hlist_for_each_entry(e, node, head, hlist) {
-		if (!strcmp(name, e->name)) {
-			if (!e->ptype) {
-				if (num == 0 && e->single.func == probe)
-					return e->single.probe_private;
-			} else {
-				struct marker_probe_closure *closure;
-				int match = 0;
-				closure = e->multi;
-				for (i = 0; closure[i].func; i++) {
-					if (closure[i].func != probe)
-						continue;
-					if (match++ == num)
-						return closure[i].probe_private;
-				}
-			}
-			break;
-		}
-	}
-	return ERR_PTR(-ENOENT);
-}
-EXPORT_SYMBOL_GPL(marker_get_private_data);
-
-#ifdef CONFIG_MODULES
-
-int marker_module_notify(struct notifier_block *self,
-			 unsigned long val, void *data)
-{
-	struct module *mod = data;
-
-	switch (val) {
-	case MODULE_STATE_COMING:
-		marker_update_probe_range(mod->markers,
-			mod->markers + mod->num_markers);
-		break;
-	case MODULE_STATE_GOING:
-		marker_update_probe_range(mod->markers,
-			mod->markers + mod->num_markers);
-		break;
-	}
-	return 0;
-}
-
-struct notifier_block marker_module_nb = {
-	.notifier_call = marker_module_notify,
-	.priority = 0,
-};
-
-static int init_markers(void)
-{
-	return register_module_notifier(&marker_module_nb);
-}
-__initcall(init_markers);
-
-#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 05ce49ced8f6..b6ee424245dd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2237,10 +2237,6 @@ static noinline struct module *load_module(void __user *umod,
 				  sizeof(*mod->ctors), &mod->num_ctors);
 #endif
 
-#ifdef CONFIG_MARKERS
-	mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
-				    sizeof(*mod->markers), &mod->num_markers);
-#endif
 #ifdef CONFIG_TRACEPOINTS
 	mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
 					"__tracepoints",
@@ -2958,20 +2954,6 @@ void module_layout(struct module *mod,
 EXPORT_SYMBOL(module_layout);
 #endif
 
-#ifdef CONFIG_MARKERS
-void module_update_markers(void)
-{
-	struct module *mod;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(mod, &modules, list)
-		if (!mod->taints)
-			marker_update_probe_range(mod->markers,
-				mod->markers + mod->num_markers);
-	mutex_unlock(&module_mutex);
-}
-#endif
-
 #ifdef CONFIG_TRACEPOINTS
 void module_update_tracepoints(void)
 {
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
 #include <linux/ftrace.h>
 #include <linux/string.h>
 #include <linux/module.h>
-#include <linux/marker.h>
 #include <linux/mutex.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
diff --git a/samples/Kconfig b/samples/Kconfig
index 428b065ba695..b92bde3c6a89 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -7,12 +7,6 @@ menuconfig SAMPLES
 
 if SAMPLES
 
-config SAMPLE_MARKERS
-	tristate "Build markers examples -- loadable modules only"
-	depends on MARKERS && m
-	help
-	  This build markers example modules.
-
 config SAMPLE_TRACEPOINTS
 	tristate "Build tracepoints examples -- loadable modules only"
 	depends on TRACEPOINTS && m
diff --git a/samples/Makefile b/samples/Makefile
index 13e4b470b539..43343a03b1f4 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,3 +1,3 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ trace_events/
+obj-$(CONFIG_SAMPLES)	+= kobject/ kprobes/ tracepoints/ trace_events/
diff --git a/samples/markers/Makefile b/samples/markers/Makefile
deleted file mode 100644
index 6d7231265f0f..000000000000
--- a/samples/markers/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# builds the kprobes example kernel modules;
-# then to use one (as root):  insmod <module_name.ko>
-
-obj-$(CONFIG_SAMPLE_MARKERS) += probe-example.o marker-example.o
diff --git a/samples/markers/marker-example.c b/samples/markers/marker-example.c
deleted file mode 100644
index e9cd9c0bc84f..000000000000
--- a/samples/markers/marker-example.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/* marker-example.c
- *
- * Executes a marker when /proc/marker-example is opened.
- *
- * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
- *
- * This file is released under the GPLv2.
- * See the file COPYING for more details.
- */
-
-#include <linux/module.h>
-#include <linux/marker.h>
-#include <linux/sched.h>
-#include <linux/proc_fs.h>
-
-struct proc_dir_entry *pentry_example;
-
-static int my_open(struct inode *inode, struct file *file)
-{
-	int i;
-
-	trace_mark(subsystem_event, "integer %d string %s", 123,
-		"example string");
-	for (i = 0; i < 10; i++)
-		trace_mark(subsystem_eventb, MARK_NOARGS);
-	return -EPERM;
-}
-
-static struct file_operations mark_ops = {
-	.open = my_open,
-};
-
-static int __init example_init(void)
-{
-	printk(KERN_ALERT "example init\n");
-	pentry_example = proc_create("marker-example", 0444, NULL, &mark_ops);
-	if (!pentry_example)
-		return -EPERM;
-	return 0;
-}
-
-static void __exit example_exit(void)
-{
-	printk(KERN_ALERT "example exit\n");
-	remove_proc_entry("marker-example", NULL);
-}
-
-module_init(example_init)
-module_exit(example_exit)
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Mathieu Desnoyers");
-MODULE_DESCRIPTION("Marker example");
diff --git a/samples/markers/probe-example.c b/samples/markers/probe-example.c
deleted file mode 100644
index 2dfb3b32937e..000000000000
--- a/samples/markers/probe-example.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/* probe-example.c
- *
- * Connects two functions to marker call sites.
- *
- * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
- *
- * This file is released under the GPLv2.
- * See the file COPYING for more details.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/marker.h>
-#include <asm/atomic.h>
-
-struct probe_data {
-	const char *name;
-	const char *format;
-	marker_probe_func *probe_func;
-};
-
-void probe_subsystem_event(void *probe_data, void *call_data,
-	const char *format, va_list *args)
-{
-	/* Declare args */
-	unsigned int value;
-	const char *mystr;
-
-	/* Assign args */
-	value = va_arg(*args, typeof(value));
-	mystr = va_arg(*args, typeof(mystr));
-
-	/* Call printk */
-	printk(KERN_INFO "Value %u, string %s\n", value, mystr);
-
-	/* or count, check rights, serialize data in a buffer */
-}
-
-atomic_t eventb_count = ATOMIC_INIT(0);
-
-void probe_subsystem_eventb(void *probe_data, void *call_data,
-	const char *format, va_list *args)
-{
-	/* Increment counter */
-	atomic_inc(&eventb_count);
-}
-
-static struct probe_data probe_array[] =
-{
-	{	.name = "subsystem_event",
-		.format = "integer %d string %s",
-		.probe_func = probe_subsystem_event },
-	{	.name = "subsystem_eventb",
-		.format = MARK_NOARGS,
-		.probe_func = probe_subsystem_eventb },
-};
-
-static int __init probe_init(void)
-{
-	int result;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
-		result = marker_probe_register(probe_array[i].name,
-				probe_array[i].format,
-				probe_array[i].probe_func, &probe_array[i]);
-		if (result)
-			printk(KERN_INFO "Unable to register probe %s\n",
-				probe_array[i].name);
-	}
-	return 0;
-}
-
-static void __exit probe_fini(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(probe_array); i++)
-		marker_probe_unregister(probe_array[i].name,
-			probe_array[i].probe_func, &probe_array[i]);
-	printk(KERN_INFO "Number of event b : %u\n",
-			atomic_read(&eventb_count));
-	marker_synchronize_unregister();
-}
-
-module_init(probe_init);
-module_exit(probe_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Mathieu Desnoyers");
-MODULE_DESCRIPTION("SUBSYSTEM Probe");
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index f4053dc7b5d6..8f14c81abbc7 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -13,7 +13,6 @@
 # 2) modpost is then used to
 # 3)  create one <module>.mod.c file pr. module
 # 4)  create one Module.symvers file with CRC for all exported symbols
-# 4a) [CONFIG_MARKERS] create one Module.markers file listing defined markers
 # 5) compile all <module>.mod.c files
 # 6) final link of the module to a <module.ko> file
 
@@ -59,10 +58,6 @@ include scripts/Makefile.lib
 
 kernelsymfile := $(objtree)/Module.symvers
 modulesymfile := $(firstword $(KBUILD_EXTMOD))/Module.symvers
-kernelmarkersfile := $(objtree)/Module.markers
-modulemarkersfile := $(firstword $(KBUILD_EXTMOD))/Module.markers
-
-markersfile = $(if $(KBUILD_EXTMOD),$(modulemarkersfile),$(kernelmarkersfile))
 
 # Step 1), find all modules listed in $(MODVERDIR)/
 __modules := $(sort $(shell grep -h '\.ko' /dev/null $(wildcard $(MODVERDIR)/*.mod)))
@@ -85,8 +80,6 @@ modpost = scripts/mod/modpost                    \
  $(if $(KBUILD_EXTRA_SYMBOLS), $(patsubst %, -e %,$(KBUILD_EXTRA_SYMBOLS))) \
  $(if $(KBUILD_EXTMOD),-o $(modulesymfile))      \
  $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S)      \
- $(if $(CONFIG_MARKERS),-K $(kernelmarkersfile)) \
- $(if $(CONFIG_MARKERS),-M $(markersfile))	 \
  $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w) \
  $(if $(cross_build),-c)
 
@@ -101,17 +94,12 @@ quiet_cmd_kernel-mod = MODPOST $@
       cmd_kernel-mod = $(modpost) $@
 
 vmlinux.o: FORCE
-	@rm -fr $(kernelmarkersfile)
 	$(call cmd,kernel-mod)
 
 # Declare generated files as targets for modpost
 $(symverfile):         __modpost ;
 $(modules:.ko=.mod.c): __modpost ;
 
-ifdef CONFIG_MARKERS
-$(markersfile):	       __modpost ;
-endif
-
 
 # Step 5), compile all *.mod.c files
 
-- 
cgit v1.2.3-71-gd317


From 6a891a3111fe701517bb31c2204304724c7299c8 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Fri, 18 Sep 2009 22:45:43 +0200
Subject: i2c: Drop unused i2c_driver.id field

Nobody is using i2c_driver.id any longer, so we can drop that field.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 include/linux/i2c-id.h | 11 -----------
 include/linux/i2c.h    |  2 --
 2 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index c9087de5c6c6..e844a0b18695 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -27,17 +27,6 @@
    legacy chip driver needs to identify a bus or a bus driver needs to
    identify a legacy client. If you don't need them, just don't set them. */
 
-/*
- * ---- Driver types -----------------------------------------------------
- */
-
-#define I2C_DRIVERID_MSP3400	 1
-#define I2C_DRIVERID_TUNER	 2
-#define I2C_DRIVERID_TDA7432	27	/* Stereo sound processor	*/
-#define I2C_DRIVERID_TVAUDIO    29      /* Generic TV sound driver      */
-#define I2C_DRIVERID_SAA711X	73	/* saa711x video encoders	*/
-#define I2C_DRIVERID_INFRARED	75	/* I2C InfraRed on Video boards */
-
 /*
  * ---- Adapter types ----------------------------------------------------
  */
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index f4784c0fe975..57d41b0abce2 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -98,7 +98,6 @@ extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client,
 
 /**
  * struct i2c_driver - represent an I2C device driver
- * @id: Unique driver ID (optional)
  * @class: What kind of i2c device we instantiate (for detect)
  * @attach_adapter: Callback for bus addition (for legacy drivers)
  * @detach_adapter: Callback for bus removal (for legacy drivers)
@@ -135,7 +134,6 @@ extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client,
  * not allowed.
  */
 struct i2c_driver {
-	int id;
 	unsigned int class;
 
 	/* Notifies the driver that a new bus has appeared or is about to be
-- 
cgit v1.2.3-71-gd317


From 76b3e28fa728bb68895cbd8375f5ce233bd891de Mon Sep 17 00:00:00 2001
From: Crane Cai <crane.cai@amd.com>
Date: Fri, 18 Sep 2009 22:45:50 +0200
Subject: i2c-piix4: Add AMD SB900 SMBus device ID

Add new SMBus device ID for AMD SB900.

Signed-off-by: Crane Cai <crane.cai@amd.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/i2c/busses/i2c-piix4 | 2 ++
 drivers/i2c/busses/Kconfig         | 3 ++-
 drivers/i2c/busses/i2c-piix4.c     | 9 ++++++---
 include/linux/pci_ids.h            | 1 +
 4 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4
index f889481762b5..c5b37c570554 100644
--- a/Documentation/i2c/busses/i2c-piix4
+++ b/Documentation/i2c/busses/i2c-piix4
@@ -8,6 +8,8 @@ Supported adapters:
     Datasheet: Only available via NDA from ServerWorks
   * ATI IXP200, IXP300, IXP400, SB600, SB700 and SB800 southbridges
     Datasheet: Not publicly available
+  * AMD SB900
+    Datasheet: Not publicly available
   * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge
     Datasheet: Publicly available at the SMSC website http://www.smsc.com
 
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 8206442fbabd..bb216c5fe30e 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -113,7 +113,7 @@ config I2C_ISCH
 	  will be called i2c-isch.
 
 config I2C_PIIX4
-	tristate "Intel PIIX4 and compatible (ATI/Serverworks/Broadcom/SMSC)"
+	tristate "Intel PIIX4 and compatible (ATI/AMD/Serverworks/Broadcom/SMSC)"
 	depends on PCI
 	help
 	  If you say yes to this option, support will be included for the Intel
@@ -128,6 +128,7 @@ config I2C_PIIX4
 	    ATI SB600
 	    ATI SB700
 	    ATI SB800
+	    AMD SB900
 	    Serverworks OSB4
 	    Serverworks CSB5
 	    Serverworks CSB6
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 0249a7d762b9..a782c7a08f9e 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -22,6 +22,7 @@
 	Intel PIIX4, 440MX
 	Serverworks OSB4, CSB5, CSB6, HT-1000, HT-1100
 	ATI IXP200, IXP300, IXP400, SB600, SB700, SB800
+	AMD SB900
 	SMSC Victory66
 
    Note: we assume there can only be one device, with one SMBus interface.
@@ -479,6 +480,7 @@ static struct pci_device_id piix4_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP300_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_SB900_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS,
 		     PCI_DEVICE_ID_SERVERWORKS_OSB4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS,
@@ -499,9 +501,10 @@ static int __devinit piix4_probe(struct pci_dev *dev,
 {
 	int retval;
 
-	if ((dev->vendor == PCI_VENDOR_ID_ATI) &&
-	    (dev->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS) &&
-	    (dev->revision >= 0x40))
+	if ((dev->vendor == PCI_VENDOR_ID_ATI &&
+	     dev->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS &&
+	     dev->revision >= 0x40) ||
+	    dev->vendor == PCI_VENDOR_ID_AMD)
 		/* base address location etc changed in SB800 */
 		retval = piix4_setup_sb800(dev, id);
 	else
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3b6b788fe2b5..7803565aa877 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -543,6 +543,7 @@
 #define PCI_DEVICE_ID_AMD_8131_BRIDGE	0x7450
 #define PCI_DEVICE_ID_AMD_8131_APIC	0x7451
 #define PCI_DEVICE_ID_AMD_8132_BRIDGE	0x7458
+#define PCI_DEVICE_ID_AMD_SB900_SMBUS	0x780b
 #define PCI_DEVICE_ID_AMD_CS5535_IDE    0x208F
 #define PCI_DEVICE_ID_AMD_CS5536_ISA    0x2090
 #define PCI_DEVICE_ID_AMD_CS5536_FLASH  0x2091
-- 
cgit v1.2.3-71-gd317


From b6e760f3097501e60e76fbcb7a313d42da930c1f Mon Sep 17 00:00:00 2001
From: Patrick Boettcher <pboettcher@dibcom.fr>
Date: Mon, 3 Aug 2009 14:39:15 -0300
Subject: V4L/DVB (12892): DVB-API: add support for ISDB-T and ISDB-Tsb
 (version 5.1)

This patch increments the DVB-API to version 5.1 in order to reflect the addition of ISDB-T and ISDB-Tsb on Linux' DVB-API.

Changes in detail:
- added a small document to describe how to use the API to tune to an ISDB-T or ISDB-Tsb channel
- added necessary fields to dtv_frontend_cache
- added a smarter clear-cache function which resets all fields of the dtv_frontend_cache
- added a TRANSMISSION_MODE_4K to fe_transmit_mode_t

Signed-off-by: Olivier Grenie <olgrenie@dibcom.fr>
Signed-off-by: Patrick Boettcher <pboettcher@dibcom.fr>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/dvb/dvb-core/dvb_frontend.c | 202 +++++++++++++++++++++++++++++-
 drivers/media/dvb/dvb-core/dvb_frontend.h |  14 +++
 include/linux/dvb/frontend.h              |  44 +++++--
 include/linux/dvb/version.h               |   2 +-
 4 files changed, 247 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index d13ebcb0c6b6..826080416c93 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -850,6 +850,49 @@ static int dvb_frontend_check_parameters(struct dvb_frontend *fe,
 	return 0;
 }
 
+static int dvb_frontend_clear_cache(struct dvb_frontend *fe)
+{
+	int i;
+
+	memset(&(fe->dtv_property_cache), 0,
+			sizeof(struct dtv_frontend_properties));
+
+	fe->dtv_property_cache.state = DTV_CLEAR;
+	fe->dtv_property_cache.delivery_system = SYS_UNDEFINED;
+	fe->dtv_property_cache.inversion = INVERSION_AUTO;
+	fe->dtv_property_cache.fec_inner = FEC_AUTO;
+	fe->dtv_property_cache.transmission_mode = TRANSMISSION_MODE_AUTO;
+	fe->dtv_property_cache.bandwidth_hz = BANDWIDTH_AUTO;
+	fe->dtv_property_cache.guard_interval = GUARD_INTERVAL_AUTO;
+	fe->dtv_property_cache.hierarchy = HIERARCHY_AUTO;
+	fe->dtv_property_cache.symbol_rate = QAM_AUTO;
+	fe->dtv_property_cache.code_rate_HP = FEC_AUTO;
+	fe->dtv_property_cache.code_rate_LP = FEC_AUTO;
+
+	fe->dtv_property_cache.isdbt_partial_reception = -1;
+	fe->dtv_property_cache.isdbt_sb_mode = -1;
+	fe->dtv_property_cache.isdbt_sb_subchannel = -1;
+	fe->dtv_property_cache.isdbt_sb_segment_idx = -1;
+	fe->dtv_property_cache.isdbt_sb_segment_count = -1;
+	fe->dtv_property_cache.isdbt_layer_enabled = 0x7;
+	for (i = 0; i < 3; i++) {
+		fe->dtv_property_cache.layer[i].fec = FEC_AUTO;
+		fe->dtv_property_cache.layer[i].modulation = QAM_AUTO;
+		fe->dtv_property_cache.layer[i].interleaving = -1;
+		fe->dtv_property_cache.layer[i].segment_count = -1;
+	}
+
+	return 0;
+}
+
+#define _DTV_CMD(n, s, b) \
+[n] = { \
+	.name = #n, \
+	.cmd  = n, \
+	.set  = s,\
+	.buffer = b \
+}
+
 static struct dtv_cmds_h dtv_cmds[] = {
 	[DTV_TUNE] = {
 		.name	= "DTV_TUNE",
@@ -949,6 +992,43 @@ static struct dtv_cmds_h dtv_cmds[] = {
 		.cmd	= DTV_TRANSMISSION_MODE,
 		.set	= 1,
 	},
+
+	_DTV_CMD(DTV_ISDBT_PARTIAL_RECEPTION, 1, 0),
+	_DTV_CMD(DTV_ISDBT_SOUND_BROADCASTING, 1, 0),
+	_DTV_CMD(DTV_ISDBT_SB_SUBCHANNEL_ID, 1, 0),
+	_DTV_CMD(DTV_ISDBT_SB_SEGMENT_IDX, 1, 0),
+	_DTV_CMD(DTV_ISDBT_SB_SEGMENT_COUNT, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_FEC, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_MODULATION, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_SEGMENT_COUNT, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_TIME_INTERLEAVING, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_FEC, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_MODULATION, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_SEGMENT_COUNT, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_TIME_INTERLEAVING, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_FEC, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_MODULATION, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_SEGMENT_COUNT, 1, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_TIME_INTERLEAVING, 1, 0),
+
+	_DTV_CMD(DTV_ISDBT_PARTIAL_RECEPTION, 0, 0),
+	_DTV_CMD(DTV_ISDBT_SOUND_BROADCASTING, 0, 0),
+	_DTV_CMD(DTV_ISDBT_SB_SUBCHANNEL_ID, 0, 0),
+	_DTV_CMD(DTV_ISDBT_SB_SEGMENT_IDX, 0, 0),
+	_DTV_CMD(DTV_ISDBT_SB_SEGMENT_COUNT, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_FEC, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_MODULATION, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_SEGMENT_COUNT, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERA_TIME_INTERLEAVING, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_FEC, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_MODULATION, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_SEGMENT_COUNT, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERB_TIME_INTERLEAVING, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_FEC, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_MODULATION, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_SEGMENT_COUNT, 0, 0),
+	_DTV_CMD(DTV_ISDBT_LAYERC_TIME_INTERLEAVING, 0, 0),
+
 	/* Get */
 	[DTV_DISEQC_SLAVE_REPLY] = {
 		.name	= "DTV_DISEQC_SLAVE_REPLY",
@@ -956,6 +1036,7 @@ static struct dtv_cmds_h dtv_cmds[] = {
 		.set	= 0,
 		.buffer	= 1,
 	},
+
 	[DTV_API_VERSION] = {
 		.name	= "DTV_API_VERSION",
 		.cmd	= DTV_API_VERSION,
@@ -1165,14 +1246,21 @@ static void dtv_property_adv_params_sync(struct dvb_frontend *fe)
 	if(c->delivery_system == SYS_ISDBT) {
 		/* Fake out a generic DVB-T request so we pass validation in the ioctl */
 		p->frequency = c->frequency;
-		p->inversion = INVERSION_AUTO;
+		p->inversion = c->inversion;
 		p->u.ofdm.constellation = QAM_AUTO;
 		p->u.ofdm.code_rate_HP = FEC_AUTO;
 		p->u.ofdm.code_rate_LP = FEC_AUTO;
-		p->u.ofdm.bandwidth = BANDWIDTH_AUTO;
 		p->u.ofdm.transmission_mode = TRANSMISSION_MODE_AUTO;
 		p->u.ofdm.guard_interval = GUARD_INTERVAL_AUTO;
 		p->u.ofdm.hierarchy_information = HIERARCHY_AUTO;
+		if (c->bandwidth_hz == 8000000)
+			p->u.ofdm.bandwidth = BANDWIDTH_8_MHZ;
+		else if (c->bandwidth_hz == 7000000)
+			p->u.ofdm.bandwidth = BANDWIDTH_7_MHZ;
+		else if (c->bandwidth_hz == 6000000)
+			p->u.ofdm.bandwidth = BANDWIDTH_6_MHZ;
+		else
+			p->u.ofdm.bandwidth = BANDWIDTH_AUTO;
 	}
 }
 
@@ -1274,6 +1362,59 @@ static int dtv_property_process_get(struct dvb_frontend *fe,
 	case DTV_HIERARCHY:
 		tvp->u.data = fe->dtv_property_cache.hierarchy;
 		break;
+
+	/* ISDB-T Support here */
+	case DTV_ISDBT_PARTIAL_RECEPTION:
+		tvp->u.data = fe->dtv_property_cache.isdbt_partial_reception;
+		break;
+	case DTV_ISDBT_SOUND_BROADCASTING:
+		tvp->u.data = fe->dtv_property_cache.isdbt_sb_mode;
+		break;
+	case DTV_ISDBT_SB_SUBCHANNEL_ID:
+		tvp->u.data = fe->dtv_property_cache.isdbt_sb_subchannel;
+		break;
+	case DTV_ISDBT_SB_SEGMENT_IDX:
+		tvp->u.data = fe->dtv_property_cache.isdbt_sb_segment_idx;
+		break;
+	case DTV_ISDBT_SB_SEGMENT_COUNT:
+		tvp->u.data = fe->dtv_property_cache.isdbt_sb_segment_count;
+		break;
+	case DTV_ISDBT_LAYERA_FEC:
+		tvp->u.data = fe->dtv_property_cache.layer[0].fec;
+		break;
+	case DTV_ISDBT_LAYERA_MODULATION:
+		tvp->u.data = fe->dtv_property_cache.layer[0].modulation;
+		break;
+	case DTV_ISDBT_LAYERA_SEGMENT_COUNT:
+		tvp->u.data = fe->dtv_property_cache.layer[0].segment_count;
+		break;
+	case DTV_ISDBT_LAYERA_TIME_INTERLEAVING:
+		tvp->u.data = fe->dtv_property_cache.layer[0].interleaving;
+		break;
+	case DTV_ISDBT_LAYERB_FEC:
+		tvp->u.data = fe->dtv_property_cache.layer[1].fec;
+		break;
+	case DTV_ISDBT_LAYERB_MODULATION:
+		tvp->u.data = fe->dtv_property_cache.layer[1].modulation;
+		break;
+	case DTV_ISDBT_LAYERB_SEGMENT_COUNT:
+		tvp->u.data = fe->dtv_property_cache.layer[1].segment_count;
+		break;
+	case DTV_ISDBT_LAYERB_TIME_INTERLEAVING:
+		tvp->u.data = fe->dtv_property_cache.layer[1].interleaving;
+		break;
+	case DTV_ISDBT_LAYERC_FEC:
+		tvp->u.data = fe->dtv_property_cache.layer[2].fec;
+		break;
+	case DTV_ISDBT_LAYERC_MODULATION:
+		tvp->u.data = fe->dtv_property_cache.layer[2].modulation;
+		break;
+	case DTV_ISDBT_LAYERC_SEGMENT_COUNT:
+		tvp->u.data = fe->dtv_property_cache.layer[2].segment_count;
+		break;
+	case DTV_ISDBT_LAYERC_TIME_INTERLEAVING:
+		tvp->u.data = fe->dtv_property_cache.layer[2].interleaving;
+		break;
 	default:
 		r = -1;
 	}
@@ -1302,10 +1443,8 @@ static int dtv_property_process_set(struct dvb_frontend *fe,
 		/* Reset a cache of data specific to the frontend here. This does
 		 * not effect hardware.
 		 */
+		dvb_frontend_clear_cache(fe);
 		dprintk("%s() Flushing property cache\n", __func__);
-		memset(&fe->dtv_property_cache, 0, sizeof(struct dtv_frontend_properties));
-		fe->dtv_property_cache.state = tvp->cmd;
-		fe->dtv_property_cache.delivery_system = SYS_UNDEFINED;
 		break;
 	case DTV_TUNE:
 		/* interpret the cache of data, build either a traditional frontend
@@ -1371,6 +1510,59 @@ static int dtv_property_process_set(struct dvb_frontend *fe,
 	case DTV_HIERARCHY:
 		fe->dtv_property_cache.hierarchy = tvp->u.data;
 		break;
+
+	/* ISDB-T Support here */
+	case DTV_ISDBT_PARTIAL_RECEPTION:
+		fe->dtv_property_cache.isdbt_partial_reception = tvp->u.data;
+		break;
+	case DTV_ISDBT_SOUND_BROADCASTING:
+		fe->dtv_property_cache.isdbt_sb_mode = tvp->u.data;
+		break;
+	case DTV_ISDBT_SB_SUBCHANNEL_ID:
+		fe->dtv_property_cache.isdbt_sb_subchannel = tvp->u.data;
+		break;
+	case DTV_ISDBT_SB_SEGMENT_IDX:
+		fe->dtv_property_cache.isdbt_sb_segment_idx = tvp->u.data;
+		break;
+	case DTV_ISDBT_SB_SEGMENT_COUNT:
+		fe->dtv_property_cache.isdbt_sb_segment_count = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERA_FEC:
+		fe->dtv_property_cache.layer[0].fec = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERA_MODULATION:
+		fe->dtv_property_cache.layer[0].modulation = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERA_SEGMENT_COUNT:
+		fe->dtv_property_cache.layer[0].segment_count = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERA_TIME_INTERLEAVING:
+		fe->dtv_property_cache.layer[0].interleaving = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERB_FEC:
+		fe->dtv_property_cache.layer[1].fec = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERB_MODULATION:
+		fe->dtv_property_cache.layer[1].modulation = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERB_SEGMENT_COUNT:
+		fe->dtv_property_cache.layer[1].segment_count = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERB_TIME_INTERLEAVING:
+		fe->dtv_property_cache.layer[1].interleaving = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERC_FEC:
+		fe->dtv_property_cache.layer[2].fec = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERC_MODULATION:
+		fe->dtv_property_cache.layer[2].modulation = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERC_SEGMENT_COUNT:
+		fe->dtv_property_cache.layer[2].segment_count = tvp->u.data;
+		break;
+	case DTV_ISDBT_LAYERC_TIME_INTERLEAVING:
+		fe->dtv_property_cache.layer[2].interleaving = tvp->u.data;
+		break;
 	default:
 		r = -1;
 	}
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.h b/drivers/media/dvb/dvb-core/dvb_frontend.h
index e176da472d7a..9e46f1772c54 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.h
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.h
@@ -341,6 +341,20 @@ struct dtv_frontend_properties {
 	fe_rolloff_t		rolloff;
 
 	fe_delivery_system_t	delivery_system;
+
+	/* ISDB-T specifics */
+	u8			isdbt_partial_reception;
+	u8			isdbt_sb_mode;
+	u8			isdbt_sb_subchannel;
+	u32			isdbt_sb_segment_idx;
+	u32			isdbt_sb_segment_count;
+	u8			isdbt_layer_enabled;
+	struct {
+	    u8			segment_count;
+	    fe_code_rate_t	fec;
+	    fe_modulation_t	modulation;
+	    u8			interleaving;
+	} layer[3];
 };
 
 struct dvb_frontend {
diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index 51c8d2d49e42..25b01c14727b 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -173,7 +173,8 @@ typedef enum fe_modulation {
 typedef enum fe_transmit_mode {
 	TRANSMISSION_MODE_2K,
 	TRANSMISSION_MODE_8K,
-	TRANSMISSION_MODE_AUTO
+	TRANSMISSION_MODE_AUTO,
+	TRANSMISSION_MODE_4K
 } fe_transmit_mode_t;
 
 typedef enum fe_bandwidth {
@@ -268,15 +269,40 @@ struct dvb_frontend_event {
 #define DTV_FE_CAPABILITY	16
 #define DTV_DELIVERY_SYSTEM	17
 
-#define DTV_API_VERSION				35
-#define DTV_API_VERSION				35
-#define DTV_CODE_RATE_HP			36
-#define DTV_CODE_RATE_LP			37
-#define DTV_GUARD_INTERVAL			38
-#define DTV_TRANSMISSION_MODE			39
-#define DTV_HIERARCHY				40
+/* ISDB-T and ISDB-Tsb */
+#define DTV_ISDBT_PARTIAL_RECEPTION	18
+#define DTV_ISDBT_SOUND_BROADCASTING	19
 
-#define DTV_MAX_COMMAND				DTV_HIERARCHY
+#define DTV_ISDBT_SB_SUBCHANNEL_ID	20
+#define DTV_ISDBT_SB_SEGMENT_IDX	21
+#define DTV_ISDBT_SB_SEGMENT_COUNT	22
+
+#define DTV_ISDBT_LAYERA_FEC			23
+#define DTV_ISDBT_LAYERA_MODULATION		24
+#define DTV_ISDBT_LAYERA_SEGMENT_COUNT		25
+#define DTV_ISDBT_LAYERA_TIME_INTERLEAVING	26
+
+#define DTV_ISDBT_LAYERB_FEC			27
+#define DTV_ISDBT_LAYERB_MODULATION		28
+#define DTV_ISDBT_LAYERB_SEGMENT_COUNT		29
+#define DTV_ISDBT_LAYERB_TIME_INTERLEAVING	30
+
+#define DTV_ISDBT_LAYERC_FEC			31
+#define DTV_ISDBT_LAYERC_MODULATION		32
+#define DTV_ISDBT_LAYERC_SEGMENT_COUNT		33
+#define DTV_ISDBT_LAYERC_TIME_INTERLEAVING	34
+
+#define DTV_API_VERSION		35
+
+#define DTV_CODE_RATE_HP	36
+#define DTV_CODE_RATE_LP	37
+#define DTV_GUARD_INTERVAL	38
+#define DTV_TRANSMISSION_MODE	39
+#define DTV_HIERARCHY		40
+
+#define DTV_ISDBT_LAYER_ENABLED	41
+
+#define DTV_MAX_COMMAND				DTV_ISDBT_LAYER_ENABLED
 
 typedef enum fe_pilot {
 	PILOT_ON,
diff --git a/include/linux/dvb/version.h b/include/linux/dvb/version.h
index 25b823b81734..540b0583d9fb 100644
--- a/include/linux/dvb/version.h
+++ b/include/linux/dvb/version.h
@@ -24,6 +24,6 @@
 #define _DVBVERSION_H_
 
 #define DVB_API_VERSION 5
-#define DVB_API_VERSION_MINOR 0
+#define DVB_API_VERSION_MINOR 1
 
 #endif /*_DVBVERSION_H_*/
-- 
cgit v1.2.3-71-gd317


From 9afef394308cf63ddb67b003a1c6036615456eb9 Mon Sep 17 00:00:00 2001
From: Steven Toth <stoth@kernellabs.com>
Date: Sat, 9 May 2009 13:24:12 -0300
Subject: V4L/DVB (12922): Add the SAA7164 I2C bus identifier

Add the SAA7164 I2C bus identifier

Signed-off-by: Steven Toth <stoth@kernellabs.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/i2c-id.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index c9087de5c6c6..271b67a3167e 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -53,6 +53,7 @@
 #define I2C_HW_B_AU0828		0x010023 /* auvitek au0828 usb bridge */
 #define I2C_HW_B_CX231XX	0x010024 /* Conexant CX231XX USB based cards */
 #define I2C_HW_B_HDPVR		0x010025 /* Hauppauge HD PVR */
+#define I2C_HW_B_SAA7164        0x010024 /* NXP 7164 based tv cards */
 
 /* --- SGI adapters							*/
 #define I2C_HW_SGI_VINO		0x160000
-- 
cgit v1.2.3-71-gd317


From c9230457a98f33fe3658fd0bd9b9ceffdd97b63b Mon Sep 17 00:00:00 2001
From: Steven Toth <stoth@kernellabs.com>
Date: Thu, 17 Sep 2009 15:05:38 -0300
Subject: V4L/DVB (12974): SAA7164: Remove the SAA7164 bus id, no longer
 required.

SAA7164: Remove the SAA7164 bus id, no longer required.

Signed-off-by: Steven Toth <stoth@kernellabs.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/saa7164/saa7164-i2c.c | 1 -
 include/linux/i2c-id.h                    | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/saa7164/saa7164-i2c.c b/drivers/media/video/saa7164/saa7164-i2c.c
index 4c431b4ac8db..8198e6833f75 100644
--- a/drivers/media/video/saa7164/saa7164-i2c.c
+++ b/drivers/media/video/saa7164/saa7164-i2c.c
@@ -116,7 +116,6 @@ static struct i2c_algorithm saa7164_i2c_algo_template = {
 static struct i2c_adapter saa7164_i2c_adap_template = {
 	.name              = "saa7164",
 	.owner             = THIS_MODULE,
-	.id                = I2C_HW_B_SAA7164,
 	.algo              = &saa7164_i2c_algo_template,
 	.client_register   = attach_inform,
 	.client_unregister = detach_inform,
diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 271b67a3167e..c9087de5c6c6 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -53,7 +53,6 @@
 #define I2C_HW_B_AU0828		0x010023 /* auvitek au0828 usb bridge */
 #define I2C_HW_B_CX231XX	0x010024 /* Conexant CX231XX USB based cards */
 #define I2C_HW_B_HDPVR		0x010025 /* Hauppauge HD PVR */
-#define I2C_HW_B_SAA7164        0x010024 /* NXP 7164 based tv cards */
 
 /* --- SGI adapters							*/
 #define I2C_HW_SGI_VINO		0x160000
-- 
cgit v1.2.3-71-gd317


From 3418f165cc1cbcb30a8c017d7ebbc774710066bb Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Tue, 25 Aug 2009 11:44:14 -0300
Subject: V4L/DVB (12511): V4L2: add a new V4L2_CID_BAND_STOP_FILTER integer
 control

Add a new V4L2_CID_BAND_STOP_FILTER integer control, which either switches the
band-stop filter off, or sets it to a certain strength.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 3689d7d81fe9..b59e78c57161 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -910,9 +910,10 @@ enum v4l2_colorfx {
 	V4L2_COLORFX_SEPIA	= 2,
 };
 #define V4L2_CID_AUTOBRIGHTNESS			(V4L2_CID_BASE+32)
+#define V4L2_CID_BAND_STOP_FILTER		(V4L2_CID_BASE+33)
 
 /* last CID + 1 */
-#define V4L2_CID_LASTP1                         (V4L2_CID_BASE+33)
+#define V4L2_CID_LASTP1                         (V4L2_CID_BASE+34)
 
 /*  MPEG-class control IDs defined by V4L2 */
 #define V4L2_CID_MPEG_BASE 			(V4L2_CTRL_CLASS_MPEG | 0x900)
-- 
cgit v1.2.3-71-gd317


From 98293ef3e54f9f2175f11b4d14c119a2ff753d61 Mon Sep 17 00:00:00 2001
From: HIRANO Takahito <hiranotaka@zng.info>
Date: Fri, 18 Sep 2009 11:17:54 -0300
Subject: V4L/DVB (12997): Add the DTV_ISDB_TS_ID property for ISDB_S

In ISDB-S, time-devision duplex is used to multiplexing several waves
in the same frequency. Each wave is identified by its own transport
stream ID, or TS ID. We need to provide some way to specify this ID
from user applications to handle ISDB-S frontends.

This code has been tested with the Earthsoft PT1 driver.

[mchehab@infradead.org: Fix merge conflicts with isdbt and rename the new parameter to DTV_ISDBS_TS_ID]
Signed-off-by: HIRANO Takahito <hiranotaka@zng.info>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/dvb/dvb-core/dvb_frontend.c | 8 ++++++++
 drivers/media/dvb/dvb-core/dvb_frontend.h | 3 +++
 include/linux/dvb/frontend.h              | 4 +++-
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index 3c9482660eac..ddf639ed2fd8 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -1031,6 +1031,8 @@ static struct dtv_cmds_h dtv_cmds[] = {
 	_DTV_CMD(DTV_ISDBT_LAYERC_SEGMENT_COUNT, 0, 0),
 	_DTV_CMD(DTV_ISDBT_LAYERC_TIME_INTERLEAVING, 0, 0),
 
+	_DTV_CMD(DTV_ISDBS_TS_ID, 1, 0),
+
 	/* Get */
 	[DTV_DISEQC_SLAVE_REPLY] = {
 		.name	= "DTV_DISEQC_SLAVE_REPLY",
@@ -1420,6 +1422,9 @@ static int dtv_property_process_get(struct dvb_frontend *fe,
 	case DTV_ISDBT_LAYERC_TIME_INTERLEAVING:
 		tvp->u.data = fe->dtv_property_cache.layer[2].interleaving;
 		break;
+	case DTV_ISDBS_TS_ID:
+		tvp->u.data = fe->dtv_property_cache.isdbs_ts_id;
+		break;
 	default:
 		r = -1;
 	}
@@ -1571,6 +1576,9 @@ static int dtv_property_process_set(struct dvb_frontend *fe,
 	case DTV_ISDBT_LAYERC_TIME_INTERLEAVING:
 		fe->dtv_property_cache.layer[2].interleaving = tvp->u.data;
 		break;
+	case DTV_ISDBS_TS_ID:
+		fe->dtv_property_cache.isdbs_ts_id = tvp->u.data;
+		break;
 	default:
 		r = -1;
 	}
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.h b/drivers/media/dvb/dvb-core/dvb_frontend.h
index 9e46f1772c54..810f07d63246 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.h
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.h
@@ -355,6 +355,9 @@ struct dtv_frontend_properties {
 	    fe_modulation_t	modulation;
 	    u8			interleaving;
 	} layer[3];
+
+	/* ISDB-T specifics */
+	u32			isdbs_ts_id;
 };
 
 struct dvb_frontend {
diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index 25b01c14727b..b6cb5425cde3 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -302,7 +302,9 @@ struct dvb_frontend_event {
 
 #define DTV_ISDBT_LAYER_ENABLED	41
 
-#define DTV_MAX_COMMAND				DTV_ISDBT_LAYER_ENABLED
+#define DTV_ISDBS_TS_ID		42
+
+#define DTV_MAX_COMMAND				DTV_ISDBS_TS_ID
 
 typedef enum fe_pilot {
 	PILOT_ON,
-- 
cgit v1.2.3-71-gd317


From eb27cae8adaa658a0bf31631baa1ce29d8183759 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Mon, 6 Jul 2009 23:40:19 -0400
Subject: ACPI: linux/acpi.h should not include linux/dmi.h

users of acpi.h that need dmi.h should include it directly.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/bus.c      | 1 +
 drivers/acpi/ec.c       | 1 +
 drivers/acpi/pci_slot.c | 1 +
 drivers/acpi/video.c    | 2 +-
 drivers/pci/dmar.c      | 1 +
 include/linux/acpi.h    | 2 --
 6 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 2876fc70c3a9..0fb6b2a8b103 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -38,6 +38,7 @@
 #include <linux/pci.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
+#include <linux/dmi.h>
 
 #include "internal.h"
 
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 391f331674c7..c434f6571ab8 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -42,6 +42,7 @@
 #include <asm/io.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
+#include <linux/dmi.h>
 
 #define ACPI_EC_CLASS			"embedded_controller"
 #define ACPI_EC_DEVICE_NAME		"Embedded Controller"
diff --git a/drivers/acpi/pci_slot.c b/drivers/acpi/pci_slot.c
index 12158e0d009b..7aa6802c0ee3 100644
--- a/drivers/acpi/pci_slot.c
+++ b/drivers/acpi/pci_slot.c
@@ -31,6 +31,7 @@
 #include <linux/acpi.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
+#include <linux/dmi.h>
 
 static int debug;
 static int check_sta_before_sun;
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index 60ea984c84a0..055a455b6c75 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -40,7 +40,7 @@
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
 #include <asm/uaccess.h>
-
+#include <linux/dmi.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
 
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 7b287cb38b7a..4484ac089772 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -33,6 +33,7 @@
 #include <linux/timer.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
+#include <linux/dmi.h>
 
 #undef PREFIX
 #define PREFIX "DMAR:"
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 34321cfffeab..7950c65c43af 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -41,8 +41,6 @@
 #include <acpi/acpi_drivers.h>
 #include <acpi/acpi_numa.h>
 #include <asm/acpi.h>
-#include <linux/dmi.h>
-
 
 enum acpi_irq_model_id {
 	ACPI_IRQ_MODEL_PIC = 0,
-- 
cgit v1.2.3-71-gd317


From e4f55966d02c5dfade2978c2aa05fb202a78a4d1 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Mon, 6 Jul 2009 23:42:10 -0400
Subject: ACPI: remove unnecessary #ifdef CONFIG_DMI

acpi_osi_setup() does not depend on CONFIG_DMI
acpi_dmi_osi_linux()'s definition doesn't depend on CONFIG_DMI either

Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/acpi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7950c65c43af..9260408bf4ea 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -217,10 +217,8 @@ static inline int acpi_video_display_switch_support(void)
 #endif /* defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) */
 
 extern int acpi_blacklisted(void);
-#ifdef CONFIG_DMI
 extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d);
 extern int acpi_osi_setup(char *str);
-#endif
 
 #ifdef CONFIG_ACPI_NUMA
 int acpi_get_pxm(acpi_handle handle);
-- 
cgit v1.2.3-71-gd317


From a71fca58b7f4abca551ae2256ac08dd9123a03f9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 18 Sep 2009 10:28:19 -0700
Subject: rcu: Fix whitespace inconsistencies

Fix a number of whitespace ^Ierrors in the include/linux/rcu*
and the kernel/rcu* files.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <20090918172819.GA24405@linux.vnet.ibm.com>
[ did more checkpatch fixlets ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rculist_nulls.h |  2 +-
 include/linux/rcupdate.h      |  6 +++---
 include/linux/rcutree.h       |  2 +-
 kernel/rcupdate.c             |  4 ++--
 kernel/rcutorture.c           | 28 +++++++++++++++-------------
 kernel/rcutree.c              |  2 +-
 kernel/rcutree.h              |  2 +-
 kernel/rcutree_plugin.h       |  3 +--
 kernel/rcutree_trace.c        |  2 +-
 9 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index f9ddd03961a8..589a40919f01 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -102,7 +102,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
  */
 #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
 	for (pos = rcu_dereference((head)->first);			 \
-		(!is_a_nulls(pos)) && 			\
+		(!is_a_nulls(pos)) &&			\
 		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference(pos->next))
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 39dce83c4865..6fe0363724e9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1,5 +1,5 @@
 /*
- * Read-Copy Update mechanism for mutual exclusion 
+ * Read-Copy Update mechanism for mutual exclusion
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@
  * Copyright IBM Corporation, 2001
  *
  * Author: Dipankar Sarma <dipankar@in.ibm.com>
- * 
+ *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
  * Papers:
@@ -26,7 +26,7 @@
  * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		http://lse.sourceforge.net/locking/rcupdate.html
+ *		http://lse.sourceforge.net/locking/rcupdate.html
  *
  */
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 00d08c0cbcc1..37682770e9d2 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -24,7 +24,7 @@
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 	Documentation/RCU
+ *	Documentation/RCU
  */
 
 #ifndef __LINUX_RCUTREE_H
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 28d2f24e7871..37ac45483082 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -19,7 +19,7 @@
  *
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
- * 
+ *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
  * Papers:
@@ -27,7 +27,7 @@
  * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		http://lse.sourceforge.net/locking/rcupdate.html
+ *		http://lse.sourceforge.net/locking/rcupdate.html
  *
  */
 #include <linux/types.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 328a8257c885..233768f21f97 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -18,7 +18,7 @@
  * Copyright (C) IBM Corporation, 2005, 2006
  *
  * Authors: Paul E. McKenney <paulmck@us.ibm.com>
- *          Josh Triplett <josh@freedesktop.org>
+ *	  Josh Triplett <josh@freedesktop.org>
  *
  * See also:  Documentation/RCU/torture.txt
  */
@@ -50,7 +50,7 @@
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
-              "Josh Triplett <josh@freedesktop.org>");
+	      "Josh Triplett <josh@freedesktop.org>");
 
 static int nreaders = -1;	/* # reader threads, defaults to 2*ncpus */
 static int nfakewriters = 4;	/* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
 };
 
 static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current = NULL;
-static long rcu_torture_current_version = 0;
+static struct rcu_torture *rcu_torture_current;
+static long rcu_torture_current_version;
 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
 static DEFINE_SPINLOCK(rcu_torture_lock);
 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
-static long n_rcu_torture_timers = 0;
+static long n_rcu_torture_timers;
 static struct list_head rcu_torture_removed;
 static cpumask_var_t shuffle_tmp_mask;
 
-static int stutter_pause_test = 0;
+static int stutter_pause_test;
 
 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
 #define RCUTORTURE_RUNNABLE_INIT 1
@@ -267,7 +267,8 @@ struct rcu_torture_ops {
 	int irq_capable;
 	char *name;
 };
-static struct rcu_torture_ops *cur_ops = NULL;
+
+static struct rcu_torture_ops *cur_ops;
 
 /*
  * Definitions for rcu torture testing.
@@ -342,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = {
 	.sync		= synchronize_rcu,
 	.cb_barrier	= rcu_barrier,
 	.stats		= NULL,
-	.irq_capable 	= 1,
-	.name 		= "rcu"
+	.irq_capable	= 1,
+	.name		= "rcu"
 };
 
 static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -641,7 +642,8 @@ rcu_torture_writer(void *arg)
 
 	do {
 		schedule_timeout_uninterruptible(1);
-		if ((rp = rcu_torture_alloc()) == NULL)
+		rp = rcu_torture_alloc();
+		if (rp == NULL)
 			continue;
 		rp->rtort_pipe_count = 0;
 		udelay(rcu_random(&rand) & 0x3ff);
@@ -1113,7 +1115,7 @@ rcu_torture_init(void)
 		printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
 		       torture_type);
 		mutex_unlock(&fullstop_mutex);
-		return (-EINVAL);
+		return -EINVAL;
 	}
 	if (cur_ops->init)
 		cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1164,7 +1166,7 @@ rcu_torture_init(void)
 		goto unwind;
 	}
 	fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
-	                           GFP_KERNEL);
+				   GFP_KERNEL);
 	if (fakewriter_tasks == NULL) {
 		VERBOSE_PRINTK_ERRSTRING("out of memory");
 		firsterr = -ENOMEM;
@@ -1173,7 +1175,7 @@ rcu_torture_init(void)
 	for (i = 0; i < nfakewriters; i++) {
 		VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
 		fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
-		                                  "rcu_torture_fakewriter");
+						  "rcu_torture_fakewriter");
 		if (IS_ERR(fakewriter_tasks[i])) {
 			firsterr = PTR_ERR(fakewriter_tasks[i]);
 			VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 1b32cdd1b2e2..52b06f6e158c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 	Documentation/RCU
+ *	Documentation/RCU
  */
 #include <linux/types.h>
 #include <linux/kernel.h>
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index bf8a6f9f134d..8e8287a983c2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -142,7 +142,7 @@ struct rcu_data {
 	 */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail[RCU_NEXT_SIZE];
-	long		qlen; 	 	/* # of queued callbacks */
+	long		qlen;		/* # of queued callbacks */
 	long		blimit;		/* Upper limit on a processed batch */
 
 #ifdef CONFIG_NO_HZ
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 09b7325baad1..1cee04f627eb 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -370,9 +370,8 @@ static void rcu_preempt_check_callbacks(int cpu)
 		rcu_preempt_qs(cpu);
 		return;
 	}
-	if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
+	if (per_cpu(rcu_preempt_data, cpu).qs_pending)
 		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
-	}
 }
 
 /*
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0ea1bff69727..c89f5e9fd173 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -20,7 +20,7 @@
  * Papers:  http://www.rdrop.com/users/paulmck/RCU
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
+ *		Documentation/RCU
  *
  */
 #include <linux/types.h>
-- 
cgit v1.2.3-71-gd317


From 393b2ad8c757ba3ccd2a99ca5bbcd6db4d3fa53d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Sep 2009 07:52:47 +0200
Subject: perf: Add a timestamp to fork events

perf timechart needs to know when a process forked, in order to be
able to visualize properly when tasks start.

This patch adds a time field to the event structure, and fills it
in appropriately.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130341.51ad2de2@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        | 11 +++++++++--
 tools/perf/util/event.h      |  1 +
 3 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c7375f97aa19..bd341007c4fc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -336,6 +336,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid, ppid;
 	 *	u32				tid, ptid;
+	 *	u64				time;
 	 * };
 	 */
 	PERF_EVENT_EXIT			= 4,
@@ -356,6 +357,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid, ppid;
 	 *	u32				tid, ptid;
+	 *	{ u64				time;     } && PERF_SAMPLE_TIME
 	 * };
 	 */
 	PERF_EVENT_FORK			= 7,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d013f4e89e9c..d5899b62b276 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3083,6 +3083,7 @@ struct perf_task_event {
 		u32				ppid;
 		u32				tid;
 		u32				ptid;
+		u64				time;
 	} event;
 };
 
@@ -3090,9 +3091,12 @@ static void perf_counter_task_output(struct perf_counter *counter,
 				     struct perf_task_event *task_event)
 {
 	struct perf_output_handle handle;
-	int size = task_event->event.header.size;
+	int size;
 	struct task_struct *task = task_event->task;
-	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+	int ret;
+
+	size  = task_event->event.header.size;
+	ret = perf_output_begin(&handle, counter, size, 0, 0);
 
 	if (ret)
 		return;
@@ -3103,7 +3107,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
 	task_event->event.tid = perf_counter_tid(counter, task);
 	task_event->event.ptid = perf_counter_tid(counter, current);
 
+	task_event->event.time = perf_clock();
+
 	perf_output_put(&handle, task_event->event);
+
 	perf_output_end(&handle);
 }
 
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 2495529cae7d..28a579f8fa8e 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -39,6 +39,7 @@ struct fork_event {
 	struct perf_event_header header;
 	u32 pid, ppid;
 	u32 tid, ptid;
+	u64 time;
 };
 
 struct lost_event {
-- 
cgit v1.2.3-71-gd317


From 778dbcc1ebea6f9a560020110987449bf4607e5f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 18 Sep 2009 12:51:44 -0700
Subject: mtd: onenand: make onenand/generic.c more generic

Remove the ARM dependency from the generic "onenand" platform device
driver.  This change makes the driver useful for other architectures as
well.  Needed for the SuperH kfr2r09 board.

Apart from the obvious Kconfig bits, the most important change is the move
away from ARM specific includes and platform data.  Together with this
change the only in-tree board code gets an update, and the driver name is
also changed gracefully break potential out of tree drivers.

The driver is also updated to allow NULL as platform data together with a
few changes to make use of resource_size() and dev_name().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Kyungmin Park <kmpark@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 arch/arm/mach-omap2/board-apollon.c |  4 ++--
 drivers/mtd/onenand/Kconfig         |  1 -
 drivers/mtd/onenand/generic.c       | 24 ++++++++++++++----------
 include/linux/mtd/onenand.h         |  8 ++++++++
 4 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/board-apollon.c b/arch/arm/mach-omap2/board-apollon.c
index dcfc20d03894..b4d4ef6d90b9 100644
--- a/arch/arm/mach-omap2/board-apollon.c
+++ b/arch/arm/mach-omap2/board-apollon.c
@@ -87,7 +87,7 @@ static struct mtd_partition apollon_partitions[] = {
 	},
 };
 
-static struct flash_platform_data apollon_flash_data = {
+static struct onenand_platform_data apollon_flash_data = {
 	.parts		= apollon_partitions,
 	.nr_parts	= ARRAY_SIZE(apollon_partitions),
 };
@@ -99,7 +99,7 @@ static struct resource apollon_flash_resource[] = {
 };
 
 static struct platform_device apollon_onenand_device = {
-	.name		= "onenand",
+	.name		= "onenand-flash",
 	.id		= -1,
 	.dev		= {
 		.platform_data	= &apollon_flash_data,
diff --git a/drivers/mtd/onenand/Kconfig b/drivers/mtd/onenand/Kconfig
index 3a094d1bf8cc..a38f580c2bb3 100644
--- a/drivers/mtd/onenand/Kconfig
+++ b/drivers/mtd/onenand/Kconfig
@@ -24,7 +24,6 @@ config MTD_ONENAND_VERIFY_WRITE
 
 config MTD_ONENAND_GENERIC
 	tristate "OneNAND Flash device via platform device driver"
-	depends on ARM
 	help
 	  Support for OneNAND flash via platform device driver.
 
diff --git a/drivers/mtd/onenand/generic.c b/drivers/mtd/onenand/generic.c
index 3a496c33fb52..e78914938c5c 100644
--- a/drivers/mtd/onenand/generic.c
+++ b/drivers/mtd/onenand/generic.c
@@ -19,12 +19,16 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/onenand.h>
 #include <linux/mtd/partitions.h>
-
 #include <asm/io.h>
-#include <asm/mach/flash.h>
-
-#define DRIVER_NAME	"onenand"
 
+/*
+ * Note: Driver name and platform data format have been updated!
+ *
+ * This version of the driver is named "onenand-flash" and takes struct
+ * onenand_platform_data as platform data. The old ARM-specific version
+ * with the name "onenand" used to take struct flash_platform_data.
+ */
+#define DRIVER_NAME	"onenand-flash"
 
 #ifdef CONFIG_MTD_PARTITIONS
 static const char *part_probes[] = { "cmdlinepart", NULL,  };
@@ -39,16 +43,16 @@ struct onenand_info {
 static int __devinit generic_onenand_probe(struct platform_device *pdev)
 {
 	struct onenand_info *info;
-	struct flash_platform_data *pdata = pdev->dev.platform_data;
+	struct onenand_platform_data *pdata = pdev->dev.platform_data;
 	struct resource *res = pdev->resource;
-	unsigned long size = res->end - res->start + 1;
+	unsigned long size = resource_size(res);
 	int err;
 
 	info = kzalloc(sizeof(struct onenand_info), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 
-	if (!request_mem_region(res->start, size, pdev->dev.driver->name)) {
+	if (!request_mem_region(res->start, size, dev_name(&pdev->dev))) {
 		err = -EBUSY;
 		goto out_free_info;
 	}
@@ -59,7 +63,7 @@ static int __devinit generic_onenand_probe(struct platform_device *pdev)
 		goto out_release_mem_region;
 	}
 
-	info->onenand.mmcontrol = pdata->mmcontrol;
+	info->onenand.mmcontrol = pdata ? pdata->mmcontrol : 0;
 	info->onenand.irq = platform_get_irq(pdev, 0);
 
 	info->mtd.name = dev_name(&pdev->dev);
@@ -75,7 +79,7 @@ static int __devinit generic_onenand_probe(struct platform_device *pdev)
 	err = parse_mtd_partitions(&info->mtd, part_probes, &info->parts, 0);
 	if (err > 0)
 		add_mtd_partitions(&info->mtd, info->parts, err);
-	else if (err <= 0 && pdata->parts)
+	else if (err <= 0 && pdata && pdata->parts)
 		add_mtd_partitions(&info->mtd, pdata->parts, pdata->nr_parts);
 	else
 #endif
@@ -99,7 +103,7 @@ static int __devexit generic_onenand_remove(struct platform_device *pdev)
 {
 	struct onenand_info *info = platform_get_drvdata(pdev);
 	struct resource *res = pdev->resource;
-	unsigned long size = res->end - res->start + 1;
+	unsigned long size = resource_size(res);
 
 	platform_set_drvdata(pdev, NULL);
 
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 8ed873374381..4e49f3350678 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -214,4 +214,12 @@ unsigned onenand_block(struct onenand_chip *this, loff_t addr);
 loff_t onenand_addr(struct onenand_chip *this, int block);
 int flexonenand_region(struct mtd_info *mtd, loff_t addr);
 
+struct mtd_partition;
+
+struct onenand_platform_data {
+	void		(*mmcontrol)(struct mtd_info *mtd, int sync_read);
+	struct mtd_partition *parts;
+	unsigned int	nr_parts;
+};
+
 #endif	/* __LINUX_MTD_ONENAND_H */
-- 
cgit v1.2.3-71-gd317


From 46a8cf2df2232c0051f29716ff8a166ebeb08daf Mon Sep 17 00:00:00 2001
From: Sneha Narnakaje <nsnehaprabha@ti.com>
Date: Fri, 18 Sep 2009 12:51:46 -0700
Subject: mtd: nand: add "page" parameter to all read_page/read_page_raw APIs

This patch adds a new "page" parameter to all NAND read_page/read_page_raw
APIs.  The read_page API for the new mode ECC_HW_OOB_FIRST requires the
page information to send the READOOB command and read the OOB area before
the data area.

Reviewed-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Sneha Narnakaje <nsnehaprabha@ti.com>
Signed-off-by: Sandeep Paulraj <s-paulraj@ti.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/atmel_nand.c    |  2 +-
 drivers/mtd/nand/cafe_nand.c     |  2 +-
 drivers/mtd/nand/fsl_elbc_nand.c |  3 ++-
 drivers/mtd/nand/nand_base.c     | 18 ++++++++++--------
 drivers/mtd/nand/sh_flctl.c      |  2 +-
 include/linux/mtd/nand.h         |  4 ++--
 6 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c
index 20c828ba9405..f8e9975c86e5 100644
--- a/drivers/mtd/nand/atmel_nand.c
+++ b/drivers/mtd/nand/atmel_nand.c
@@ -218,7 +218,7 @@ static int atmel_nand_calculate(struct mtd_info *mtd,
  * buf:        buffer to store read data
  */
 static int atmel_nand_read_page(struct mtd_info *mtd,
-		struct nand_chip *chip, uint8_t *buf)
+		struct nand_chip *chip, uint8_t *buf, int page)
 {
 	int eccsize = chip->ecc.size;
 	int eccbytes = chip->ecc.bytes;
diff --git a/drivers/mtd/nand/cafe_nand.c b/drivers/mtd/nand/cafe_nand.c
index 29acd06b1c39..a70f40e161d7 100644
--- a/drivers/mtd/nand/cafe_nand.c
+++ b/drivers/mtd/nand/cafe_nand.c
@@ -381,7 +381,7 @@ static int cafe_nand_read_oob(struct mtd_info *mtd, struct nand_chip *chip,
  * we need a special oob layout and handling.
  */
 static int cafe_nand_read_page(struct mtd_info *mtd, struct nand_chip *chip,
-			       uint8_t *buf)
+			       uint8_t *buf, int page)
 {
 	struct cafe_priv *cafe = mtd->priv;
 
diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c
index 1f6eb2578717..ddd37d2554ed 100644
--- a/drivers/mtd/nand/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/fsl_elbc_nand.c
@@ -739,7 +739,8 @@ static int fsl_elbc_chip_init_tail(struct mtd_info *mtd)
 
 static int fsl_elbc_read_page(struct mtd_info *mtd,
                               struct nand_chip *chip,
-                              uint8_t *buf)
+			      uint8_t *buf,
+			      int page)
 {
 	fsl_elbc_read_buf(mtd, buf, mtd->writesize);
 	fsl_elbc_read_buf(mtd, chip->oob_poi, mtd->oobsize);
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 4c5e8a74e1b2..17bbd5062027 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -765,7 +765,7 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip)
  * Not for syndrome calculating ecc controllers, which use a special oob layout
  */
 static int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
-			      uint8_t *buf)
+			      uint8_t *buf, int page)
 {
 	chip->read_buf(mtd, buf, mtd->writesize);
 	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -781,7 +781,7 @@ static int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
  * We need a special oob layout and handling even when OOB isn't used.
  */
 static int nand_read_page_raw_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
-			      uint8_t *buf)
+			      uint8_t *buf, int page)
 {
 	int eccsize = chip->ecc.size;
 	int eccbytes = chip->ecc.bytes;
@@ -820,7 +820,7 @@ static int nand_read_page_raw_syndrome(struct mtd_info *mtd, struct nand_chip *c
  * @buf:	buffer to store read data
  */
 static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
-				uint8_t *buf)
+				uint8_t *buf, int page)
 {
 	int i, eccsize = chip->ecc.size;
 	int eccbytes = chip->ecc.bytes;
@@ -830,7 +830,7 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 	uint8_t *ecc_code = chip->buffers->ecccode;
 	uint32_t *eccpos = chip->ecc.layout->eccpos;
 
-	chip->ecc.read_page_raw(mtd, chip, buf);
+	chip->ecc.read_page_raw(mtd, chip, buf, page);
 
 	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
 		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
@@ -943,7 +943,7 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip, uint3
  * Not for syndrome calculating ecc controllers which need a special oob layout
  */
 static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
-				uint8_t *buf)
+				uint8_t *buf, int page)
 {
 	int i, eccsize = chip->ecc.size;
 	int eccbytes = chip->ecc.bytes;
@@ -988,7 +988,7 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
  * we need a special oob layout and handling.
  */
 static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
-				   uint8_t *buf)
+				   uint8_t *buf, int page)
 {
 	int i, eccsize = chip->ecc.size;
 	int eccbytes = chip->ecc.bytes;
@@ -1130,11 +1130,13 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from,
 
 			/* Now read the page into the buffer */
 			if (unlikely(ops->mode == MTD_OOB_RAW))
-				ret = chip->ecc.read_page_raw(mtd, chip, bufpoi);
+				ret = chip->ecc.read_page_raw(mtd, chip,
+							      bufpoi, page);
 			else if (!aligned && NAND_SUBPAGE_READ(chip) && !oob)
 				ret = chip->ecc.read_subpage(mtd, chip, col, bytes, bufpoi);
 			else
-				ret = chip->ecc.read_page(mtd, chip, bufpoi);
+				ret = chip->ecc.read_page(mtd, chip, bufpoi,
+							  page);
 			if (ret < 0)
 				break;
 
diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c
index 2bc896623e2d..c5df28596a4a 100644
--- a/drivers/mtd/nand/sh_flctl.c
+++ b/drivers/mtd/nand/sh_flctl.c
@@ -329,7 +329,7 @@ static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_va
 }
 
 static int flctl_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
-				uint8_t *buf)
+				uint8_t *buf, int page)
 {
 	int i, eccsize = chip->ecc.size;
 	int eccbytes = chip->ecc.bytes;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 4030ebada49e..686f3701f2f4 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -271,13 +271,13 @@ struct nand_ecc_ctrl {
 					   uint8_t *calc_ecc);
 	int			(*read_page_raw)(struct mtd_info *mtd,
 						 struct nand_chip *chip,
-						 uint8_t *buf);
+						 uint8_t *buf, int page);
 	void			(*write_page_raw)(struct mtd_info *mtd,
 						  struct nand_chip *chip,
 						  const uint8_t *buf);
 	int			(*read_page)(struct mtd_info *mtd,
 					     struct nand_chip *chip,
-					     uint8_t *buf);
+					     uint8_t *buf, int page);
 	int			(*read_subpage)(struct mtd_info *mtd,
 					     struct nand_chip *chip,
 					     uint32_t offs, uint32_t len,
-- 
cgit v1.2.3-71-gd317


From 6e0cb135b3f3713b95ea41a11155e83a8c70f5f8 Mon Sep 17 00:00:00 2001
From: Sneha Narnakaje <nsnehaprabha@ti.com>
Date: Fri, 18 Sep 2009 12:51:47 -0700
Subject: mtd: nand: add new ECC mode - ECC_HW_OOB_FIRST

This patch adds the new mode NAND_ECC_HW_OOB_FIRST in the nand code to
support 4-bit ECC on TI DaVinci devices with large page (up to 2KiB) NAND
chips.  This ECC mode is similar to NAND_ECC_HW, with the exception of
read_page API that first reads the OOB area, reads the data in chunks,
feeds the ECC from OOB area to the ECC hw engine and perform any
correction on the data as per the ECC status reported by the engine.

"ECC_HW_OOB_FIRST" name suggested by Thomas Gleixner

Reviewed-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Sneha Narnakaje <nsnehaprabha@ti.com>
Signed-off-by: Sandeep Paulraj <s-paulraj@ti.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/nand_base.c | 61 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mtd/nand.h     |  1 +
 2 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 17bbd5062027..22113865438b 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -978,6 +978,54 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 	return 0;
 }
 
+/**
+ * nand_read_page_hwecc_oob_first - [REPLACABLE] hw ecc, read oob first
+ * @mtd:	mtd info structure
+ * @chip:	nand chip info structure
+ * @buf:	buffer to store read data
+ *
+ * Hardware ECC for large page chips, require OOB to be read first.
+ * For this ECC mode, the write_page method is re-used from ECC_HW.
+ * These methods read/write ECC from the OOB area, unlike the
+ * ECC_HW_SYNDROME support with multiple ECC steps, follows the
+ * "infix ECC" scheme and reads/writes ECC from the data area, by
+ * overwriting the NAND manufacturer bad block markings.
+ */
+static int nand_read_page_hwecc_oob_first(struct mtd_info *mtd,
+	struct nand_chip *chip, uint8_t *buf, int page)
+{
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	uint8_t *p = buf;
+	uint8_t *ecc_code = chip->buffers->ecccode;
+	uint32_t *eccpos = chip->ecc.layout->eccpos;
+	uint8_t *ecc_calc = chip->buffers->ecccalc;
+
+	/* Read the OOB area first */
+	chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page);
+	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
+	chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+
+	for (i = 0; i < chip->ecc.total; i++)
+		ecc_code[i] = chip->oob_poi[eccpos[i]];
+
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+		int stat;
+
+		chip->ecc.hwctl(mtd, NAND_ECC_READ);
+		chip->read_buf(mtd, p, eccsize);
+		chip->ecc.calculate(mtd, p, &ecc_calc[i]);
+
+		stat = chip->ecc.correct(mtd, p, &ecc_code[i], NULL);
+		if (stat < 0)
+			mtd->ecc_stats.failed++;
+		else
+			mtd->ecc_stats.corrected += stat;
+	}
+	return 0;
+}
+
 /**
  * nand_read_page_syndrome - [REPLACABLE] hardware ecc syndrom based page read
  * @mtd:	mtd info structure
@@ -2673,6 +2721,17 @@ int nand_scan_tail(struct mtd_info *mtd)
 	 */
 
 	switch (chip->ecc.mode) {
+	case NAND_ECC_HW_OOB_FIRST:
+		/* Similar to NAND_ECC_HW, but a separate read_page handle */
+		if (!chip->ecc.calculate || !chip->ecc.correct ||
+		     !chip->ecc.hwctl) {
+			printk(KERN_WARNING "No ECC functions supplied; "
+			       "Hardware ECC not possible\n");
+			BUG();
+		}
+		if (!chip->ecc.read_page)
+			chip->ecc.read_page = nand_read_page_hwecc_oob_first;
+
 	case NAND_ECC_HW:
 		/* Use standard hwecc read page function ? */
 		if (!chip->ecc.read_page)
@@ -2695,7 +2754,7 @@ int nand_scan_tail(struct mtd_info *mtd)
 		     chip->ecc.read_page == nand_read_page_hwecc ||
 		     !chip->ecc.write_page ||
 		     chip->ecc.write_page == nand_write_page_hwecc)) {
-			printk(KERN_WARNING "No ECC functions supplied, "
+			printk(KERN_WARNING "No ECC functions supplied; "
 			       "Hardware ECC not possible\n");
 			BUG();
 		}
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 686f3701f2f4..7a232a9bdd62 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -121,6 +121,7 @@ typedef enum {
 	NAND_ECC_SOFT,
 	NAND_ECC_HW,
 	NAND_ECC_HW_SYNDROME,
+	NAND_ECC_HW_OOB_FIRST,
 } nand_ecc_modes_t;
 
 /*
-- 
cgit v1.2.3-71-gd317


From e454cea20bdcff10ee698d11b8882662a0153a47 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 18 Sep 2009 23:01:12 +0200
Subject: Driver-Core: extend devnode callbacks to provide permissions

This allows subsytems to provide devtmpfs with non-default permissions
for the device node. Instead of the default mode of 0600, null, zero,
random, urandom, full, tty, ptmx now have a mode of 0666, which allows
non-privileged processes to access standard device nodes in case no
other userspace process applies the expected permissions.

This also fixes a wrong assignment in pktcdvd and a checkpatch.pl complain.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/kernel/cpuid.c             |  4 ++--
 arch/x86/kernel/microcode_core.c    |  2 +-
 arch/x86/kernel/msr.c               |  4 ++--
 block/bsg.c                         |  4 ++--
 block/genhd.c                       |  8 ++++----
 drivers/base/core.c                 | 19 ++++++++++++-------
 drivers/base/devtmpfs.c             | 24 ++++++++++++++++--------
 drivers/block/aoe/aoechr.c          |  4 ++--
 drivers/block/pktcdvd.c             |  6 +++---
 drivers/char/hw_random/core.c       |  2 +-
 drivers/char/mem.c                  | 29 +++++++++++++++++++----------
 drivers/char/misc.c                 | 10 ++++++----
 drivers/char/raw.c                  |  4 ++--
 drivers/char/tty_io.c               | 11 +++++++++++
 drivers/gpu/drm/drm_sysfs.c         |  4 ++--
 drivers/hid/usbhid/hiddev.c         |  4 ++--
 drivers/input/input.c               |  4 ++--
 drivers/md/dm-ioctl.c               |  2 +-
 drivers/media/dvb/dvb-core/dvbdev.c |  4 ++--
 drivers/net/tun.c                   |  2 +-
 drivers/usb/class/usblp.c           |  4 ++--
 drivers/usb/core/file.c             |  8 ++++----
 drivers/usb/core/usb.c              |  4 ++--
 drivers/usb/misc/iowarrior.c        |  4 ++--
 drivers/usb/misc/legousbtower.c     |  4 ++--
 include/linux/device.h              |  7 ++++---
 include/linux/genhd.h               |  2 +-
 include/linux/miscdevice.h          |  3 ++-
 include/linux/usb.h                 |  4 ++--
 sound/sound_core.c                  |  4 ++--
 30 files changed, 116 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index b07af8861244..6a52d4b36a30 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
 	.notifier_call = cpuid_class_cpu_callback,
 };
 
-static char *cpuid_nodename(struct device *dev)
+static char *cpuid_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
 }
@@ -203,7 +203,7 @@ static int __init cpuid_init(void)
 		err = PTR_ERR(cpuid_class);
 		goto out_chrdev;
 	}
-	cpuid_class->nodename = cpuid_nodename;
+	cpuid_class->devnode = cpuid_devnode;
 	for_each_online_cpu(i) {
 		err = cpuid_device_create(i);
 		if (err != 0)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9371448290ac..0db7969b0dde 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = {
 static struct miscdevice microcode_dev = {
 	.minor			= MICROCODE_MINOR,
 	.name			= "microcode",
-	.devnode		= "cpu/microcode",
+	.nodename		= "cpu/microcode",
 	.fops			= &microcode_fops,
 };
 
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7dd950094178..6a3cefc7dda1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -241,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
 	.notifier_call = msr_class_cpu_callback,
 };
 
-static char *msr_nodename(struct device *dev)
+static char *msr_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
 }
@@ -262,7 +262,7 @@ static int __init msr_init(void)
 		err = PTR_ERR(msr_class);
 		goto out_chrdev;
 	}
-	msr_class->nodename = msr_nodename;
+	msr_class->devnode = msr_devnode;
 	for_each_online_cpu(i) {
 		err = msr_device_create(i);
 		if (err != 0)
diff --git a/block/bsg.c b/block/bsg.c
index 5f184bb3ff9e..0676301f16d0 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -1062,7 +1062,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue);
 
 static struct cdev bsg_cdev;
 
-static char *bsg_nodename(struct device *dev)
+static char *bsg_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
 }
@@ -1087,7 +1087,7 @@ static int __init bsg_init(void)
 		ret = PTR_ERR(bsg_class);
 		goto destroy_kmemcache;
 	}
-	bsg_class->nodename = bsg_nodename;
+	bsg_class->devnode = bsg_devnode;
 
 	ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
 	if (ret)
diff --git a/block/genhd.c b/block/genhd.c
index 2ad91ddad8e2..517e4332cb37 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -998,12 +998,12 @@ struct class block_class = {
 	.name		= "block",
 };
 
-static char *block_nodename(struct device *dev)
+static char *block_devnode(struct device *dev, mode_t *mode)
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
-	if (disk->nodename)
-		return disk->nodename(disk);
+	if (disk->devnode)
+		return disk->devnode(disk, mode);
 	return NULL;
 }
 
@@ -1011,7 +1011,7 @@ static struct device_type disk_type = {
 	.name		= "disk",
 	.groups		= disk_attr_groups,
 	.release	= disk_release,
-	.nodename	= block_nodename,
+	.devnode	= block_devnode,
 };
 
 #ifdef CONFIG_PROC_FS
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 390e664ec1c7..6bee6af8d8e1 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -166,13 +166,16 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj,
 	if (MAJOR(dev->devt)) {
 		const char *tmp;
 		const char *name;
+		mode_t mode = 0;
 
 		add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
 		add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));
-		name = device_get_nodename(dev, &tmp);
+		name = device_get_devnode(dev, &mode, &tmp);
 		if (name) {
 			add_uevent_var(env, "DEVNAME=%s", name);
 			kfree(tmp);
+			if (mode)
+				add_uevent_var(env, "DEVMODE=%#o", mode & 0777);
 		}
 	}
 
@@ -1148,8 +1151,9 @@ static struct device *next_device(struct klist_iter *i)
 }
 
 /**
- * device_get_nodename - path of device node file
+ * device_get_devnode - path of device node file
  * @dev: device
+ * @mode: returned file access mode
  * @tmp: possibly allocated string
  *
  * Return the relative path of a possible device node.
@@ -1157,21 +1161,22 @@ static struct device *next_device(struct klist_iter *i)
  * a name. This memory is returned in tmp and needs to be
  * freed by the caller.
  */
-const char *device_get_nodename(struct device *dev, const char **tmp)
+const char *device_get_devnode(struct device *dev,
+			       mode_t *mode, const char **tmp)
 {
 	char *s;
 
 	*tmp = NULL;
 
 	/* the device type may provide a specific name */
-	if (dev->type && dev->type->nodename)
-		*tmp = dev->type->nodename(dev);
+	if (dev->type && dev->type->devnode)
+		*tmp = dev->type->devnode(dev, mode);
 	if (*tmp)
 		return *tmp;
 
 	/* the class may provide a specific name */
-	if (dev->class && dev->class->nodename)
-		*tmp = dev->class->nodename(dev);
+	if (dev->class && dev->class->devnode)
+		*tmp = dev->class->devnode(dev, mode);
 	if (*tmp)
 		return *tmp;
 
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index fd488ad4263a..a1cb5afe6801 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -6,9 +6,10 @@
  * During bootup, before any driver core device is registered,
  * devtmpfs, a tmpfs-based filesystem is created. Every driver-core
  * device which requests a device node, will add a node in this
- * filesystem. The node is named after the the name of the device,
- * or the susbsytem can provide a custom name. All devices are
- * owned by root and have a mode of 0600.
+ * filesystem.
+ * By default, all devices are named after the the name of the
+ * device, owned by root and have a default mode of 0600. Subsystems
+ * can overwrite the default setting if needed.
  */
 
 #include <linux/kernel.h>
@@ -20,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/shmem_fs.h>
 #include <linux/cred.h>
+#include <linux/sched.h>
 #include <linux/init_task.h>
 
 static struct vfsmount *dev_mnt;
@@ -134,7 +136,7 @@ int devtmpfs_create_node(struct device *dev)
 	const char *tmp = NULL;
 	const char *nodename;
 	const struct cred *curr_cred;
-	mode_t mode;
+	mode_t mode = 0;
 	struct nameidata nd;
 	struct dentry *dentry;
 	int err;
@@ -142,14 +144,16 @@ int devtmpfs_create_node(struct device *dev)
 	if (!dev_mnt)
 		return 0;
 
-	nodename = device_get_nodename(dev, &tmp);
+	nodename = device_get_devnode(dev, &mode, &tmp);
 	if (!nodename)
 		return -ENOMEM;
 
+	if (mode == 0)
+		mode = 0600;
 	if (is_blockdev(dev))
-		mode = S_IFBLK|0600;
+		mode |= S_IFBLK;
 	else
-		mode = S_IFCHR|0600;
+		mode |= S_IFCHR;
 
 	curr_cred = override_creds(&init_cred);
 	err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt,
@@ -165,8 +169,12 @@ int devtmpfs_create_node(struct device *dev)
 
 	dentry = lookup_create(&nd, 0);
 	if (!IS_ERR(dentry)) {
+		int umask;
+
+		umask = sys_umask(0000);
 		err = vfs_mknod(nd.path.dentry->d_inode,
 				dentry, mode, dev->devt);
+		sys_umask(umask);
 		/* mark as kernel created inode */
 		if (!err)
 			dentry->d_inode->i_private = &dev_mnt;
@@ -271,7 +279,7 @@ int devtmpfs_delete_node(struct device *dev)
 	if (!dev_mnt)
 		return 0;
 
-	nodename = device_get_nodename(dev, &tmp);
+	nodename = device_get_devnode(dev, NULL, &tmp);
 	if (!nodename)
 		return -ENOMEM;
 
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 19888354188f..62141ec09a22 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -266,7 +266,7 @@ static const struct file_operations aoe_fops = {
 	.owner = THIS_MODULE,
 };
 
-static char *aoe_nodename(struct device *dev)
+static char *aoe_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
 }
@@ -288,7 +288,7 @@ aoechr_init(void)
 		unregister_chrdev(AOE_MAJOR, "aoechr");
 		return PTR_ERR(aoe_class);
 	}
-	aoe_class->nodename = aoe_nodename;
+	aoe_class->devnode = aoe_devnode;
 
 	for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
 		device_create(aoe_class, NULL,
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 95f11cdef203..fd5bb8ad59a9 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2857,7 +2857,7 @@ static struct block_device_operations pktcdvd_ops = {
 	.media_changed =	pkt_media_changed,
 };
 
-static char *pktcdvd_nodename(struct gendisk *gd)
+static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name);
 }
@@ -2914,7 +2914,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	disk->fops = &pktcdvd_ops;
 	disk->flags = GENHD_FL_REMOVABLE;
 	strcpy(disk->disk_name, pd->name);
-	disk->nodename = pktcdvd_nodename;
+	disk->devnode = pktcdvd_devnode;
 	disk->private_data = pd;
 	disk->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!disk->queue)
@@ -3070,7 +3070,7 @@ static const struct file_operations pkt_ctl_fops = {
 static struct miscdevice pkt_misc = {
 	.minor 		= MISC_DYNAMIC_MINOR,
 	.name  		= DRIVER_NAME,
-	.name  		= "pktcdvd/control",
+	.nodename	= "pktcdvd/control",
 	.fops  		= &pkt_ctl_fops
 };
 
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index fc93e2fc7c71..1573aebd54b5 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -153,7 +153,7 @@ static const struct file_operations rng_chrdev_ops = {
 static struct miscdevice rng_miscdev = {
 	.minor		= RNG_MISCDEV_MINOR,
 	.name		= RNG_MODULE_NAME,
-	.devnode	= "hwrng",
+	.nodename	= "hwrng",
 	.fops		= &rng_chrdev_ops,
 };
 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 0491cdf63f2a..0aede1d6a9ea 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -866,24 +866,25 @@ static const struct file_operations kmsg_fops = {
 
 static const struct memdev {
 	const char *name;
+	mode_t mode;
 	const struct file_operations *fops;
 	struct backing_dev_info *dev_info;
 } devlist[] = {
-	[ 1] = { "mem", &mem_fops, &directly_mappable_cdev_bdi },
+	 [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi },
 #ifdef CONFIG_DEVKMEM
-	[ 2] = { "kmem", &kmem_fops, &directly_mappable_cdev_bdi },
+	 [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi },
 #endif
-	[ 3] = {"null", &null_fops, NULL },
+	 [3] = { "null", 0666, &null_fops, NULL },
 #ifdef CONFIG_DEVPORT
-	[ 4] = { "port", &port_fops, NULL },
+	 [4] = { "port", 0, &port_fops, NULL },
 #endif
-	[ 5] = { "zero", &zero_fops, &zero_bdi },
-	[ 7] = { "full", &full_fops, NULL },
-	[ 8] = { "random", &random_fops, NULL },
-	[ 9] = { "urandom", &urandom_fops, NULL },
-	[11] = { "kmsg", &kmsg_fops, NULL },
+	 [5] = { "zero", 0666, &zero_fops, &zero_bdi },
+	 [7] = { "full", 0666, &full_fops, NULL },
+	 [8] = { "random", 0666, &random_fops, NULL },
+	 [9] = { "urandom", 0666, &urandom_fops, NULL },
+	[11] = { "kmsg", 0, &kmsg_fops, NULL },
 #ifdef CONFIG_CRASH_DUMP
-	[12] = { "oldmem", &oldmem_fops, NULL },
+	[12] = { "oldmem", 0, &oldmem_fops, NULL },
 #endif
 };
 
@@ -920,6 +921,13 @@ static const struct file_operations memory_fops = {
 	.open		= memory_open,
 };
 
+static char *mem_devnode(struct device *dev, mode_t *mode)
+{
+	if (mode && devlist[MINOR(dev->devt)].mode)
+		*mode = devlist[MINOR(dev->devt)].mode;
+	return NULL;
+}
+
 static struct class *mem_class;
 
 static int __init chr_dev_init(void)
@@ -935,6 +943,7 @@ static int __init chr_dev_init(void)
 		printk("unable to get major %d for memory devs\n", MEM_MAJOR);
 
 	mem_class = class_create(THIS_MODULE, "mem");
+	mem_class->devnode = mem_devnode;
 	for (minor = 1; minor < ARRAY_SIZE(devlist); minor++) {
 		if (!devlist[minor].name)
 			continue;
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 62c99fa59e2b..1ee27cc23426 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -263,12 +263,14 @@ int misc_deregister(struct miscdevice *misc)
 EXPORT_SYMBOL(misc_register);
 EXPORT_SYMBOL(misc_deregister);
 
-static char *misc_nodename(struct device *dev)
+static char *misc_devnode(struct device *dev, mode_t *mode)
 {
 	struct miscdevice *c = dev_get_drvdata(dev);
 
-	if (c->devnode)
-		return kstrdup(c->devnode, GFP_KERNEL);
+	if (mode && c->mode)
+		*mode = c->mode;
+	if (c->nodename)
+		return kstrdup(c->nodename, GFP_KERNEL);
 	return NULL;
 }
 
@@ -287,7 +289,7 @@ static int __init misc_init(void)
 	err = -EIO;
 	if (register_chrdev(MISC_MAJOR,"misc",&misc_fops))
 		goto fail_printk;
-	misc_class->nodename = misc_nodename;
+	misc_class->devnode = misc_devnode;
 	return 0;
 
 fail_printk:
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 40268db02e22..64acd05f71c8 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -261,7 +261,7 @@ static const struct file_operations raw_ctl_fops = {
 
 static struct cdev raw_cdev;
 
-static char *raw_nodename(struct device *dev)
+static char *raw_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "raw/%s", dev_name(dev));
 }
@@ -289,7 +289,7 @@ static int __init raw_init(void)
 		ret = PTR_ERR(raw_class);
 		goto error_region;
 	}
-	raw_class->nodename = raw_nodename;
+	raw_class->devnode = raw_devnode;
 	device_create(raw_class, NULL, MKDEV(RAW_MAJOR, 0), NULL, "rawctl");
 
 	return 0;
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index a3afa0c387cd..c70d9dabefae 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -3056,11 +3056,22 @@ void __init console_init(void)
 	}
 }
 
+static char *tty_devnode(struct device *dev, mode_t *mode)
+{
+	if (!mode)
+		return NULL;
+	if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) ||
+	    dev->devt == MKDEV(TTYAUX_MAJOR, 2))
+		*mode = 0666;
+	return NULL;
+}
+
 static int __init tty_class_init(void)
 {
 	tty_class = class_create(THIS_MODULE, "tty");
 	if (IS_ERR(tty_class))
 		return PTR_ERR(tty_class);
+	tty_class->devnode = tty_devnode;
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index f7a615b80c70..5301f226cb1c 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -76,7 +76,7 @@ static ssize_t version_show(struct class *dev, char *buf)
 		       CORE_MINOR, CORE_PATCHLEVEL, CORE_DATE);
 }
 
-static char *drm_nodename(struct device *dev)
+static char *drm_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "dri/%s", dev_name(dev));
 }
@@ -112,7 +112,7 @@ struct class *drm_sysfs_create(struct module *owner, char *name)
 	if (err)
 		goto err_out_class;
 
-	class->nodename = drm_nodename;
+	class->devnode = drm_devnode;
 
 	return class;
 
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index 4d1dc0cf1401..8b6ee247bfe4 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -852,14 +852,14 @@ static const struct file_operations hiddev_fops = {
 #endif
 };
 
-static char *hiddev_nodename(struct device *dev)
+static char *hiddev_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
 
 static struct usb_class_driver hiddev_class = {
 	.name =		"hiddev%d",
-	.nodename =	hiddev_nodename,
+	.devnode =	hiddev_devnode,
 	.fops =		&hiddev_fops,
 	.minor_base =	HIDDEV_MINOR_BASE,
 };
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 851791d955f3..556539d617a4 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1265,14 +1265,14 @@ static struct device_type input_dev_type = {
 	.uevent		= input_dev_uevent,
 };
 
-static char *input_nodename(struct device *dev)
+static char *input_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "input/%s", dev_name(dev));
 }
 
 struct class input_class = {
 	.name		= "input",
-	.nodename	= input_nodename,
+	.devnode	= input_devnode,
 };
 EXPORT_SYMBOL_GPL(input_class);
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 7f77f18fcafa..a67942931582 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1532,7 +1532,7 @@ static const struct file_operations _ctl_fops = {
 static struct miscdevice _dm_misc = {
 	.minor 		= MISC_DYNAMIC_MINOR,
 	.name  		= DM_NAME,
-	.devnode	= "mapper/control",
+	.nodename	= "mapper/control",
 	.fops  		= &_ctl_fops
 };
 
diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c
index 479dd05762a5..94159b90f733 100644
--- a/drivers/media/dvb/dvb-core/dvbdev.c
+++ b/drivers/media/dvb/dvb-core/dvbdev.c
@@ -447,7 +447,7 @@ static int dvb_uevent(struct device *dev, struct kobj_uevent_env *env)
 	return 0;
 }
 
-static char *dvb_nodename(struct device *dev)
+static char *dvb_devnode(struct device *dev, mode_t *mode)
 {
 	struct dvb_device *dvbdev = dev_get_drvdata(dev);
 
@@ -478,7 +478,7 @@ static int __init init_dvbdev(void)
 		goto error;
 	}
 	dvb_class->dev_uevent = dvb_uevent;
-	dvb_class->nodename = dvb_nodename;
+	dvb_class->devnode = dvb_devnode;
 	return 0;
 
 error:
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3f5d28851aa2..d3ee1994b02f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1370,7 +1370,7 @@ static const struct file_operations tun_fops = {
 static struct miscdevice tun_miscdev = {
 	.minor = TUN_MINOR,
 	.name = "tun",
-	.devnode = "net/tun",
+	.nodename = "net/tun",
 	.fops = &tun_fops,
 };
 
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index 26c09f0257db..9bc112ee7803 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -1057,14 +1057,14 @@ static const struct file_operations usblp_fops = {
 	.release =	usblp_release,
 };
 
-static char *usblp_nodename(struct device *dev)
+static char *usblp_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
 
 static struct usb_class_driver usblp_class = {
 	.name =		"lp%d",
-	.nodename =	usblp_nodename,
+	.devnode =	usblp_devnode,
 	.fops =		&usblp_fops,
 	.minor_base =	USBLP_MINOR_BASE,
 };
diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index 5cef88929b3e..222ee07ea680 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c
@@ -67,14 +67,14 @@ static struct usb_class {
 	struct class *class;
 } *usb_class;
 
-static char *usb_nodename(struct device *dev)
+static char *usb_devnode(struct device *dev, mode_t *mode)
 {
 	struct usb_class_driver *drv;
 
 	drv = dev_get_drvdata(dev);
-	if (!drv || !drv->nodename)
+	if (!drv || !drv->devnode)
 		return NULL;
-	return drv->nodename(dev);
+	return drv->devnode(dev, mode);
 }
 
 static int init_usb_class(void)
@@ -100,7 +100,7 @@ static int init_usb_class(void)
 		kfree(usb_class);
 		usb_class = NULL;
 	}
-	usb_class->class->nodename = usb_nodename;
+	usb_class->class->devnode = usb_devnode;
 
 exit:
 	return result;
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index a26f73880c32..43ee943d757a 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -311,7 +311,7 @@ static struct dev_pm_ops usb_device_pm_ops = {
 #endif	/* CONFIG_PM */
 
 
-static char *usb_nodename(struct device *dev)
+static char *usb_devnode(struct device *dev, mode_t *mode)
 {
 	struct usb_device *usb_dev;
 
@@ -324,7 +324,7 @@ struct device_type usb_device_type = {
 	.name =		"usb_device",
 	.release =	usb_release_dev,
 	.uevent =	usb_dev_uevent,
-	.nodename = 	usb_nodename,
+	.devnode = 	usb_devnode,
 	.pm =		&usb_device_pm_ops,
 };
 
diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c
index 90e1a8dedfa9..e75bb87ee92b 100644
--- a/drivers/usb/misc/iowarrior.c
+++ b/drivers/usb/misc/iowarrior.c
@@ -727,7 +727,7 @@ static const struct file_operations iowarrior_fops = {
 	.poll = iowarrior_poll,
 };
 
-static char *iowarrior_nodename(struct device *dev)
+static char *iowarrior_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
@@ -738,7 +738,7 @@ static char *iowarrior_nodename(struct device *dev)
  */
 static struct usb_class_driver iowarrior_class = {
 	.name = "iowarrior%d",
-	.nodename = iowarrior_nodename,
+	.devnode = iowarrior_devnode,
 	.fops = &iowarrior_fops,
 	.minor_base = IOWARRIOR_MINOR_BASE,
 };
diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c
index c1e2433f640d..97efeaec4d52 100644
--- a/drivers/usb/misc/legousbtower.c
+++ b/drivers/usb/misc/legousbtower.c
@@ -266,7 +266,7 @@ static const struct file_operations tower_fops = {
 	.llseek =	tower_llseek,
 };
 
-static char *legousbtower_nodename(struct device *dev)
+static char *legousbtower_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
@@ -277,7 +277,7 @@ static char *legousbtower_nodename(struct device *dev)
  */
 static struct usb_class_driver tower_class = {
 	.name =		"legousbtower%d",
-	.nodename = 	legousbtower_nodename,
+	.devnode = 	legousbtower_devnode,
 	.fops =		&tower_fops,
 	.minor_base =	LEGO_USB_TOWER_MINOR_BASE,
 };
diff --git a/include/linux/device.h b/include/linux/device.h
index 847b763e40e9..aca31bf7d8ed 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -193,7 +193,7 @@ struct class {
 	struct kobject			*dev_kobj;
 
 	int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env);
-	char *(*nodename)(struct device *dev);
+	char *(*devnode)(struct device *dev, mode_t *mode);
 
 	void (*class_release)(struct class *class);
 	void (*dev_release)(struct device *dev);
@@ -298,7 +298,7 @@ struct device_type {
 	const char *name;
 	const struct attribute_group **groups;
 	int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
-	char *(*nodename)(struct device *dev);
+	char *(*devnode)(struct device *dev, mode_t *mode);
 	void (*release)(struct device *dev);
 
 	const struct dev_pm_ops *pm;
@@ -487,7 +487,8 @@ extern struct device *device_find_child(struct device *dev, void *data,
 extern int device_rename(struct device *dev, char *new_name);
 extern int device_move(struct device *dev, struct device *new_parent,
 		       enum dpm_order dpm_order);
-extern const char *device_get_nodename(struct device *dev, const char **tmp);
+extern const char *device_get_devnode(struct device *dev,
+				      mode_t *mode, const char **tmp);
 extern void *dev_get_drvdata(const struct device *dev);
 extern void dev_set_drvdata(struct device *dev, void *data);
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 44263cb27121..109d179adb93 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -142,7 +142,7 @@ struct gendisk {
                                          * disks that can't be partitioned. */
 
 	char disk_name[DISK_NAME_LEN];	/* name of major driver */
-	char *(*nodename)(struct gendisk *gd);
+	char *(*devnode)(struct gendisk *gd, mode_t *mode);
 	/* Array of pointers to partitions indexed by partno.
 	 * Protected with matching bdev lock but stat and other
 	 * non-critical accesses use RCU.  Always access through
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 052117744629..adaf3c15e449 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -41,7 +41,8 @@ struct miscdevice  {
 	struct list_head list;
 	struct device *parent;
 	struct device *this_device;
-	const char *devnode;
+	const char *nodename;
+	mode_t mode;
 };
 
 extern int misc_register(struct miscdevice * misc);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index b1e3c2fbfe11..a8fe05f224e5 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -922,7 +922,7 @@ extern struct bus_type usb_bus_type;
 /**
  * struct usb_class_driver - identifies a USB driver that wants to use the USB major number
  * @name: the usb class device name for this driver.  Will show up in sysfs.
- * @nodename: Callback to provide a naming hint for a possible
+ * @devnode: Callback to provide a naming hint for a possible
  *	device node to create.
  * @fops: pointer to the struct file_operations of this driver.
  * @minor_base: the start of the minor range for this driver.
@@ -933,7 +933,7 @@ extern struct bus_type usb_bus_type;
  */
 struct usb_class_driver {
 	char *name;
-	char *(*nodename)(struct device *dev);
+	char *(*devnode)(struct device *dev, mode_t *mode);
 	const struct file_operations *fops;
 	int minor_base;
 };
diff --git a/sound/sound_core.c b/sound/sound_core.c
index bb4b88e606bb..49c998186592 100644
--- a/sound/sound_core.c
+++ b/sound/sound_core.c
@@ -29,7 +29,7 @@ MODULE_DESCRIPTION("Core sound module");
 MODULE_AUTHOR("Alan Cox");
 MODULE_LICENSE("GPL");
 
-static char *sound_nodename(struct device *dev)
+static char *sound_devnode(struct device *dev, mode_t *mode)
 {
 	if (MAJOR(dev->devt) == SOUND_MAJOR)
 		return NULL;
@@ -50,7 +50,7 @@ static int __init init_soundcore(void)
 		return PTR_ERR(sound_class);
 	}
 
-	sound_class->nodename = sound_nodename;
+	sound_class->devnode = sound_devnode;
 
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From f0eefdc30e55e761facf645bd1be1339b21c30e6 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Sat, 19 Sep 2009 13:13:13 -0700
Subject: cyclades: avoid addresses recomputation

Don't fetch firmware address and recompute channel control on each
port access. Precompute the values on init and use them later all
the time.

The same for board control.

This simplify code and improves readability.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/cyclades.c  | 241 ++++++++++++++---------------------------------
 include/linux/cyclades.h |  10 ++
 2 files changed, 81 insertions(+), 170 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 9e0cd7020aef..7a7092ae5d39 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -814,7 +814,7 @@ static char rflow_thr[] = {	/* rflow threshold */
 /*  The Cyclom-Ye has placed the sequential chips in non-sequential
  *  address order.  This look-up table overcomes that problem.
  */
-static int cy_chip_offset[] = { 0x0000,
+static const unsigned int cy_chip_offset[] = { 0x0000,
 	0x0400,
 	0x0800,
 	0x0C00,
@@ -1406,15 +1406,9 @@ static int
 cyz_fetch_msg(struct cyclades_card *cinfo,
 		__u32 *channel, __u8 *cmd, __u32 *param)
 {
-	struct FIRM_ID __iomem *firm_id;
-	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct BOARD_CTRL __iomem *board_ctrl;
+	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
 	unsigned long loc_doorbell;
 
-	firm_id = cinfo->base_addr + ID_ADDRESS;
-	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-	board_ctrl = &zfw_ctrl->board_ctrl;
-
 	loc_doorbell = readl(&cinfo->ctl_addr.p9060->loc_doorbell);
 	if (loc_doorbell) {
 		*cmd = (char)(0xff & loc_doorbell);
@@ -1430,19 +1424,13 @@ static int
 cyz_issue_cmd(struct cyclades_card *cinfo,
 		__u32 channel, __u8 cmd, __u32 param)
 {
-	struct FIRM_ID __iomem *firm_id;
-	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct BOARD_CTRL __iomem *board_ctrl;
+	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
 	__u32 __iomem *pci_doorbell;
 	unsigned int index;
 
-	firm_id = cinfo->base_addr + ID_ADDRESS;
 	if (!cyz_is_loaded(cinfo))
 		return -1;
 
-	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-	board_ctrl = &zfw_ctrl->board_ctrl;
-
 	index = 0;
 	pci_doorbell = &cinfo->ctl_addr.p9060->pci_doorbell;
 	while ((readl(pci_doorbell) & 0xff) != 0) {
@@ -1457,9 +1445,9 @@ cyz_issue_cmd(struct cyclades_card *cinfo,
 	return 0;
 }				/* cyz_issue_cmd */
 
-static void cyz_handle_rx(struct cyclades_port *info, struct tty_struct *tty,
-		struct BUF_CTRL __iomem *buf_ctrl)
+static void cyz_handle_rx(struct cyclades_port *info, struct tty_struct *tty)
 {
+	struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
 	struct cyclades_card *cinfo = info->card;
 	unsigned int char_count;
 	int len;
@@ -1549,9 +1537,9 @@ static void cyz_handle_rx(struct cyclades_port *info, struct tty_struct *tty,
 	}
 }
 
-static void cyz_handle_tx(struct cyclades_port *info, struct tty_struct *tty,
-		struct BUF_CTRL __iomem *buf_ctrl)
+static void cyz_handle_tx(struct cyclades_port *info, struct tty_struct *tty)
 {
+	struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
 	struct cyclades_card *cinfo = info->card;
 	u8 data;
 	unsigned int char_count;
@@ -1627,21 +1615,14 @@ ztxdone:
 
 static void cyz_handle_cmd(struct cyclades_card *cinfo)
 {
+	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
 	struct tty_struct *tty;
 	struct cyclades_port *info;
-	static struct FIRM_ID __iomem *firm_id;
-	static struct ZFW_CTRL __iomem *zfw_ctrl;
-	static struct BOARD_CTRL __iomem *board_ctrl;
-	static struct CH_CTRL __iomem *ch_ctrl;
-	static struct BUF_CTRL __iomem *buf_ctrl;
 	__u32 channel, param, fw_ver;
 	__u8 cmd;
 	int special_count;
 	int delta_count;
 
-	firm_id = cinfo->base_addr + ID_ADDRESS;
-	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-	board_ctrl = &zfw_ctrl->board_ctrl;
 	fw_ver = readl(&board_ctrl->fw_version);
 
 	while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
@@ -1652,9 +1633,6 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 		if (tty == NULL)
 			continue;
 
-		ch_ctrl = &(zfw_ctrl->ch_ctrl[channel]);
-		buf_ctrl = &(zfw_ctrl->buf_ctrl[channel]);
-
 		switch (cmd) {
 		case C_CM_PR_ERROR:
 			tty_insert_flip_char(tty, 0, TTY_PARITY);
@@ -1675,9 +1653,9 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 			info->icount.dcd++;
 			delta_count++;
 			if (info->port.flags & ASYNC_CHECK_CD) {
-				if ((fw_ver > 241 ? ((u_long) param) :
-						readl(&ch_ctrl->rs_status)) &
-						C_RS_DCD) {
+				u32 dcd = fw_ver > 241 ? param :
+					readl(&info->u.cyz.ch_ctrl->rs_status);
+				if (dcd & C_RS_DCD) {
 					wake_up_interruptible(&info->port.open_wait);
 				} else {
 					tty_hangup(tty);
@@ -1712,7 +1690,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 			printk(KERN_DEBUG "cyz_interrupt: rcvd intr, card %d, "
 					"port %ld\n", info->card, channel);
 #endif
-			cyz_handle_rx(info, tty, buf_ctrl);
+			cyz_handle_rx(info, tty);
 			break;
 		case C_CM_TXBEMPTY:
 		case C_CM_TXLOWWM:
@@ -1722,7 +1700,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 			printk(KERN_DEBUG "cyz_interrupt: xmit intr, card %d, "
 					"port %ld\n", info->card, channel);
 #endif
-			cyz_handle_tx(info, tty, buf_ctrl);
+			cyz_handle_tx(info, tty);
 			break;
 #endif				/* CONFIG_CYZ_INTR */
 		case C_CM_FATAL:
@@ -1781,9 +1759,6 @@ static void cyz_poll(unsigned long arg)
 {
 	struct cyclades_card *cinfo;
 	struct cyclades_port *info;
-	struct FIRM_ID __iomem *firm_id;
-	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct BUF_CTRL __iomem *buf_ctrl;
 	unsigned long expires = jiffies + HZ;
 	unsigned int port, card;
 
@@ -1795,10 +1770,6 @@ static void cyz_poll(unsigned long arg)
 		if (!cyz_is_loaded(cinfo))
 			continue;
 
-		firm_id = cinfo->base_addr + ID_ADDRESS;
-		zfw_ctrl = cinfo->base_addr +
-				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-
 	/* Skip first polling cycle to avoid racing conditions with the FW */
 		if (!cinfo->intr_enabled) {
 			cinfo->intr_enabled = 1;
@@ -1811,15 +1782,13 @@ static void cyz_poll(unsigned long arg)
 			struct tty_struct *tty;
 
 			info = &cinfo->ports[port];
-			buf_ctrl = &(zfw_ctrl->buf_ctrl[port]);
-
 			tty = tty_port_tty_get(&info->port);
 			/* OK to pass NULL to the handle functions below.
 			   They need to drop the data in that case. */
 
 			if (!info->throttle)
-				cyz_handle_rx(info, tty, buf_ctrl);
-			cyz_handle_tx(info, tty, buf_ctrl);
+				cyz_handle_rx(info, tty);
+			cyz_handle_tx(info, tty);
 			tty_kref_put(tty);
 		}
 		/* poll every 'cyz_polling_cycle' period */
@@ -1922,45 +1891,34 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
 		spin_unlock_irqrestore(&card->card_lock, flags);
 
 	} else {
-		struct FIRM_ID __iomem *firm_id;
-		struct ZFW_CTRL __iomem *zfw_ctrl;
-		struct BOARD_CTRL __iomem *board_ctrl;
-		struct CH_CTRL __iomem *ch_ctrl;
-
-		base_addr = card->base_addr;
+		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
 
-		firm_id = base_addr + ID_ADDRESS;
 		if (!cyz_is_loaded(card))
 			return -ENODEV;
 
-		zfw_ctrl = card->base_addr +
-				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		board_ctrl = &zfw_ctrl->board_ctrl;
-		ch_ctrl = zfw_ctrl->ch_ctrl;
-
 #ifdef CY_DEBUG_OPEN
 		printk(KERN_DEBUG "cyc startup Z card %d, channel %d, "
-			"base_addr %p\n", card, channel, base_addr);
+			"base_addr %p\n", card, channel, card->base_addr);
 #endif
 		spin_lock_irqsave(&card->card_lock, flags);
 
-		cy_writel(&ch_ctrl[channel].op_mode, C_CH_ENABLE);
+		cy_writel(&ch_ctrl->op_mode, C_CH_ENABLE);
 #ifdef Z_WAKE
 #ifdef CONFIG_CYZ_INTR
-		cy_writel(&ch_ctrl[channel].intr_enable,
+		cy_writel(&ch_ctrl->intr_enable,
 			  C_IN_TXBEMPTY | C_IN_TXLOWWM | C_IN_RXHIWM |
 			  C_IN_RXNNDT | C_IN_IOCTLW | C_IN_MDCD);
 #else
-		cy_writel(&ch_ctrl[channel].intr_enable,
+		cy_writel(&ch_ctrl->intr_enable,
 			  C_IN_IOCTLW | C_IN_MDCD);
 #endif				/* CONFIG_CYZ_INTR */
 #else
 #ifdef CONFIG_CYZ_INTR
-		cy_writel(&ch_ctrl[channel].intr_enable,
+		cy_writel(&ch_ctrl->intr_enable,
 			  C_IN_TXBEMPTY | C_IN_TXLOWWM | C_IN_RXHIWM |
 			  C_IN_RXNNDT | C_IN_MDCD);
 #else
-		cy_writel(&ch_ctrl[channel].intr_enable, C_IN_MDCD);
+		cy_writel(&ch_ctrl->intr_enable, C_IN_MDCD);
 #endif				/* CONFIG_CYZ_INTR */
 #endif				/* Z_WAKE */
 
@@ -1979,9 +1937,8 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
 
 		/* set timeout !!! */
 		/* set RTS and DTR !!! */
-		cy_writel(&ch_ctrl[channel].rs_control,
-			readl(&ch_ctrl[channel].rs_control) | C_RS_RTS |
-			C_RS_DTR);
+		cy_writel(&ch_ctrl->rs_control, readl(&ch_ctrl->rs_control) |
+				C_RS_RTS | C_RS_DTR);
 		retval = cyz_issue_cmd(card, channel, C_CM_IOCTLM, 0L);
 		if (retval != 0) {
 			printk(KERN_ERR "cyc:startup(3) retval on ttyC%d was "
@@ -2110,27 +2067,17 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 		info->port.flags &= ~ASYNC_INITIALIZED;
 		spin_unlock_irqrestore(&card->card_lock, flags);
 	} else {
-		struct FIRM_ID __iomem *firm_id;
-		struct ZFW_CTRL __iomem *zfw_ctrl;
-		struct BOARD_CTRL __iomem *board_ctrl;
-		struct CH_CTRL __iomem *ch_ctrl;
+		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
 		int retval;
 
-		base_addr = card->base_addr;
 #ifdef CY_DEBUG_OPEN
 		printk(KERN_DEBUG "cyc shutdown Z card %d, channel %d, "
-			"base_addr %p\n", card, channel, base_addr);
+			"base_addr %p\n", card, channel, card->base_addr);
 #endif
 
-		firm_id = base_addr + ID_ADDRESS;
 		if (!cyz_is_loaded(card))
 			return;
 
-		zfw_ctrl = card->base_addr +
-				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		board_ctrl = &zfw_ctrl->board_ctrl;
-		ch_ctrl = zfw_ctrl->ch_ctrl;
-
 		spin_lock_irqsave(&card->card_lock, flags);
 
 		if (info->port.xmit_buf) {
@@ -2141,9 +2088,9 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 		}
 
 		if (tty->termios->c_cflag & HUPCL) {
-			cy_writel(&ch_ctrl[channel].rs_control,
-				(__u32)(readl(&ch_ctrl[channel].rs_control) &
-					~(C_RS_RTS | C_RS_DTR)));
+			cy_writel(&ch_ctrl->rs_control,
+				readl(&ch_ctrl->rs_control) &
+					~(C_RS_RTS | C_RS_DTR));
 			retval = cyz_issue_cmd(info->card, channel,
 					C_CM_IOCTLM, 0L);
 			if (retval != 0) {
@@ -2497,15 +2444,11 @@ static void cy_close(struct tty_struct *tty, struct file *filp)
 #ifdef Z_WAKE
 		/* Waiting for on-board buffers to be empty before closing
 		   the port */
-		void __iomem *base_addr = card->base_addr;
-		struct FIRM_ID __iomem *firm_id = base_addr + ID_ADDRESS;
-		struct ZFW_CTRL __iomem *zfw_ctrl =
-		    base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		struct CH_CTRL __iomem *ch_ctrl = zfw_ctrl->ch_ctrl;
+		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
 		int channel = info->line - card->first_line;
 		int retval;
 
-		if (readl(&ch_ctrl[channel].flow_status) != C_FS_TXIDLE) {
+		if (readl(&ch_ctrl->flow_status) != C_FS_TXIDLE) {
 			retval = cyz_issue_cmd(card, channel, C_CM_IOCTLW, 0L);
 			if (retval != 0) {
 				printk(KERN_DEBUG "cyc:cy_close retval on "
@@ -2685,18 +2628,13 @@ static int cy_write_room(struct tty_struct *tty)
 
 static int cy_chars_in_buffer(struct tty_struct *tty)
 {
-	struct cyclades_card *card;
 	struct cyclades_port *info = tty->driver_data;
-	int channel;
 
 	if (serial_paranoia_check(info, tty->name, "cy_chars_in_buffer"))
 		return 0;
 
-	card = info->card;
-	channel = (info->line) - (card->first_line);
-
 #ifdef Z_EXT_CHARS_IN_BUFFER
-	if (!cy_is_Z(card)) {
+	if (!cy_is_Z(info->card)) {
 #endif				/* Z_EXT_CHARS_IN_BUFFER */
 #ifdef CY_DEBUG_IO
 		printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
@@ -2705,20 +2643,11 @@ static int cy_chars_in_buffer(struct tty_struct *tty)
 		return info->xmit_cnt;
 #ifdef Z_EXT_CHARS_IN_BUFFER
 	} else {
-		static struct FIRM_ID *firm_id;
-		static struct ZFW_CTRL *zfw_ctrl;
-		static struct CH_CTRL *ch_ctrl;
-		static struct BUF_CTRL *buf_ctrl;
+		struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
 		int char_count;
 		__u32 tx_put, tx_get, tx_bufsize;
 
 		lock_kernel();
-		firm_id = card->base_addr + ID_ADDRESS;
-		zfw_ctrl = card->base_addr +
-			(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		ch_ctrl = &(zfw_ctrl->ch_ctrl[channel]);
-		buf_ctrl = &(zfw_ctrl->buf_ctrl[channel]);
-
 		tx_get = readl(&buf_ctrl->tx_get);
 		tx_put = readl(&buf_ctrl->tx_put);
 		tx_bufsize = readl(&buf_ctrl->tx_bufsize);
@@ -3019,20 +2948,13 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 		spin_unlock_irqrestore(&card->card_lock, flags);
 
 	} else {
-		struct FIRM_ID __iomem *firm_id;
-		struct ZFW_CTRL __iomem *zfw_ctrl;
-		struct CH_CTRL __iomem *ch_ctrl;
+		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
 		__u32 sw_flow;
 		int retval;
 
-		firm_id = card->base_addr + ID_ADDRESS;
 		if (!cyz_is_loaded(card))
 			return;
 
-		zfw_ctrl = card->base_addr +
-			(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		ch_ctrl = &(zfw_ctrl->ch_ctrl[channel]);
-
 		/* baud rate */
 		baud = tty_get_baud_rate(tty);
 		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
@@ -3268,10 +3190,6 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 	unsigned char status;
 	unsigned long lstatus;
 	unsigned int result;
-	struct FIRM_ID __iomem *firm_id;
-	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct BOARD_CTRL __iomem *board_ctrl;
-	struct CH_CTRL __iomem *ch_ctrl;
 
 	if (serial_paranoia_check(info, tty->name, __func__))
 		return -ENODEV;
@@ -3304,14 +3222,8 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 			((status & CyDSR) ? TIOCM_DSR : 0) |
 			((status & CyCTS) ? TIOCM_CTS : 0);
 	} else {
-		base_addr = card->base_addr;
-		firm_id = card->base_addr + ID_ADDRESS;
 		if (cyz_is_loaded(card)) {
-			zfw_ctrl = card->base_addr +
-				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-			board_ctrl = &zfw_ctrl->board_ctrl;
-			ch_ctrl = zfw_ctrl->ch_ctrl;
-			lstatus = readl(&ch_ctrl[channel].rs_status);
+			lstatus = readl(&info->u.cyz.ch_ctrl->rs_status);
 			result = ((lstatus & C_RS_RTS) ? TIOCM_RTS : 0) |
 				((lstatus & C_RS_DTR) ? TIOCM_DTR : 0) |
 				((lstatus & C_RS_DCD) ? TIOCM_CAR : 0) |
@@ -3336,12 +3248,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 	struct cyclades_port *info = tty->driver_data;
 	struct cyclades_card *card;
 	int chip, channel, index;
-	void __iomem *base_addr;
 	unsigned long flags;
-	struct FIRM_ID __iomem *firm_id;
-	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct BOARD_CTRL __iomem *board_ctrl;
-	struct CH_CTRL __iomem *ch_ctrl;
 	int retval;
 
 	if (serial_paranoia_check(info, tty->name, __func__))
@@ -3350,6 +3257,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 	card = info->card;
 	channel = (info->line) - (card->first_line);
 	if (!cy_is_Z(card)) {
+		void __iomem *base_addr;
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3421,34 +3329,26 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 			spin_unlock_irqrestore(&card->card_lock, flags);
 		}
 	} else {
-		base_addr = card->base_addr;
-
-		firm_id = card->base_addr + ID_ADDRESS;
 		if (cyz_is_loaded(card)) {
-			zfw_ctrl = card->base_addr +
-				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-			board_ctrl = &zfw_ctrl->board_ctrl;
-			ch_ctrl = zfw_ctrl->ch_ctrl;
+			struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
 
 			if (set & TIOCM_RTS) {
 				spin_lock_irqsave(&card->card_lock, flags);
-				cy_writel(&ch_ctrl[channel].rs_control,
-					readl(&ch_ctrl[channel].rs_control) |
-					C_RS_RTS);
+				cy_writel(&ch_ctrl->rs_control,
+					readl(&ch_ctrl->rs_control) | C_RS_RTS);
 				spin_unlock_irqrestore(&card->card_lock, flags);
 			}
 			if (clear & TIOCM_RTS) {
 				spin_lock_irqsave(&card->card_lock, flags);
-				cy_writel(&ch_ctrl[channel].rs_control,
-					readl(&ch_ctrl[channel].rs_control) &
+				cy_writel(&ch_ctrl->rs_control,
+					readl(&ch_ctrl->rs_control) &
 					~C_RS_RTS);
 				spin_unlock_irqrestore(&card->card_lock, flags);
 			}
 			if (set & TIOCM_DTR) {
 				spin_lock_irqsave(&card->card_lock, flags);
-				cy_writel(&ch_ctrl[channel].rs_control,
-					readl(&ch_ctrl[channel].rs_control) |
-					C_RS_DTR);
+				cy_writel(&ch_ctrl->rs_control,
+					readl(&ch_ctrl->rs_control) | C_RS_DTR);
 #ifdef CY_DEBUG_DTR
 				printk(KERN_DEBUG "cyc:set_modem_info raising "
 					"Z DTR\n");
@@ -3457,8 +3357,8 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 			}
 			if (clear & TIOCM_DTR) {
 				spin_lock_irqsave(&card->card_lock, flags);
-				cy_writel(&ch_ctrl[channel].rs_control,
-					readl(&ch_ctrl[channel].rs_control) &
+				cy_writel(&ch_ctrl->rs_control,
+					readl(&ch_ctrl->rs_control) &
 					~C_RS_DTR);
 #ifdef CY_DEBUG_DTR
 				printk(KERN_DEBUG "cyc:set_modem_info clearing "
@@ -4171,17 +4071,8 @@ static int cyz_carrier_raised(struct tty_port *port)
 {
 	struct cyclades_port *info = container_of(port, struct cyclades_port,
 			port);
-	struct cyclades_card *cinfo = info->card;
-	void __iomem *base = cinfo->base_addr;
-	struct FIRM_ID __iomem *firm_id = base + ID_ADDRESS;
-	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct CH_CTRL __iomem *ch_ctrl;
-	int channel = info->line - cinfo->first_line;
 
-	zfw_ctrl = base + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-	ch_ctrl = zfw_ctrl->ch_ctrl;
-
-	return readl(&ch_ctrl[channel].rs_status) & C_RS_DCD;
+	return readl(&info->u.cyz.ch_ctrl->rs_status) & C_RS_DCD;
 }
 
 static void cyz_dtr_rts(struct tty_port *port, int raise)
@@ -4189,22 +4080,16 @@ static void cyz_dtr_rts(struct tty_port *port, int raise)
 	struct cyclades_port *info = container_of(port, struct cyclades_port,
 			port);
 	struct cyclades_card *cinfo = info->card;
-	void __iomem *base = cinfo->base_addr;
-	struct FIRM_ID __iomem *firm_id = base + ID_ADDRESS;
-	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct CH_CTRL __iomem *ch_ctrl;
+	struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
 	int ret, channel = info->line - cinfo->first_line;
 	u32 rs;
 
-	zfw_ctrl = base + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-	ch_ctrl = zfw_ctrl->ch_ctrl;
-
-	rs = readl(&ch_ctrl[channel].rs_control);
+	rs = readl(&ch_ctrl->rs_control);
 	if (raise)
 		rs |= C_RS_RTS | C_RS_DTR;
 	else
 		rs &= ~(C_RS_RTS | C_RS_DTR);
-	cy_writel(&ch_ctrl[channel].rs_control, rs);
+	cy_writel(&ch_ctrl->rs_control, rs);
 	ret = cyz_issue_cmd(cinfo, channel, C_CM_IOCTLM, 0L);
 	if (ret != 0)
 		printk(KERN_ERR "%s: retval on ttyC%d was %x\n",
@@ -4235,8 +4120,7 @@ static const struct tty_port_operations cyz_port_ops = {
 static int __devinit cy_init_card(struct cyclades_card *cinfo)
 {
 	struct cyclades_port *info;
-	unsigned int port;
-	unsigned short chip_number;
+	unsigned int channel, port;
 
 	spin_lock_init(&cinfo->card_lock);
 	cinfo->intr_enabled = 0;
@@ -4248,9 +4132,9 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 		return -ENOMEM;
 	}
 
-	for (port = cinfo->first_line; port < cinfo->first_line + cinfo->nports;
-			port++) {
-		info = &cinfo->ports[port - cinfo->first_line];
+	for (channel = 0, port = cinfo->first_line; channel < cinfo->nports;
+			channel++, port++) {
+		info = &cinfo->ports[channel];
 		tty_port_init(&info->port);
 		info->magic = CYCLADES_MAGIC;
 		info->card = cinfo;
@@ -4263,8 +4147,17 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 		init_waitqueue_head(&info->delta_msr_wait);
 
 		if (cy_is_Z(cinfo)) {
+			struct FIRM_ID *firm_id = cinfo->base_addr + ID_ADDRESS;
+			struct ZFW_CTRL *zfw_ctrl;
+
 			info->port.ops = &cyz_port_ops;
 			info->type = PORT_STARTECH;
+
+			zfw_ctrl = cinfo->base_addr +
+				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
+			info->u.cyz.ch_ctrl = &zfw_ctrl->ch_ctrl[channel];
+			info->u.cyz.buf_ctrl = &zfw_ctrl->buf_ctrl[channel];
+
 			if (cinfo->hw_ver == ZO_V1)
 				info->xmit_fifo_size = CYZ_FIFO_SIZE;
 			else
@@ -4274,7 +4167,9 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 				cyz_rx_restart, (unsigned long)info);
 #endif
 		} else {
+			unsigned short chip_number;
 			int index = cinfo->bus_index;
+
 			info->port.ops = &cyy_port_ops;
 			info->type = PORT_CIRRUS;
 			info->xmit_fifo_size = CyMAX_CHAR_FIFO;
@@ -4282,7 +4177,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 			info->cor2 = CyETC;
 			info->cor3 = 0x08;	/* _very_ small rcv threshold */
 
-			chip_number = (port - cinfo->first_line) / 4;
+			chip_number = channel / CyPORTS_PER_CHIP;
 			info->chip_rev = readb(cinfo->base_addr +
 				      (cy_chip_offset[chip_number] << index) +
 				      (CyGFRCR << index));
@@ -4976,8 +4871,14 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		}
 		cy_card[card_no].num_chips = nchan / CyPORTS_PER_CHIP;
 	} else {
+		struct FIRM_ID __iomem *firm_id = addr2 + ID_ADDRESS;
+		struct ZFW_CTRL __iomem *zfw_ctrl;
+
+		zfw_ctrl = addr2 + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
+
 		cy_card[card_no].hw_ver = mailbox;
 		cy_card[card_no].num_chips = (unsigned int)-1;
+		cy_card[card_no].board_ctrl = &zfw_ctrl->board_ctrl;
 #ifdef CONFIG_CYZ_INTR
 		/* allocate IRQ only if board has an IRQ */
 		if (irq != 0 && irq != 255) {
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index 1fbdea4f08eb..1eb87a6a2f6b 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -499,6 +499,7 @@ struct cyclades_card {
 		void __iomem *p9050;
 		struct RUNTIME_9060 __iomem *p9060;
 	} ctl_addr;
+	struct BOARD_CTRL __iomem *board_ctrl;	/* cyz specific */
 	int irq;
 	unsigned int num_chips;	/* 0 if card absent, -1 if Z/PCI, else Y */
 	unsigned int first_line;	/* minor number of first channel on card */
@@ -541,6 +542,15 @@ struct cyclades_port {
 	int                     magic;
 	struct tty_port		port;
 	struct cyclades_card	*card;
+	union {
+		struct {
+			int filler;
+		} cyy;
+		struct {
+			struct CH_CTRL __iomem	*ch_ctrl;
+			struct BUF_CTRL __iomem	*buf_ctrl;
+		} cyz;
+	} u;
 	int			line;
 	int			flags; 		/* defined in tty.h */
 	int                     type;		/* UART type */
-- 
cgit v1.2.3-71-gd317


From 3aeea5b92210083c7cffd4f08a0bb141d3f2d574 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Sat, 19 Sep 2009 13:13:16 -0700
Subject: cyclades: introduce cyy_readb/writeb

Add helpers for io operations, so that we can eliminate huge
amount of supporting code. It is now centralized in those
helpers and used values are precomputed in the init phase.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/cyclades.c  | 448 +++++++++++++++++------------------------------
 include/linux/cyclades.h |   2 +-
 2 files changed, 165 insertions(+), 285 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index ed66c3ab230d..cf2874a89c8c 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -90,7 +90,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-static void cy_throttle(struct tty_struct *tty);
 static void cy_send_xchar(struct tty_struct *tty, char ch);
 
 #ifndef SERIAL_XMIT_SIZE
@@ -298,6 +297,20 @@ static void cyz_rx_restart(unsigned long);
 static struct timer_list cyz_rx_full_timer[NR_PORTS];
 #endif				/* CONFIG_CYZ_INTR */
 
+static inline void cyy_writeb(struct cyclades_port *port, u32 reg, u8 val)
+{
+	struct cyclades_card *card = port->card;
+
+	cy_writeb(port->u.cyy.base_addr + (reg << card->bus_index), val);
+}
+
+static inline u8 cyy_readb(struct cyclades_port *port, u32 reg)
+{
+	struct cyclades_card *card = port->card;
+
+	return readb(port->u.cyy.base_addr + (reg << card->bus_index));
+}
+
 static inline bool cy_is_Z(struct cyclades_card *card)
 {
 	return card->num_chips == (unsigned int)-1;
@@ -350,13 +363,14 @@ static inline int serial_paranoia_check(struct cyclades_port *info,
 
    This function is only called from inside spinlock-protected code.
  */
-static int cyy_issue_cmd(void __iomem *base_addr, u_char cmd, int index)
+static int __cyy_issue_cmd(void __iomem *base_addr, u8 cmd, int index)
 {
+	void __iomem *ccr = base_addr + (CyCCR << index);
 	unsigned int i;
 
 	/* Check to see that the previous command has completed */
 	for (i = 0; i < 100; i++) {
-		if (readb(base_addr + (CyCCR << index)) == 0)
+		if (readb(ccr) == 0)
 			break;
 		udelay(10L);
 	}
@@ -366,10 +380,16 @@ static int cyy_issue_cmd(void __iomem *base_addr, u_char cmd, int index)
 		return -1;
 
 	/* Issue the new command */
-	cy_writeb(base_addr + (CyCCR << index), cmd);
+	cy_writeb(ccr, cmd);
 
 	return 0;
-}				/* cyy_issue_cmd */
+}
+
+static inline int cyy_issue_cmd(struct cyclades_port *port, u8 cmd)
+{
+	return __cyy_issue_cmd(port->u.cyy.base_addr, cmd,
+			port->card->bus_index);
+}
 
 #ifdef CONFIG_ISA
 /* ISA interrupt detection code */
@@ -394,7 +414,7 @@ static unsigned detect_isa_irq(void __iomem *address)
 	/* Enable the Tx interrupts on the CD1400 */
 	local_irq_save(flags);
 	cy_writeb(address + (CyCAR << index), 0);
-	cyy_issue_cmd(address, CyCHAN_CTL | CyENB_XMTR, index);
+	__cyy_issue_cmd(address, CyCHAN_CTL | CyENB_XMTR, index);
 
 	cy_writeb(address + (CyCAR << index), 0);
 	cy_writeb(address + (CySRER << index),
@@ -428,7 +448,7 @@ static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
 	struct cyclades_port *info;
 	struct tty_struct *tty;
 	int len, index = cinfo->bus_index;
-	u8 save_xir, channel, save_car, data, char_count;
+	u8 ivr, save_xir, channel, save_car, data, char_count;
 
 #ifdef CY_DEBUG_INTERRUPTS
 	printk(KERN_DEBUG "cyy_interrupt: rcvd intr, chip %d\n", chip);
@@ -437,26 +457,25 @@ static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
 	save_xir = readb(base_addr + (CyRIR << index));
 	channel = save_xir & CyIRChannel;
 	info = &cinfo->ports[channel + chip * 4];
-	save_car = readb(base_addr + (CyCAR << index));
-	cy_writeb(base_addr + (CyCAR << index), save_xir);
+	save_car = cyy_readb(info, CyCAR);
+	cyy_writeb(info, CyCAR, save_xir);
+	ivr = cyy_readb(info, CyRIVR) & CyIVRMask;
 
 	tty = tty_port_tty_get(&info->port);
 	/* if there is nowhere to put the data, discard it */
 	if (tty == NULL) {
-		if ((readb(base_addr + (CyRIVR << index)) & CyIVRMask) ==
-				CyIVRRxEx) {	/* exception */
-			data = readb(base_addr + (CyRDSR << index));
+		if (ivr == CyIVRRxEx) {	/* exception */
+			data = cyy_readb(info, CyRDSR);
 		} else {	/* normal character reception */
-			char_count = readb(base_addr + (CyRDCR << index));
+			char_count = cyy_readb(info, CyRDCR);
 			while (char_count--)
-				data = readb(base_addr + (CyRDSR << index));
+				data = cyy_readb(info, CyRDSR);
 		}
 		goto end;
 	}
 	/* there is an open port for this data */
-	if ((readb(base_addr + (CyRIVR << index)) & CyIVRMask) ==
-			CyIVRRxEx) {	/* exception */
-		data = readb(base_addr + (CyRDSR << index));
+	if (ivr == CyIVRRxEx) {	/* exception */
+		data = cyy_readb(info, CyRDSR);
 
 		/* For statistics only */
 		if (data & CyBREAK)
@@ -477,22 +496,22 @@ static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
 			if (data & info->read_status_mask) {
 				if (data & CyBREAK) {
 					tty_insert_flip_char(tty,
-						readb(base_addr + (CyRDSR <<
-							index)), TTY_BREAK);
+						cyy_readb(info, CyRDSR),
+						TTY_BREAK);
 					info->icount.rx++;
 					if (info->port.flags & ASYNC_SAK)
 						do_SAK(tty);
 				} else if (data & CyFRAME) {
 					tty_insert_flip_char(tty,
-						readb(base_addr + (CyRDSR <<
-							index)), TTY_FRAME);
+						cyy_readb(info, CyRDSR),
+						TTY_FRAME);
 					info->icount.rx++;
 					info->idle_stats.frame_errs++;
 				} else if (data & CyPARITY) {
 					/* Pieces of seven... */
 					tty_insert_flip_char(tty,
-						readb(base_addr + (CyRDSR <<
-							index)), TTY_PARITY);
+						cyy_readb(info, CyRDSR),
+						TTY_PARITY);
 					info->icount.rx++;
 					info->idle_stats.parity_errs++;
 				} else if (data & CyOVERRUN) {
@@ -504,8 +523,8 @@ static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
 					   the next incoming character.
 					 */
 					tty_insert_flip_char(tty,
-						readb(base_addr + (CyRDSR <<
-							index)), TTY_FRAME);
+						cyy_readb(info, CyRDSR),
+						TTY_FRAME);
 					info->icount.rx++;
 					info->idle_stats.overruns++;
 				/* These two conditions may imply */
@@ -529,7 +548,7 @@ static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
 		}
 	} else {	/* normal character reception */
 		/* load # chars available from the chip */
-		char_count = readb(base_addr + (CyRDCR << index));
+		char_count = cyy_readb(info, CyRDCR);
 
 #ifdef CY_ENABLE_MONITORING
 		++info->mon.int_count;
@@ -540,7 +559,7 @@ static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
 #endif
 		len = tty_buffer_request_room(tty, char_count);
 		while (len--) {
-			data = readb(base_addr + (CyRDSR << index));
+			data = cyy_readb(info, CyRDSR);
 			tty_insert_flip_char(tty, data, TTY_NORMAL);
 			info->idle_stats.recv_bytes++;
 			info->icount.rx++;
@@ -554,8 +573,8 @@ static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
 	tty_kref_put(tty);
 end:
 	/* end of service */
-	cy_writeb(base_addr + (CyRIR << index), save_xir & 0x3f);
-	cy_writeb(base_addr + (CyCAR << index), save_car);
+	cyy_writeb(info, CyRIR, save_xir & 0x3f);
+	cyy_writeb(info, CyCAR, save_car);
 }
 
 static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
@@ -588,8 +607,7 @@ static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
 	info = &cinfo->ports[channel + chip * 4];
 	tty = tty_port_tty_get(&info->port);
 	if (tty == NULL) {
-		cy_writeb(base_addr + (CySRER << index),
-			  readb(base_addr + (CySRER << index)) & ~CyTxRdy);
+		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyTxRdy);
 		goto end;
 	}
 
@@ -598,7 +616,7 @@ static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
 
 	if (info->x_char) {	/* send special char */
 		outch = info->x_char;
-		cy_writeb(base_addr + (CyTDR << index), outch);
+		cyy_writeb(info, CyTDR, outch);
 		char_count--;
 		info->icount.tx++;
 		info->x_char = 0;
@@ -606,14 +624,14 @@ static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
 
 	if (info->breakon || info->breakoff) {
 		if (info->breakon) {
-			cy_writeb(base_addr + (CyTDR << index), 0);
-			cy_writeb(base_addr + (CyTDR << index), 0x81);
+			cyy_writeb(info, CyTDR, 0);
+			cyy_writeb(info, CyTDR, 0x81);
 			info->breakon = 0;
 			char_count -= 2;
 		}
 		if (info->breakoff) {
-			cy_writeb(base_addr + (CyTDR << index), 0);
-			cy_writeb(base_addr + (CyTDR << index), 0x83);
+			cyy_writeb(info, CyTDR, 0);
+			cyy_writeb(info, CyTDR, 0x83);
 			info->breakoff = 0;
 			char_count -= 2;
 		}
@@ -621,27 +639,23 @@ static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
 
 	while (char_count-- > 0) {
 		if (!info->xmit_cnt) {
-			if (readb(base_addr + (CySRER << index)) & CyTxMpty) {
-				cy_writeb(base_addr + (CySRER << index),
-					readb(base_addr + (CySRER << index)) &
-						~CyTxMpty);
+			if (cyy_readb(info, CySRER) & CyTxMpty) {
+				cyy_writeb(info, CySRER,
+					cyy_readb(info, CySRER) & ~CyTxMpty);
 			} else {
-				cy_writeb(base_addr + (CySRER << index),
-					(readb(base_addr + (CySRER << index)) &
-						~CyTxRdy) | CyTxMpty);
+				cyy_writeb(info, CySRER, CyTxMpty |
+					(cyy_readb(info, CySRER) & ~CyTxRdy));
 			}
 			goto done;
 		}
 		if (info->port.xmit_buf == NULL) {
-			cy_writeb(base_addr + (CySRER << index),
-				readb(base_addr + (CySRER << index)) &
-					~CyTxRdy);
+			cyy_writeb(info, CySRER,
+				cyy_readb(info, CySRER) & ~CyTxRdy);
 			goto done;
 		}
 		if (tty->stopped || tty->hw_stopped) {
-			cy_writeb(base_addr + (CySRER << index),
-				readb(base_addr + (CySRER << index)) &
-					~CyTxRdy);
+			cyy_writeb(info, CySRER,
+				cyy_readb(info, CySRER) & ~CyTxRdy);
 			goto done;
 		}
 		/* Because the Embedded Transmit Commands have been enabled,
@@ -658,15 +672,15 @@ static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
 			info->xmit_cnt--;
 			info->xmit_tail = (info->xmit_tail + 1) &
 					(SERIAL_XMIT_SIZE - 1);
-			cy_writeb(base_addr + (CyTDR << index), outch);
+			cyy_writeb(info, CyTDR, outch);
 			info->icount.tx++;
 		} else {
 			if (char_count > 1) {
 				info->xmit_cnt--;
 				info->xmit_tail = (info->xmit_tail + 1) &
 					(SERIAL_XMIT_SIZE - 1);
-				cy_writeb(base_addr + (CyTDR << index), outch);
-				cy_writeb(base_addr + (CyTDR << index), 0);
+				cyy_writeb(info, CyTDR, outch);
+				cyy_writeb(info, CyTDR, 0);
 				info->icount.tx++;
 				char_count--;
 			}
@@ -678,8 +692,8 @@ done:
 	tty_kref_put(tty);
 end:
 	/* end of service */
-	cy_writeb(base_addr + (CyTIR << index), save_xir & 0x3f);
-	cy_writeb(base_addr + (CyCAR << index), save_car);
+	cyy_writeb(info, CyTIR, save_xir & 0x3f);
+	cyy_writeb(info, CyCAR, save_car);
 }
 
 static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
@@ -694,11 +708,11 @@ static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
 	save_xir = readb(base_addr + (CyMIR << index));
 	channel = save_xir & CyIRChannel;
 	info = &cinfo->ports[channel + chip * 4];
-	save_car = readb(base_addr + (CyCAR << index));
-	cy_writeb(base_addr + (CyCAR << index), save_xir);
+	save_car = cyy_readb(info, CyCAR);
+	cyy_writeb(info, CyCAR, save_xir);
 
-	mdm_change = readb(base_addr + (CyMISR << index));
-	mdm_status = readb(base_addr + (CyMSVR1 << index));
+	mdm_change = cyy_readb(info, CyMISR);
+	mdm_status = cyy_readb(info, CyMSVR1);
 
 	tty = tty_port_tty_get(&info->port);
 	if (!tty)
@@ -730,9 +744,8 @@ static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
 				/* cy_start isn't used
 				   because... !!! */
 				tty->hw_stopped = 0;
-				cy_writeb(base_addr + (CySRER << index),
-					readb(base_addr + (CySRER << index)) |
-						CyTxRdy);
+				cyy_writeb(info, CySRER,
+					cyy_readb(info, CySRER) | CyTxRdy);
 				tty_wakeup(tty);
 			}
 		} else {
@@ -740,9 +753,8 @@ static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
 				/* cy_stop isn't used
 				   because ... !!! */
 				tty->hw_stopped = 1;
-				cy_writeb(base_addr + (CySRER << index),
-					readb(base_addr + (CySRER << index)) &
-						~CyTxRdy);
+				cyy_writeb(info, CySRER,
+					cyy_readb(info, CySRER) & ~CyTxRdy);
 			}
 		}
 	}
@@ -753,8 +765,8 @@ static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
 	tty_kref_put(tty);
 end:
 	/* end of service */
-	cy_writeb(base_addr + (CyMIR << index), save_xir & 0x3f);
-	cy_writeb(base_addr + (CyCAR << index), save_car);
+	cyy_writeb(info, CyMIR, save_xir & 0x3f);
+	cyy_writeb(info, CyCAR, save_car);
 }
 
 /* The real interrupt service routine is called
@@ -829,15 +841,10 @@ static void cyy_change_rts_dtr(struct cyclades_port *info, unsigned int set,
 		unsigned int clear)
 {
 	struct cyclades_card *card = info->card;
-	void __iomem *base_addr;
-	int chip, channel, index;
+	int channel = info->line - card->first_line;
 	u32 rts, dtr, msvrr, msvrd;
 
-	channel = info->line - card->first_line;
-	chip = channel >> 2;
 	channel &= 0x03;
-	index = card->bus_index;
-	base_addr = card->base_addr + (cy_chip_offset[chip] << index);
 
 	if (info->rtsdtr_inv) {
 		msvrr = CyMSVR2;
@@ -851,31 +858,31 @@ static void cyy_change_rts_dtr(struct cyclades_port *info, unsigned int set,
 		dtr = CyDTR;
 	}
 	if (set & TIOCM_RTS) {
-		cy_writeb(base_addr + (CyCAR << index), (u8)channel);
-		cy_writeb(base_addr + (msvrr << index), rts);
+		cyy_writeb(info, CyCAR, channel);
+		cyy_writeb(info, msvrr, rts);
 	}
 	if (clear & TIOCM_RTS) {
-		cy_writeb(base_addr + (CyCAR << index), (u8)channel);
-		cy_writeb(base_addr + (msvrr << index), ~rts);
+		cyy_writeb(info, CyCAR, channel);
+		cyy_writeb(info, msvrr, ~rts);
 	}
 	if (set & TIOCM_DTR) {
-		cy_writeb(base_addr + (CyCAR << index), (u8)channel);
-		cy_writeb(base_addr + (msvrd << index), dtr);
+		cyy_writeb(info, CyCAR, channel);
+		cyy_writeb(info, msvrd, dtr);
 #ifdef CY_DEBUG_DTR
 		printk(KERN_DEBUG "cyc:set_modem_info raising DTR\n");
 		printk(KERN_DEBUG "     status: 0x%x, 0x%x\n",
-			readb(base_addr + (CyMSVR1 << index)),
-			readb(base_addr + (CyMSVR2 << index)));
+			cyy_readb(info, CyMSVR1),
+			cyy_readb(info, CyMSVR2));
 #endif
 	}
 	if (clear & TIOCM_DTR) {
-		cy_writeb(base_addr + (CyCAR << index), (u8)channel);
-		cy_writeb(base_addr + (msvrd << index), ~dtr);
+		cyy_writeb(info, CyCAR, channel);
+		cyy_writeb(info, msvrd, ~dtr);
 #ifdef CY_DEBUG_DTR
 		printk(KERN_DEBUG "cyc:set_modem_info dropping DTR\n");
 		printk(KERN_DEBUG "     status: 0x%x, 0x%x\n",
-			readb(base_addr + (CyMSVR1 << index)),
-			readb(base_addr + (CyMSVR2 << index)));
+			cyy_readb(info, CyMSVR1),
+			cyy_readb(info, CyMSVR2));
 #endif
 	}
 }
@@ -1290,7 +1297,6 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
 	struct cyclades_card *card;
 	unsigned long flags;
 	int retval = 0;
-	void __iomem *base_addr;
 	int channel;
 	unsigned long page;
 
@@ -1321,31 +1327,21 @@ static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
 	cy_set_line_char(info, tty);
 
 	if (!cy_is_Z(card)) {
-		int chip = channel >> 2;
-		int index = card->bus_index;
 		channel &= 0x03;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
 
-#ifdef CY_DEBUG_OPEN
-		printk(KERN_DEBUG "cyc startup card %d, chip %d, channel %d, "
-				"base_addr %p\n",
-				card, chip, channel, base_addr);
-#endif
 		spin_lock_irqsave(&card->card_lock, flags);
 
-		cy_writeb(base_addr + (CyCAR << index), (u_char) channel);
+		cyy_writeb(info, CyCAR, channel);
 
-		cy_writeb(base_addr + (CyRTPR << index),
+		cyy_writeb(info, CyRTPR,
 			(info->default_timeout ? info->default_timeout : 0x02));
 		/* 10ms rx timeout */
 
-		cyy_issue_cmd(base_addr, CyCHAN_CTL | CyENB_RCVR | CyENB_XMTR,
-				index);
+		cyy_issue_cmd(info, CyCHAN_CTL | CyENB_RCVR | CyENB_XMTR);
 
 		cyy_change_rts_dtr(info, TIOCM_RTS | TIOCM_DTR, 0);
 
-		cy_writeb(base_addr + (CySRER << index),
-			readb(base_addr + (CySRER << index)) | CyRxData);
+		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyRxData);
 	} else {
 		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
 
@@ -1423,23 +1419,14 @@ errout:
 
 static void start_xmit(struct cyclades_port *info)
 {
-	struct cyclades_card *card;
+	struct cyclades_card *card = info->card;
 	unsigned long flags;
-	void __iomem *base_addr;
-	int chip, channel, index;
+	int channel = info->line - card->first_line;
 
-	card = info->card;
-	channel = info->line - card->first_line;
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
-
 		spin_lock_irqsave(&card->card_lock, flags);
-		cy_writeb(base_addr + (CyCAR << index), channel);
-		cy_writeb(base_addr + (CySRER << index),
-			readb(base_addr + (CySRER << index)) | CyTxRdy);
+		cyy_writeb(info, CyCAR, channel & 0x03);
+		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyTxRdy);
 		spin_unlock_irqrestore(&card->card_lock, flags);
 	} else {
 #ifdef CONFIG_CYZ_INTR
@@ -1466,8 +1453,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 {
 	struct cyclades_card *card;
 	unsigned long flags;
-	void __iomem *base_addr;
-	int chip, channel, index;
+	int channel;
 
 	if (!(info->port.flags & ASYNC_INITIALIZED))
 		return;
@@ -1475,17 +1461,6 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 	card = info->card;
 	channel = info->line - card->first_line;
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
-
-#ifdef CY_DEBUG_OPEN
-		printk(KERN_DEBUG "cyc shutdown Y card %d, chip %d, "
-				"channel %d, base_addr %p\n",
-				card, chip, channel, base_addr);
-#endif
-
 		spin_lock_irqsave(&card->card_lock, flags);
 
 		/* Clear delta_msr_wait queue to avoid mem leaks. */
@@ -1500,7 +1475,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 		if (tty->termios->c_cflag & HUPCL)
 			cyy_change_rts_dtr(info, 0, TIOCM_RTS | TIOCM_DTR);
 
-		cyy_issue_cmd(base_addr, CyCHAN_CTL | CyDIS_RCVR, index);
+		cyy_issue_cmd(info, CyCHAN_CTL | CyDIS_RCVR);
 		/* it may be appropriate to clear _XMIT at
 		   some later date (after testing)!!! */
 
@@ -1677,8 +1652,6 @@ static void cy_wait_until_sent(struct tty_struct *tty, int timeout)
 {
 	struct cyclades_card *card;
 	struct cyclades_port *info = tty->driver_data;
-	void __iomem *base_addr;
-	int chip, channel, index;
 	unsigned long orig_jiffies;
 	int char_time;
 
@@ -1722,13 +1695,8 @@ static void cy_wait_until_sent(struct tty_struct *tty, int timeout)
 		timeout, char_time, jiffies);
 #endif
 	card = info->card;
-	channel = (info->line) - (card->first_line);
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
-		while (readb(base_addr + (CySRER << index)) & CyTxRdy) {
+		while (cyy_readb(info, CySRER) & CyTxRdy) {
 #ifdef CY_DEBUG_WAIT_UNTIL_SENT
 			printk(KERN_DEBUG "Not clean (jiff=%lu)...", jiffies);
 #endif
@@ -1790,6 +1758,7 @@ static void cy_close(struct tty_struct *tty, struct file *filp)
 	struct cyclades_port *info = tty->driver_data;
 	struct cyclades_card *card;
 	unsigned long flags;
+	int channel;
 
 	if (!info || serial_paranoia_check(info, tty->name, "cy_close"))
 		return;
@@ -1799,18 +1768,13 @@ static void cy_close(struct tty_struct *tty, struct file *filp)
 	if (!tty_port_close_start(&info->port, tty, filp))
 		return;
 
+	channel = info->line - card->first_line;
 	spin_lock_irqsave(&card->card_lock, flags);
 
 	if (!cy_is_Z(card)) {
-		int channel = info->line - card->first_line;
-		int index = card->bus_index;
-		void __iomem *base_addr = card->base_addr +
-			(cy_chip_offset[channel >> 2] << index);
 		/* Stop accepting input */
-		channel &= 0x03;
-		cy_writeb(base_addr + (CyCAR << index), (u_char) channel);
-		cy_writeb(base_addr + (CySRER << index),
-			  readb(base_addr + (CySRER << index)) & ~CyRxData);
+		cyy_writeb(info, CyCAR, channel & 0x03);
+		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyRxData);
 		if (info->port.flags & ASYNC_INITIALIZED) {
 			/* Waiting for on-board buffers to be empty before
 			   closing the port */
@@ -1823,7 +1787,6 @@ static void cy_close(struct tty_struct *tty, struct file *filp)
 		/* Waiting for on-board buffers to be empty before closing
 		   the port */
 		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-		int channel = info->line - card->first_line;
 		int retval;
 
 		if (readl(&ch_ctrl->flow_status) != C_FS_TXIDLE) {
@@ -2064,8 +2027,7 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 {
 	struct cyclades_card *card;
 	unsigned long flags;
-	void __iomem *base_addr;
-	int chip, channel, index;
+	int channel;
 	unsigned cflag, iflag;
 	int baud, baud_rate = 0;
 	int i;
@@ -2095,9 +2057,6 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 	channel = info->line - card->first_line;
 
 	if (!cy_is_Z(card)) {
-
-		index = card->bus_index;
-
 		/* baud rate */
 		baud = tty_get_baud_rate(tty);
 		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
@@ -2208,70 +2167,67 @@ static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
 	    cable.  Contact Marcio Saito for details.
 	 ***********************************************/
 
-		chip = channel >> 2;
 		channel &= 0x03;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
 
 		spin_lock_irqsave(&card->card_lock, flags);
-		cy_writeb(base_addr + (CyCAR << index), (u_char) channel);
+		cyy_writeb(info, CyCAR, channel);
 
 		/* tx and rx baud rate */
 
-		cy_writeb(base_addr + (CyTCOR << index), info->tco);
-		cy_writeb(base_addr + (CyTBPR << index), info->tbpr);
-		cy_writeb(base_addr + (CyRCOR << index), info->rco);
-		cy_writeb(base_addr + (CyRBPR << index), info->rbpr);
+		cyy_writeb(info, CyTCOR, info->tco);
+		cyy_writeb(info, CyTBPR, info->tbpr);
+		cyy_writeb(info, CyRCOR, info->rco);
+		cyy_writeb(info, CyRBPR, info->rbpr);
 
 		/* set line characteristics  according configuration */
 
-		cy_writeb(base_addr + (CySCHR1 << index), START_CHAR(tty));
-		cy_writeb(base_addr + (CySCHR2 << index), STOP_CHAR(tty));
-		cy_writeb(base_addr + (CyCOR1 << index), info->cor1);
-		cy_writeb(base_addr + (CyCOR2 << index), info->cor2);
-		cy_writeb(base_addr + (CyCOR3 << index), info->cor3);
-		cy_writeb(base_addr + (CyCOR4 << index), info->cor4);
-		cy_writeb(base_addr + (CyCOR5 << index), info->cor5);
+		cyy_writeb(info, CySCHR1, START_CHAR(tty));
+		cyy_writeb(info, CySCHR2, STOP_CHAR(tty));
+		cyy_writeb(info, CyCOR1, info->cor1);
+		cyy_writeb(info, CyCOR2, info->cor2);
+		cyy_writeb(info, CyCOR3, info->cor3);
+		cyy_writeb(info, CyCOR4, info->cor4);
+		cyy_writeb(info, CyCOR5, info->cor5);
 
-		cyy_issue_cmd(base_addr, CyCOR_CHANGE | CyCOR1ch | CyCOR2ch |
-				CyCOR3ch, index);
+		cyy_issue_cmd(info, CyCOR_CHANGE | CyCOR1ch | CyCOR2ch |
+				CyCOR3ch);
 
 		/* !!! Is this needed? */
-		cy_writeb(base_addr + (CyCAR << index), (u_char) channel);
-		cy_writeb(base_addr + (CyRTPR << index),
+		cyy_writeb(info, CyCAR, channel);
+		cyy_writeb(info, CyRTPR,
 			(info->default_timeout ? info->default_timeout : 0x02));
 		/* 10ms rx timeout */
 
 		if (C_CLOCAL(tty)) {
 			/* without modem intr */
-			cy_writeb(base_addr + (CySRER << index),
-				readb(base_addr + (CySRER << index)) | CyMdmCh);
+			cyy_writeb(info, CySRER,
+					cyy_readb(info, CySRER) | CyMdmCh);
 			/* act on 1->0 modem transitions */
 			if ((cflag & CRTSCTS) && info->rflow) {
-				cy_writeb(base_addr + (CyMCOR1 << index),
+				cyy_writeb(info, CyMCOR1,
 					  (CyCTS | rflow_thr[i]));
 			} else {
-				cy_writeb(base_addr + (CyMCOR1 << index),
+				cyy_writeb(info, CyMCOR1,
 					  CyCTS);
 			}
 			/* act on 0->1 modem transitions */
-			cy_writeb(base_addr + (CyMCOR2 << index), CyCTS);
+			cyy_writeb(info, CyMCOR2, CyCTS);
 		} else {
 			/* without modem intr */
-			cy_writeb(base_addr + (CySRER << index),
-				  readb(base_addr +
-					   (CySRER << index)) | CyMdmCh);
+			cyy_writeb(info, CySRER,
+					cyy_readb(info, CySRER) | CyMdmCh);
 			/* act on 1->0 modem transitions */
 			if ((cflag & CRTSCTS) && info->rflow) {
-				cy_writeb(base_addr + (CyMCOR1 << index),
+				cyy_writeb(info, CyMCOR1,
 					  (CyDSR | CyCTS | CyRI | CyDCD |
 					   rflow_thr[i]));
 			} else {
-				cy_writeb(base_addr + (CyMCOR1 << index),
-					  CyDSR | CyCTS | CyRI | CyDCD);
+				cyy_writeb(info, CyMCOR1,
+						CyDSR | CyCTS | CyRI | CyDCD);
 			}
 			/* act on 0->1 modem transitions */
-			cy_writeb(base_addr + (CyMCOR2 << index),
-				  CyDSR | CyCTS | CyRI | CyDCD);
+			cyy_writeb(info, CyMCOR2,
+					CyDSR | CyCTS | CyRI | CyDCD);
 		}
 
 		if (i == 0)	/* baud rate is zero, turn off line */
@@ -2482,24 +2438,14 @@ check_and_exit:
  */
 static int get_lsr_info(struct cyclades_port *info, unsigned int __user *value)
 {
-	struct cyclades_card *card;
-	int chip, channel, index;
-	unsigned char status;
+	struct cyclades_card *card = info->card;
 	unsigned int result;
 	unsigned long flags;
-	void __iomem *base_addr;
+	u8 status;
 
-	card = info->card;
-	channel = (info->line) - (card->first_line);
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
-
 		spin_lock_irqsave(&card->card_lock, flags);
-		status = readb(base_addr + (CySRER << index)) &
-				(CyTxRdy | CyTxMpty);
+		status = cyy_readb(info, CySRER) & (CyTxRdy | CyTxMpty);
 		spin_unlock_irqrestore(&card->card_lock, flags);
 		result = (status ? 0 : TIOCSER_TEMT);
 	} else {
@@ -2513,29 +2459,23 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 {
 	struct cyclades_port *info = tty->driver_data;
 	struct cyclades_card *card;
-	void __iomem *base_addr;
-	int result, channel;
+	int result;
 
 	if (serial_paranoia_check(info, tty->name, __func__))
 		return -ENODEV;
 
 	card = info->card;
-	channel = info->line - card->first_line;
 
 	lock_kernel();
 	if (!cy_is_Z(card)) {
 		unsigned long flags;
-		unsigned char status;
-		int chip = channel >> 2;
-		int index = card->bus_index;
-
-		channel &= 0x03;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
+		int channel = info->line - card->first_line;
+		u8 status;
 
 		spin_lock_irqsave(&card->card_lock, flags);
-		cy_writeb(base_addr + (CyCAR << index), (u8)channel);
-		status = readb(base_addr + (CyMSVR1 << index));
-		status |= readb(base_addr + (CyMSVR2 << index));
+		cyy_writeb(info, CyCAR, channel & 0x03);
+		status = cyy_readb(info, CyMSVR1);
+		status |= cyy_readb(info, CyMSVR2);
 		spin_unlock_irqrestore(&card->card_lock, flags);
 
 		if (info->rtsdtr_inv) {
@@ -2689,26 +2629,16 @@ static int cy_break(struct tty_struct *tty, int break_state)
 
 static int set_threshold(struct cyclades_port *info, unsigned long value)
 {
-	struct cyclades_card *card;
-	void __iomem *base_addr;
-	int channel, chip, index;
+	struct cyclades_card *card = info->card;
 	unsigned long flags;
 
-	card = info->card;
-	channel = info->line - card->first_line;
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr =
-		    card->base_addr + (cy_chip_offset[chip] << index);
-
 		info->cor3 &= ~CyREC_FIFO;
 		info->cor3 |= value & CyREC_FIFO;
 
 		spin_lock_irqsave(&card->card_lock, flags);
-		cy_writeb(base_addr + (CyCOR3 << index), info->cor3);
-		cyy_issue_cmd(base_addr, CyCOR_CHANGE | CyCOR3ch, index);
+		cyy_writeb(info, CyCOR3, info->cor3);
+		cyy_issue_cmd(info, CyCOR_CHANGE | CyCOR3ch);
 		spin_unlock_irqrestore(&card->card_lock, flags);
 	}
 	return 0;
@@ -2717,20 +2647,10 @@ static int set_threshold(struct cyclades_port *info, unsigned long value)
 static int get_threshold(struct cyclades_port *info,
 						unsigned long __user *value)
 {
-	struct cyclades_card *card;
-	void __iomem *base_addr;
-	int channel, chip, index;
-	unsigned long tmp;
+	struct cyclades_card *card = info->card;
 
-	card = info->card;
-	channel = info->line - card->first_line;
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
-
-		tmp = readb(base_addr + (CyCOR3 << index)) & CyREC_FIFO;
+		u8 tmp = cyy_readb(info, CyCOR3) & CyREC_FIFO;
 		return put_user(tmp, value);
 	}
 	return 0;
@@ -2738,21 +2658,12 @@ static int get_threshold(struct cyclades_port *info,
 
 static int set_timeout(struct cyclades_port *info, unsigned long value)
 {
-	struct cyclades_card *card;
-	void __iomem *base_addr;
-	int channel, chip, index;
+	struct cyclades_card *card = info->card;
 	unsigned long flags;
 
-	card = info->card;
-	channel = info->line - card->first_line;
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
-
 		spin_lock_irqsave(&card->card_lock, flags);
-		cy_writeb(base_addr + (CyRTPR << index), value & 0xff);
+		cyy_writeb(info, CyRTPR, value & 0xff);
 		spin_unlock_irqrestore(&card->card_lock, flags);
 	}
 	return 0;
@@ -2761,20 +2672,10 @@ static int set_timeout(struct cyclades_port *info, unsigned long value)
 static int get_timeout(struct cyclades_port *info,
 						unsigned long __user *value)
 {
-	struct cyclades_card *card;
-	void __iomem *base_addr;
-	int channel, chip, index;
-	unsigned long tmp;
+	struct cyclades_card *card = info->card;
 
-	card = info->card;
-	channel = info->line - card->first_line;
 	if (!cy_is_Z(card)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		index = card->bus_index;
-		base_addr = card->base_addr + (cy_chip_offset[chip] << index);
-
-		tmp = readb(base_addr + (CyRTPR << index));
+		u8 tmp = cyy_readb(info, CyRTPR);
 		return put_user(tmp, value);
 	}
 	return 0;
@@ -3101,8 +3002,7 @@ static void cy_stop(struct tty_struct *tty)
 {
 	struct cyclades_card *cinfo;
 	struct cyclades_port *info = tty->driver_data;
-	void __iomem *base_addr;
-	int chip, channel, index;
+	int channel;
 	unsigned long flags;
 
 #ifdef CY_DEBUG_OTHER
@@ -3115,16 +3015,9 @@ static void cy_stop(struct tty_struct *tty)
 	cinfo = info->card;
 	channel = info->line - cinfo->first_line;
 	if (!cy_is_Z(cinfo)) {
-		index = cinfo->bus_index;
-		chip = channel >> 2;
-		channel &= 0x03;
-		base_addr = cinfo->base_addr + (cy_chip_offset[chip] << index);
-
 		spin_lock_irqsave(&cinfo->card_lock, flags);
-		cy_writeb(base_addr + (CyCAR << index),
-			(u_char)(channel & 0x0003)); /* index channel */
-		cy_writeb(base_addr + (CySRER << index),
-			  readb(base_addr + (CySRER << index)) & ~CyTxRdy);
+		cyy_writeb(info, CyCAR, channel & 0x03);
+		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyTxRdy);
 		spin_unlock_irqrestore(&cinfo->card_lock, flags);
 	}
 }				/* cy_stop */
@@ -3133,8 +3026,7 @@ static void cy_start(struct tty_struct *tty)
 {
 	struct cyclades_card *cinfo;
 	struct cyclades_port *info = tty->driver_data;
-	void __iomem *base_addr;
-	int chip, channel, index;
+	int channel;
 	unsigned long flags;
 
 #ifdef CY_DEBUG_OTHER
@@ -3146,17 +3038,10 @@ static void cy_start(struct tty_struct *tty)
 
 	cinfo = info->card;
 	channel = info->line - cinfo->first_line;
-	index = cinfo->bus_index;
 	if (!cy_is_Z(cinfo)) {
-		chip = channel >> 2;
-		channel &= 0x03;
-		base_addr = cinfo->base_addr + (cy_chip_offset[chip] << index);
-
 		spin_lock_irqsave(&cinfo->card_lock, flags);
-		cy_writeb(base_addr + (CyCAR << index),
-			(u_char) (channel & 0x0003));	/* index channel */
-		cy_writeb(base_addr + (CySRER << index),
-			  readb(base_addr + (CySRER << index)) | CyTxRdy);
+		cyy_writeb(info, CyCAR, channel & 0x03);
+		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyTxRdy);
 		spin_unlock_irqrestore(&cinfo->card_lock, flags);
 	}
 }				/* cy_start */
@@ -3185,18 +3070,13 @@ static int cyy_carrier_raised(struct tty_port *port)
 	struct cyclades_port *info = container_of(port, struct cyclades_port,
 			port);
 	struct cyclades_card *cinfo = info->card;
-	void __iomem *base = cinfo->base_addr;
 	unsigned long flags;
 	int channel = info->line - cinfo->first_line;
-	int chip = channel >> 2, index = cinfo->bus_index;
 	u32 cd;
 
-	channel &= 0x03;
-	base += cy_chip_offset[chip] << index;
-
 	spin_lock_irqsave(&cinfo->card_lock, flags);
-	cy_writeb(base + (CyCAR << index), (u8)channel);
-	cd = readb(base + (CyMSVR1 << index)) & CyDCD;
+	cyy_writeb(info, CyCAR, channel & 0x03);
+	cd = cyy_readb(info, CyMSVR1) & CyDCD;
 	spin_unlock_irqrestore(&cinfo->card_lock, flags);
 
 	return cd;
@@ -3326,9 +3206,9 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 			info->cor3 = 0x08;	/* _very_ small rcv threshold */
 
 			chip_number = channel / CyPORTS_PER_CHIP;
-			info->chip_rev = readb(cinfo->base_addr +
-				      (cy_chip_offset[chip_number] << index) +
-				      (CyGFRCR << index));
+			info->u.cyy.base_addr = cinfo->base_addr +
+				(cy_chip_offset[chip_number] << index);
+			info->chip_rev = cyy_readb(info, CyGFRCR);
 
 			if (info->chip_rev >= CD1400_REV_J) {
 				/* It is a CD1400 rev. J or later */
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index 1eb87a6a2f6b..bbebef7713b3 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -544,7 +544,7 @@ struct cyclades_port {
 	struct cyclades_card	*card;
 	union {
 		struct {
-			int filler;
+			void __iomem *base_addr;
 		} cyy;
 		struct {
 			struct CH_CTRL __iomem	*ch_ctrl;
-- 
cgit v1.2.3-71-gd317


From f8a7c1a976a6672204c7f4f0f694f33715dfa617 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:17 -0700
Subject: kfifo: Use "const" definitions

Currently kfifo cannot be used by parts of the kernel that use "const"
properly as kfifo itself does not use const for passed data blocks which
are indeed const.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/kfifo.h | 4 ++--
 kernel/kfifo.c        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 29f62e1733ff..ad6bdf5a5970 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -38,7 +38,7 @@ extern struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask,
 				 spinlock_t *lock);
 extern void kfifo_free(struct kfifo *fifo);
 extern unsigned int __kfifo_put(struct kfifo *fifo,
-				unsigned char *buffer, unsigned int len);
+				const unsigned char *buffer, unsigned int len);
 extern unsigned int __kfifo_get(struct kfifo *fifo,
 				unsigned char *buffer, unsigned int len);
 
@@ -77,7 +77,7 @@ static inline void kfifo_reset(struct kfifo *fifo)
  * bytes copied.
  */
 static inline unsigned int kfifo_put(struct kfifo *fifo,
-				     unsigned char *buffer, unsigned int len)
+				const unsigned char *buffer, unsigned int len)
 {
 	unsigned long flags;
 	unsigned int ret;
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
  * writer, you don't need extra locking to use these functions.
  */
 unsigned int __kfifo_put(struct kfifo *fifo,
-			 unsigned char *buffer, unsigned int len)
+			const unsigned char *buffer, unsigned int len)
 {
 	unsigned int l;
 
-- 
cgit v1.2.3-71-gd317


From 1c2f04937b3e397a5695953c6b82aa4c77d21eb8 Mon Sep 17 00:00:00 2001
From: Vikram Pandita <vikram.pandita@ti.com>
Date: Sat, 19 Sep 2009 13:13:19 -0700
Subject: serial: 8250: add IRQ trigger support

There is currently no provision for passing IRQ trigger flags for
serial IRQs with triggering requirements (such as GPIO IRQs)

This patch adds irqflags to plat_serial8250_port that can be passed
from board file to reqest_irq() of 8250 driver

Changes are backward compatible with boards passing UPF_SHARE_IRQ flag

Tested on Zoom2 board that has IRQF_TRIGGER_RISING requirement for 8250 irq

[Moved new flag to end to fix bugs in the original with the old_serial array
	-- Alan]

Signed-off-by: Vikram Pandita <vikram.pandita@ti.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/8250.c       | 14 +++++++++-----
 drivers/serial/8250.h       |  1 +
 include/linux/serial_8250.h |  1 +
 include/linux/serial_core.h |  1 +
 4 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index fb867a9f55e9..83168a6c3c05 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1677,7 +1677,7 @@ static int serial_link_irq_chain(struct uart_8250_port *up)
 		INIT_LIST_HEAD(&up->list);
 		i->head = &up->list;
 		spin_unlock_irq(&i->lock);
-
+		irq_flags |= up->port.irqflags;
 		ret = request_irq(up->port.irq, serial8250_interrupt,
 				  irq_flags, "serial", i);
 		if (ret < 0)
@@ -2026,7 +2026,7 @@ static int serial8250_startup(struct uart_port *port)
 		 * allow register changes to become visible.
 		 */
 		spin_lock_irqsave(&up->port.lock, flags);
-		if (up->port.flags & UPF_SHARE_IRQ)
+		if (up->port.irqflags & IRQF_SHARED)
 			disable_irq_nosync(up->port.irq);
 
 		wait_for_xmitr(up, UART_LSR_THRE);
@@ -2039,7 +2039,7 @@ static int serial8250_startup(struct uart_port *port)
 		iir = serial_in(up, UART_IIR);
 		serial_out(up, UART_IER, 0);
 
-		if (up->port.flags & UPF_SHARE_IRQ)
+		if (up->port.irqflags & IRQF_SHARED)
 			enable_irq(up->port.irq);
 		spin_unlock_irqrestore(&up->port.lock, flags);
 
@@ -2671,6 +2671,7 @@ static void __init serial8250_isa_init_ports(void)
 	     i++, up++) {
 		up->port.iobase   = old_serial_port[i].port;
 		up->port.irq      = irq_canonicalize(old_serial_port[i].irq);
+		up->port.irqflags = old_serial_port[i].irqflags;
 		up->port.uartclk  = old_serial_port[i].baud_base * 16;
 		up->port.flags    = old_serial_port[i].flags;
 		up->port.hub6     = old_serial_port[i].hub6;
@@ -2679,7 +2680,7 @@ static void __init serial8250_isa_init_ports(void)
 		up->port.regshift = old_serial_port[i].iomem_reg_shift;
 		set_io_from_upio(&up->port);
 		if (share_irqs)
-			up->port.flags |= UPF_SHARE_IRQ;
+			up->port.irqflags |= IRQF_SHARED;
 	}
 }
 
@@ -2869,6 +2870,7 @@ int __init early_serial_setup(struct uart_port *port)
 	p->iobase       = port->iobase;
 	p->membase      = port->membase;
 	p->irq          = port->irq;
+	p->irqflags     = port->irqflags;
 	p->uartclk      = port->uartclk;
 	p->fifosize     = port->fifosize;
 	p->regshift     = port->regshift;
@@ -2942,6 +2944,7 @@ static int __devinit serial8250_probe(struct platform_device *dev)
 		port.iobase		= p->iobase;
 		port.membase		= p->membase;
 		port.irq		= p->irq;
+		port.irqflags		= p->irqflags;
 		port.uartclk		= p->uartclk;
 		port.regshift		= p->regshift;
 		port.iotype		= p->iotype;
@@ -2954,7 +2957,7 @@ static int __devinit serial8250_probe(struct platform_device *dev)
 		port.serial_out		= p->serial_out;
 		port.dev		= &dev->dev;
 		if (share_irqs)
-			port.flags |= UPF_SHARE_IRQ;
+			port.irqflags |= IRQF_SHARED;
 		ret = serial8250_register_port(&port);
 		if (ret < 0) {
 			dev_err(&dev->dev, "unable to register port at index %d "
@@ -3096,6 +3099,7 @@ int serial8250_register_port(struct uart_port *port)
 		uart->port.iobase       = port->iobase;
 		uart->port.membase      = port->membase;
 		uart->port.irq          = port->irq;
+		uart->port.irqflags     = port->irqflags;
 		uart->port.uartclk      = port->uartclk;
 		uart->port.fifosize     = port->fifosize;
 		uart->port.regshift     = port->regshift;
diff --git a/drivers/serial/8250.h b/drivers/serial/8250.h
index 520260326f3d..6e19ea3e48d5 100644
--- a/drivers/serial/8250.h
+++ b/drivers/serial/8250.h
@@ -25,6 +25,7 @@ struct old_serial_port {
 	unsigned char io_type;
 	unsigned char *iomem_base;
 	unsigned short iomem_reg_shift;
+	unsigned long irqflags;
 };
 
 /*
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index d4d2a78ad43e..fb46aba11fb5 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -22,6 +22,7 @@ struct plat_serial8250_port {
 	void __iomem	*membase;	/* ioremap cookie or NULL */
 	resource_size_t	mapbase;	/* resource base */
 	unsigned int	irq;		/* interrupt number */
+	unsigned long	irqflags;	/* request_irq flags */
 	unsigned int	uartclk;	/* UART clock rate */
 	void            *private_data;
 	unsigned char	regshift;	/* register shift */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 23d2fb051f97..3cd255f0b211 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -265,6 +265,7 @@ struct uart_port {
 	unsigned int		(*serial_in)(struct uart_port *, int);
 	void			(*serial_out)(struct uart_port *, int, int);
 	unsigned int		irq;			/* irq number */
+	unsigned long		irqflags;		/* irq flags  */
 	unsigned int		uartclk;		/* base uart clock */
 	unsigned int		fifosize;		/* tx fifo size */
 	unsigned char		x_char;			/* xon/xoff char */
-- 
cgit v1.2.3-71-gd317


From 7ca0ff9ab3218ec443a7a9ad247e4650373ed41e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:20 -0700
Subject: tty: Add a full port_close function

Now we are extracting out methods for shutdown and the like we can add a
proper tty_port_close method that knows all the innards of the tty closing
process and hides the lot from the caller.

At some point in the future this will be paired with a similar open()
helper and the drivers can stick to hardware management.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/tty_port.c | 29 +++++++++++++++++++++++++++--
 include/linux/tty.h     |  8 +++++++-
 2 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 9769b1149f76..549bd0fa8bb6 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -96,6 +96,14 @@ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty)
 }
 EXPORT_SYMBOL(tty_port_tty_set);
 
+static void tty_port_shutdown(struct tty_port *port)
+{
+	if (port->ops->shutdown &&
+		test_and_clear_bit(ASYNC_INITIALIZED, &port->flags))
+			port->ops->shutdown(port);
+
+}
+
 /**
  *	tty_port_hangup		-	hangup helper
  *	@port: tty port
@@ -116,6 +124,7 @@ void tty_port_hangup(struct tty_port *port)
 	port->tty = NULL;
 	spin_unlock_irqrestore(&port->lock, flags);
 	wake_up_interruptible(&port->open_wait);
+	tty_port_shutdown(port);
 }
 EXPORT_SYMBOL(tty_port_hangup);
 
@@ -296,15 +305,17 @@ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct f
 
 	if (port->count) {
 		spin_unlock_irqrestore(&port->lock, flags);
+		if (port->ops->drop)
+			port->ops->drop(port);
 		return 0;
 	}
-	port->flags |= ASYNC_CLOSING;
+	set_bit(ASYNC_CLOSING, &port->flags);
 	tty->closing = 1;
 	spin_unlock_irqrestore(&port->lock, flags);
 	/* Don't block on a stalled port, just pull the chain */
 	if (tty->flow_stopped)
 		tty_driver_flush_buffer(tty);
-	if (port->flags & ASYNC_INITIALIZED &&
+	if (test_bit(ASYNCB_INITIALIZED, &port->flags) &&
 			port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, port->closing_wait);
 	if (port->drain_delay) {
@@ -318,6 +329,9 @@ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct f
 			timeout = 2 * HZ;
 		schedule_timeout_interruptible(timeout);
 	}
+	/* Don't call port->drop for the last reference. Callers will want
+	   to drop the last active reference in ->shutdown() or the tty
+	   shutdown path */
 	return 1;
 }
 EXPORT_SYMBOL(tty_port_close_start);
@@ -348,3 +362,14 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty)
 	spin_unlock_irqrestore(&port->lock, flags);
 }
 EXPORT_SYMBOL(tty_port_close_end);
+
+void tty_port_close(struct tty_port *port, struct tty_struct *tty,
+							struct file *filp)
+{
+	if (tty_port_close_start(port, tty, filp) == 0)
+		return;
+	tty_port_shutdown(port);
+	tty_port_close_end(port, tty);
+	tty_port_tty_set(port, NULL);
+}
+EXPORT_SYMBOL(tty_port_close);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index a916a318004e..ecb3d1ba3017 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -187,7 +187,12 @@ struct tty_port;
 struct tty_port_operations {
 	/* Return 1 if the carrier is raised */
 	int (*carrier_raised)(struct tty_port *port);
+	/* Control the DTR line */
 	void (*dtr_rts)(struct tty_port *port, int raise);
+	/* Called when the last close completes or a hangup finishes
+	   IFF the port was initialized. Do not use to free resources */
+	void (*shutdown)(struct tty_port *port);
+	void (*drop)(struct tty_port *port);
 };
 	
 struct tty_port {
@@ -459,7 +464,8 @@ extern int tty_port_block_til_ready(struct tty_port *port,
 extern int tty_port_close_start(struct tty_port *port,
 				struct tty_struct *tty, struct file *filp);
 extern void tty_port_close_end(struct tty_port *port, struct tty_struct *tty);
-
+extern void tty_port_close(struct tty_port *port,
+				struct tty_struct *tty, struct file *filp);
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
 extern int tty_set_ldisc(struct tty_struct *tty, int ldisc);
-- 
cgit v1.2.3-71-gd317


From 8b92e87d39bfd046e7581e1fe0f40eac40f88608 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:24 -0700
Subject: vt: add an event interface

This is needed and requested in various forms for ConsoleKit, screenblank
handling and the like so do the job with a single interface. Also build the
interface so that unlike VT_WAITACTIVE and friends it won't miss events.

FIXME: Should this be a waitactive ioctl or a new device file you can poll
and read events from. We need the code anyway to fix up the existing broken
wait for console switch logic but the ConsoleKit people would prefer the
new device to the ioctl we have here

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/vt.c       |   4 +-
 drivers/char/vt_ioctl.c | 185 +++++++++++++++++++++++++++++++++++-------------
 include/linux/vt.h      |  14 ++++
 include/linux/vt_kern.h |   3 +-
 4 files changed, 154 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index e47a4c88976b..33214d92ca4c 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -252,7 +252,6 @@ static void notify_update(struct vc_data *vc)
 	struct vt_notifier_param param = { .vc = vc };
 	atomic_notifier_call_chain(&vt_notifier_list, VT_UPDATE, &param);
 }
-
 /*
  *	Low-Level Functions
  */
@@ -935,6 +934,7 @@ static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc,
 
 	if (CON_IS_VISIBLE(vc))
 		update_screen(vc);
+	vt_event_post(VT_EVENT_RESIZE, vc->vc_num, vc->vc_num);
 	return err;
 }
 
@@ -3637,6 +3637,7 @@ void do_blank_screen(int entering_gfx)
 		blank_state = blank_vesa_wait;
 		mod_timer(&console_timer, jiffies + vesa_off_interval);
 	}
+	vt_event_post(VT_EVENT_BLANK, vc->vc_num, vc->vc_num);
 }
 EXPORT_SYMBOL(do_blank_screen);
 
@@ -3681,6 +3682,7 @@ void do_unblank_screen(int leaving_gfx)
 		console_blank_hook(0);
 	set_palette(vc);
 	set_cursor(vc);
+	vt_event_post(VT_EVENT_UNBLANK, vc->vc_num, vc->vc_num);
 }
 EXPORT_SYMBOL(do_unblank_screen);
 
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index 95189f288f8c..35ad94e65d0d 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -61,6 +61,133 @@ extern struct tty_driver *console_driver;
 
 static void complete_change_console(struct vc_data *vc);
 
+/*
+ *	User space VT_EVENT handlers
+ */
+
+struct vt_event_wait {
+	struct list_head list;
+	struct vt_event event;
+	int done;
+};
+
+static LIST_HEAD(vt_events);
+static DEFINE_SPINLOCK(vt_event_lock);
+static DECLARE_WAIT_QUEUE_HEAD(vt_event_waitqueue);
+
+/**
+ *	vt_event_post
+ *	@event: the event that occurred
+ *	@old: old console
+ *	@new: new console
+ *
+ *	Post an VT event to interested VT handlers
+ */
+
+void vt_event_post(unsigned int event, unsigned int old, unsigned int new)
+{
+	struct list_head *pos, *head;
+	unsigned long flags;
+	int wake = 0;
+
+	spin_lock_irqsave(&vt_event_lock, flags);
+	head = &vt_events;
+
+	list_for_each(pos, head) {
+		struct vt_event_wait *ve = list_entry(pos,
+						struct vt_event_wait, list);
+		if (!(ve->event.event & event))
+			continue;
+		ve->event.event = event;
+		/* kernel view is consoles 0..n-1, user space view is
+		   console 1..n with 0 meaning current, so we must bias */
+		ve->event.old = old + 1;
+		ve->event.new = new + 1;
+		wake = 1;
+		ve->done = 1;
+	}
+	spin_unlock_irqrestore(&vt_event_lock, flags);
+	if (wake)
+		wake_up_interruptible(&vt_event_waitqueue);
+}
+
+/**
+ *	vt_event_wait		-	wait for an event
+ *	@vw: our event
+ *
+ *	Waits for an event to occur which completes our vt_event_wait
+ *	structure. On return the structure has wv->done set to 1 for success
+ *	or 0 if some event such as a signal ended the wait.
+ */
+
+static void vt_event_wait(struct vt_event_wait *vw)
+{
+	unsigned long flags;
+	/* Prepare the event */
+	INIT_LIST_HEAD(&vw->list);
+	vw->done = 0;
+	/* Queue our event */
+	spin_lock_irqsave(&vt_event_lock, flags);
+	list_add(&vw->list, &vt_events);
+	spin_unlock_irqrestore(&vt_event_lock, flags);
+	/* Wait for it to pass */
+	wait_event_interruptible(vt_event_waitqueue, vw->done);
+	/* Dequeue it */
+	spin_lock_irqsave(&vt_event_lock, flags);
+	list_del(&vw->list);
+	spin_unlock_irqrestore(&vt_event_lock, flags);
+}
+
+/**
+ *	vt_event_wait_ioctl	-	event ioctl handler
+ *	@arg: argument to ioctl
+ *
+ *	Implement the VT_WAITEVENT ioctl using the VT event interface
+ */
+
+static int vt_event_wait_ioctl(struct vt_event __user *event)
+{
+	struct vt_event_wait vw;
+
+	if (copy_from_user(&vw.event, event, sizeof(struct vt_event)))
+		return -EFAULT;
+	/* Highest supported event for now */
+	if (vw.event.event & ~VT_MAX_EVENT)
+		return -EINVAL;
+
+	vt_event_wait(&vw);
+	/* If it occurred report it */
+	if (vw.done) {
+		if (copy_to_user(event, &vw.event, sizeof(struct vt_event)))
+			return -EFAULT;
+		return 0;
+	}
+	return -EINTR;
+}
+
+/**
+ *	vt_waitactive	-	active console wait
+ *	@event: event code
+ *	@n: new console
+ *
+ *	Helper for event waits. Used to implement the legacy
+ *	event waiting ioctls in terms of events
+ */
+
+int vt_waitactive(int n)
+{
+	struct vt_event_wait vw;
+	do {
+		if (n == fg_console + 1)
+			break;
+		vw.event.event = VT_EVENT_SWITCH;
+		vt_event_wait(&vw);
+		if (vw.done == 0)
+			return -EINTR;
+	} while (vw.event.new != n);
+	return 0;
+}
+
 /*
  * these are the valid i/o ports we're allowed to change. they map all the
  * video ports
@@ -360,6 +487,8 @@ do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, int perm, struct vc_
 	return 0;
 }
 
+
+
 /*
  * We handle the console-specific ioctl's here.  We allow the
  * capability to modify any console, not just the fg_console. 
@@ -851,7 +980,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		if (arg == 0 || arg > MAX_NR_CONSOLES)
 			ret = -ENXIO;
 		else
-			ret = vt_waitactive(arg - 1);
+			ret = vt_waitactive(arg);
 		break;
 
 	/*
@@ -1159,6 +1288,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		ret = put_user(vc->vc_hi_font_mask,
 					(unsigned short __user *)arg);
 		break;
+	case VT_WAITEVENT:
+		ret = vt_event_wait_ioctl((struct vt_event __user *)arg);
+		break;
 	default:
 		ret = -ENOIOCTLCMD;
 	}
@@ -1170,54 +1302,6 @@ eperm:
 	goto out;
 }
 
-/*
- * Sometimes we want to wait until a particular VT has been activated. We
- * do it in a very simple manner. Everybody waits on a single queue and
- * get woken up at once. Those that are satisfied go on with their business,
- * while those not ready go back to sleep. Seems overkill to add a wait
- * to each vt just for this - usually this does nothing!
- */
-static DECLARE_WAIT_QUEUE_HEAD(vt_activate_queue);
-
-/*
- * Sleeps until a vt is activated, or the task is interrupted. Returns
- * 0 if activation, -EINTR if interrupted by a signal handler.
- */
-int vt_waitactive(int vt)
-{
-	int retval;
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue(&vt_activate_queue, &wait);
-	for (;;) {
-		retval = 0;
-
-		/*
-		 * Synchronize with redraw_screen(). By acquiring the console
-		 * semaphore we make sure that the console switch is completed
-		 * before we return. If we didn't wait for the semaphore, we
-		 * could return at a point where fg_console has already been
-		 * updated, but the console switch hasn't been completed.
-		 */
-		acquire_console_sem();
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (vt == fg_console) {
-			release_console_sem();
-			break;
-		}
-		release_console_sem();
-		retval = -ERESTARTNOHAND;
-		if (signal_pending(current))
-			break;
-		schedule();
-	}
-	remove_wait_queue(&vt_activate_queue, &wait);
-	__set_current_state(TASK_RUNNING);
-	return retval;
-}
-
-#define vt_wake_waitactive() wake_up(&vt_activate_queue)
-
 void reset_vc(struct vc_data *vc)
 {
 	vc->vc_mode = KD_TEXT;
@@ -1262,6 +1346,7 @@ void vc_SAK(struct work_struct *work)
 static void complete_change_console(struct vc_data *vc)
 {
 	unsigned char old_vc_mode;
+	int old = fg_console;
 
 	last_console = fg_console;
 
@@ -1325,7 +1410,7 @@ static void complete_change_console(struct vc_data *vc)
 	/*
 	 * Wake anyone waiting for their VT to activate
 	 */
-	vt_wake_waitactive();
+	vt_event_post(VT_EVENT_SWITCH, old, vc->vc_num);
 	return;
 }
 
diff --git a/include/linux/vt.h b/include/linux/vt.h
index 02c1c0288770..89c03a11e193 100644
--- a/include/linux/vt.h
+++ b/include/linux/vt.h
@@ -74,4 +74,18 @@ struct vt_consize {
 #define VT_UNLOCKSWITCH 0x560C  /* allow vt switching */
 #define VT_GETHIFONTMASK 0x560D  /* return hi font mask */
 
+struct vt_event {
+	unsigned int event;
+#define VT_EVENT_SWITCH		0x0001	/* Console switch */
+#define VT_EVENT_BLANK		0x0002	/* Screen blank */
+#define VT_EVENT_UNBLANK	0x0004	/* Screen unblank */
+#define VT_EVENT_RESIZE		0x0008	/* Resize display */
+#define VT_MAX_EVENT		0x000F
+	unsigned int old;		/* Old console */
+	unsigned int new;		/* New console (if changing) */
+	unsigned int pad[4];		/* Padding for expansion */
+};
+
+#define VT_WAITEVENT	0x560E	/* Wait for an event */
+
 #endif /* _LINUX_VT_H */
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 2f1113467f70..f8c797d57c8b 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -91,7 +91,8 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc);
 #endif
 
 /* vt.c */
-int vt_waitactive(int vt);
+void vt_event_post(unsigned int event, unsigned int old, unsigned int new);
+int vt_waitactive(int n);
 void change_console(struct vc_data *new_vc);
 void reset_vc(struct vc_data *vc);
 extern int unbind_con_driver(const struct consw *csw, int first, int last,
-- 
cgit v1.2.3-71-gd317


From 8d233558cd99a888571bb5a88a74970879e0aba4 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:25 -0700
Subject: vt: remove power stuff from kernel/power

In the past someone gratuitiously borrowed chunks of kernel internal vt
code and dumped them in kernel/power. They have all sorts of deep relations
with the vt code so put them in the vt tree instead

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/vt_ioctl.c | 56 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/vt_kern.h |  1 +
 kernel/power/console.c  | 63 ++++---------------------------------------------
 3 files changed, 62 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index 35ad94e65d0d..d29fbd44bdca 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -16,6 +16,7 @@
 #include <linux/tty.h>
 #include <linux/timer.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/kd.h>
 #include <linux/vt.h>
 #include <linux/string.h>
@@ -1483,3 +1484,58 @@ void change_console(struct vc_data *new_vc)
 
 	complete_change_console(new_vc);
 }
+
+/* Perform a kernel triggered VT switch for suspend/resume */
+
+static int disable_vt_switch;
+
+int vt_move_to_console(unsigned int vt, int alloc)
+{
+	int prev;
+
+	acquire_console_sem();
+	/* Graphics mode - up to X */
+	if (disable_vt_switch) {
+		release_console_sem();
+		return 0;
+	}
+	prev = fg_console;
+
+	if (alloc && vc_allocate(vt)) {
+		/* we can't have a free VC for now. Too bad,
+		 * we don't want to mess the screen for now. */
+		release_console_sem();
+		return -ENOSPC;
+	}
+
+	if (set_console(vt)) {
+		/*
+		 * We're unable to switch to the SUSPEND_CONSOLE.
+		 * Let the calling function know so it can decide
+		 * what to do.
+		 */
+		release_console_sem();
+		return -EIO;
+	}
+	release_console_sem();
+	if (vt_waitactive(vt)) {
+		pr_debug("Suspend: Can't switch VCs.");
+		return -EINTR;
+	}
+	return prev;
+}
+
+/*
+ * Normally during a suspend, we allocate a new console and switch to it.
+ * When we resume, we switch back to the original console.  This switch
+ * can be slow, so on systems where the framebuffer can handle restoration
+ * of video registers anyways, there's little point in doing the console
+ * switch.  This function allows you to disable it by passing it '0'.
+ */
+void pm_set_vt_switch(int do_switch)
+{
+	acquire_console_sem();
+	disable_vt_switch = !do_switch;
+	release_console_sem();
+}
+EXPORT_SYMBOL(pm_set_vt_switch);
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index f8c797d57c8b..5b53efe41b5c 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -117,4 +117,5 @@ struct vt_spawn_console {
 };
 extern struct vt_spawn_console vt_spawn_con;
 
+extern int vt_move_to_console(unsigned int vt, int alloc);
 #endif /* _VT_KERN_H */
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
 #define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
 
 static int orig_fgconsole, orig_kmsg;
-static int disable_vt_switch;
-
-/*
- * Normally during a suspend, we allocate a new console and switch to it.
- * When we resume, we switch back to the original console.  This switch
- * can be slow, so on systems where the framebuffer can handle restoration
- * of video registers anyways, there's little point in doing the console
- * switch.  This function allows you to disable it by passing it '0'.
- */
-void pm_set_vt_switch(int do_switch)
-{
-	acquire_console_sem();
-	disable_vt_switch = !do_switch;
-	release_console_sem();
-}
-EXPORT_SYMBOL(pm_set_vt_switch);
 
 int pm_prepare_console(void)
 {
-	acquire_console_sem();
-
-	if (disable_vt_switch) {
-		release_console_sem();
-		return 0;
-	}
-
-	orig_fgconsole = fg_console;
-
-	if (vc_allocate(SUSPEND_CONSOLE)) {
-	  /* we can't have a free VC for now. Too bad,
-	   * we don't want to mess the screen for now. */
-		release_console_sem();
+	orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
+	if (orig_fgconsole < 0)
 		return 1;
-	}
 
-	if (set_console(SUSPEND_CONSOLE)) {
-		/*
-		 * We're unable to switch to the SUSPEND_CONSOLE.
-		 * Let the calling function know so it can decide
-		 * what to do.
-		 */
-		release_console_sem();
-		return 1;
-	}
-	release_console_sem();
-
-	if (vt_waitactive(SUSPEND_CONSOLE)) {
-		pr_debug("Suspend: Can't switch VCs.");
-		return 1;
-	}
 	orig_kmsg = kmsg_redirect;
 	kmsg_redirect = SUSPEND_CONSOLE;
 	return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
 
 void pm_restore_console(void)
 {
-	acquire_console_sem();
-	if (disable_vt_switch) {
-		release_console_sem();
-		return;
-	}
-	set_console(orig_fgconsole);
-	release_console_sem();
-
-	if (vt_waitactive(orig_fgconsole)) {
-		pr_debug("Resume: Can't switch VCs.");
-		return;
+	if (orig_fgconsole >= 0) {
+		vt_move_to_console(orig_fgconsole, 0);
+		kmsg_redirect = orig_kmsg;
 	}
-
-	kmsg_redirect = orig_kmsg;
 }
 #endif
-- 
cgit v1.2.3-71-gd317


From a5eb56242d1e2d82938a066219ac1cdf0d68adc8 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:25 -0700
Subject: vt: move kernel stuff out of vt.h

We have vt_kern.h for this


Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/vt.h      | 11 -----------
 include/linux/vt_kern.h | 12 ++++++++++++
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vt.h b/include/linux/vt.h
index 89c03a11e193..831daf64b90c 100644
--- a/include/linux/vt.h
+++ b/include/linux/vt.h
@@ -1,17 +1,6 @@
 #ifndef _LINUX_VT_H
 #define _LINUX_VT_H
 
-#ifdef __KERNEL__
-struct notifier_block;
-
-struct vt_notifier_param {
-	struct vc_data *vc;	/* VC on which the update happened */
-	unsigned int c;		/* Printed char */
-};
-
-extern int register_vt_notifier(struct notifier_block *nb);
-extern int unregister_vt_notifier(struct notifier_block *nb);
-#endif
 
 /*
  * These constants are also useful for user-level apps (e.g., VC
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 5b53efe41b5c..c0c4e1103a73 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -13,6 +13,7 @@
 #include <linux/console_struct.h>
 #include <linux/mm.h>
 #include <linux/consolemap.h>
+#include <linux/notifier.h>
 
 /*
  * Presently, a lot of graphics programs do not restore the contents of
@@ -118,4 +119,15 @@ struct vt_spawn_console {
 extern struct vt_spawn_console vt_spawn_con;
 
 extern int vt_move_to_console(unsigned int vt, int alloc);
+
+/* Interfaces for VC notification of character events (for accessibility etc) */
+
+struct vt_notifier_param {
+	struct vc_data *vc;	/* VC on which the update happened */
+	unsigned int c;		/* Printed char */
+};
+
+extern int register_vt_notifier(struct notifier_block *nb);
+extern int unregister_vt_notifier(struct notifier_block *nb);
+
 #endif /* _VT_KERN_H */
-- 
cgit v1.2.3-71-gd317


From d3b5cffcf84a8bdc7073dce4745d67c72629af85 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:26 -0700
Subject: vt: add an activate and lock

X and other graphical interfaces need to be able to flip to a console
and lock it into graphics mode without races.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/vt_ioctl.c | 38 +++++++++++++++++++++++++++++++++++++-
 include/linux/vt.h      |  7 +++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index d29fbd44bdca..0fceb8f4d6f2 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -972,6 +972,41 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		}
 		break;
 
+	case VT_SETACTIVATE:
+	{
+		struct vt_setactivate vsa;
+
+		if (!perm)
+			goto eperm;
+
+		if (copy_from_user(&vsa, (struct vt_setactivate __user *)arg,
+						sizeof(struct vt_setactivate)))
+			return -EFAULT;
+		if (vsa.console == 0 || vsa.console > MAX_NR_CONSOLES)
+			ret = -ENXIO;
+		else {
+			vsa.console--;
+			acquire_console_sem();
+			ret = vc_allocate(vsa.console);
+			if (ret == 0) {
+				struct vc_data *nvc;
+				/* This is safe providing we don't drop the
+				   console sem between vc_allocate and
+				   finishing referencing nvc */
+				nvc = vc_cons[vsa.console].d;
+				nvc->vt_mode = vsa.mode;
+				nvc->vt_mode.frsig = 0;
+				put_pid(nvc->vt_pid);
+				nvc->vt_pid = get_pid(task_pid(current));
+			}
+			release_console_sem();
+			if (ret)
+				break;
+			/* Commence switch and lock */
+			set_console(arg);
+		}
+	}
+
 	/*
 	 * wait until the specified VT has been activated
 	 */
@@ -1342,7 +1377,8 @@ void vc_SAK(struct work_struct *work)
 }
 
 /*
- * Performs the back end of a vt switch
+ * Performs the back end of a vt switch. Called under the console
+ * semaphore.
  */
 static void complete_change_console(struct vc_data *vc)
 {
diff --git a/include/linux/vt.h b/include/linux/vt.h
index 831daf64b90c..7afca0d72139 100644
--- a/include/linux/vt.h
+++ b/include/linux/vt.h
@@ -77,4 +77,11 @@ struct vt_event {
 
 #define VT_WAITEVENT	0x560E	/* Wait for an event */
 
+struct vt_setactivate {
+	unsigned int console;
+	struct vt_mode mode;
+};
+
+#define VT_SETACTIVATE	0x560F	/* Activate and set the mode of a console */
+
 #endif /* _LINUX_VT_H */
-- 
cgit v1.2.3-71-gd317


From a509a7e478e4766114d69f12d19d644ac63e9765 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:26 -0700
Subject: tty: USB does not need the filp argument in the drivers

And indeed none of them use it. Clean this up as it will make moving to a
standard open method rather easier.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/serial/ark3116.c          |  5 ++---
 drivers/usb/serial/belkin_sa.c        |  4 ++--
 drivers/usb/serial/ch341.c            |  5 ++---
 drivers/usb/serial/console.c          |  4 ++--
 drivers/usb/serial/cp210x.c           |  6 ++----
 drivers/usb/serial/cyberjack.c        |  4 ++--
 drivers/usb/serial/cypress_m8.c       |  6 ++----
 drivers/usb/serial/digi_acceleport.c  |  6 ++----
 drivers/usb/serial/empeg.c            |  8 +++-----
 drivers/usb/serial/ftdi_sio.c         |  6 ++----
 drivers/usb/serial/garmin_gps.c       |  3 +--
 drivers/usb/serial/generic.c          |  3 +--
 drivers/usb/serial/io_edgeport.c      |  6 ++----
 drivers/usb/serial/io_ti.c            |  3 +--
 drivers/usb/serial/ipaq.c             |  9 ++-------
 drivers/usb/serial/ipw.c              |  3 +--
 drivers/usb/serial/ir-usb.c           |  6 ++----
 drivers/usb/serial/iuu_phoenix.c      |  5 ++---
 drivers/usb/serial/keyspan.c          |  3 +--
 drivers/usb/serial/keyspan.h          |  3 +--
 drivers/usb/serial/keyspan_pda.c      |  2 +-
 drivers/usb/serial/kl5kusb105.c       | 10 ++--------
 drivers/usb/serial/kobil_sct.c        |  6 ++----
 drivers/usb/serial/mct_u232.c         |  6 ++----
 drivers/usb/serial/mos7720.c          |  3 +--
 drivers/usb/serial/mos7840.c          |  3 +--
 drivers/usb/serial/navman.c           |  3 +--
 drivers/usb/serial/omninet.c          |  6 ++----
 drivers/usb/serial/opticon.c          |  3 +--
 drivers/usb/serial/option.c           |  6 ++----
 drivers/usb/serial/oti6858.c          |  6 ++----
 drivers/usb/serial/pl2303.c           |  5 +----
 drivers/usb/serial/sierra.c           |  3 +--
 drivers/usb/serial/spcp8x5.c          |  5 +----
 drivers/usb/serial/symbolserial.c     |  3 +--
 drivers/usb/serial/ti_usb_3410_5052.c |  6 ++----
 drivers/usb/serial/usb-serial.c       |  2 +-
 drivers/usb/serial/usb_debug.c        |  5 ++---
 drivers/usb/serial/visor.c            |  6 ++----
 drivers/usb/serial/whiteheat.c        |  5 ++---
 include/linux/usb/serial.h            |  7 +++----
 41 files changed, 68 insertions(+), 131 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c
index aec61880f36c..7ddde4ddfb4b 100644
--- a/drivers/usb/serial/ark3116.c
+++ b/drivers/usb/serial/ark3116.c
@@ -318,8 +318,7 @@ static void ark3116_set_termios(struct tty_struct *tty,
 	return;
 }
 
-static int ark3116_open(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp)
+static int ark3116_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct ktermios tmp_termios;
 	struct usb_serial *serial = port->serial;
@@ -334,7 +333,7 @@ static int ark3116_open(struct tty_struct *tty, struct usb_serial_port *port,
 		return -ENOMEM;
 	}
 
-	result = usb_serial_generic_open(tty, port, filp);
+	result = usb_serial_generic_open(tty, port);
 	if (result)
 		goto err_out;
 
diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c
index 7033b031b443..a0467bc61627 100644
--- a/drivers/usb/serial/belkin_sa.c
+++ b/drivers/usb/serial/belkin_sa.c
@@ -92,7 +92,7 @@ static int debug;
 static int  belkin_sa_startup(struct usb_serial *serial);
 static void belkin_sa_release(struct usb_serial *serial);
 static int  belkin_sa_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+			struct usb_serial_port *port);
 static void belkin_sa_close(struct usb_serial_port *port);
 static void belkin_sa_read_int_callback(struct urb *urb);
 static void belkin_sa_set_termios(struct tty_struct *tty,
@@ -213,7 +213,7 @@ static void belkin_sa_release(struct usb_serial *serial)
 
 
 static int  belkin_sa_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+					struct usb_serial_port *port)
 {
 	int retval = 0;
 
diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c
index 2830766f5b39..8c894a7d5dcf 100644
--- a/drivers/usb/serial/ch341.c
+++ b/drivers/usb/serial/ch341.c
@@ -300,8 +300,7 @@ static void ch341_close(struct usb_serial_port *port)
 
 
 /* open this device, set default parameters */
-static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct ch341_private *priv = usb_get_serial_port_data(serial->port[0]);
@@ -333,7 +332,7 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
 		return -EPROTO;
 	}
 
-	r = usb_serial_generic_open(tty, port, filp);
+	r = usb_serial_generic_open(tty, port);
 
 out:	return r;
 }
diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c
index 0e4f2e41ace5..be086e4c76b6 100644
--- a/drivers/usb/serial/console.c
+++ b/drivers/usb/serial/console.c
@@ -150,9 +150,9 @@ static int usb_console_setup(struct console *co, char *options)
 		/* only call the device specific open if this
 		 * is the first time the port is opened */
 		if (serial->type->open)
-			retval = serial->type->open(NULL, port, NULL);
+			retval = serial->type->open(NULL, port);
 		else
-			retval = usb_serial_generic_open(NULL, port, NULL);
+			retval = usb_serial_generic_open(NULL, port);
 
 		if (retval) {
 			err("could not open USB console port");
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index b5275c4a8eed..4a208fe85bc9 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -33,8 +33,7 @@
 /*
  * Function Prototypes
  */
-static int cp210x_open(struct tty_struct *, struct usb_serial_port *,
-							struct file *);
+static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *);
 static void cp210x_cleanup(struct usb_serial_port *);
 static void cp210x_close(struct usb_serial_port *);
 static void cp210x_get_termios(struct tty_struct *,
@@ -368,8 +367,7 @@ static unsigned int cp210x_quantise_baudrate(unsigned int baud) {
 	return baud;
 }
 
-static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	int result;
diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index 336523fd7366..b0f6402a91ca 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -61,7 +61,7 @@ static int cyberjack_startup(struct usb_serial *serial);
 static void cyberjack_disconnect(struct usb_serial *serial);
 static void cyberjack_release(struct usb_serial *serial);
 static int  cyberjack_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+	struct usb_serial_port *port);
 static void cyberjack_close(struct usb_serial_port *port);
 static int cyberjack_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
@@ -173,7 +173,7 @@ static void cyberjack_release(struct usb_serial *serial)
 }
 
 static int  cyberjack_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+					struct usb_serial_port *port)
 {
 	struct cyberjack_private *priv;
 	unsigned long flags;
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index 59adfe123110..df1adb9a4367 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -172,8 +172,7 @@ static int  cypress_earthmate_startup(struct usb_serial *serial);
 static int  cypress_hidcom_startup(struct usb_serial *serial);
 static int  cypress_ca42v2_startup(struct usb_serial *serial);
 static void cypress_release(struct usb_serial *serial);
-static int  cypress_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static int  cypress_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void cypress_close(struct usb_serial_port *port);
 static void cypress_dtr_rts(struct usb_serial_port *port, int on);
 static int  cypress_write(struct tty_struct *tty, struct usb_serial_port *port,
@@ -633,8 +632,7 @@ static void cypress_release(struct usb_serial *serial)
 }
 
 
-static int cypress_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct cypress_private *priv = usb_get_serial_port_data(port);
 	struct usb_serial *serial = port->serial;
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index f4808091c47c..ab3dd991586b 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -453,8 +453,7 @@ static int digi_write(struct tty_struct *tty, struct usb_serial_port *port,
 static void digi_write_bulk_callback(struct urb *urb);
 static int digi_write_room(struct tty_struct *tty);
 static int digi_chars_in_buffer(struct tty_struct *tty);
-static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
-	struct file *filp);
+static int digi_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void digi_close(struct usb_serial_port *port);
 static int digi_carrier_raised(struct usb_serial_port *port);
 static void digi_dtr_rts(struct usb_serial_port *port, int on);
@@ -1347,8 +1346,7 @@ static int digi_carrier_raised(struct usb_serial_port *port)
 	return 0;
 }
 
-static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static int digi_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	int ret;
 	unsigned char buf[32];
diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c
index 80cb3471adbe..da559a773b51 100644
--- a/drivers/usb/serial/empeg.c
+++ b/drivers/usb/serial/empeg.c
@@ -79,8 +79,7 @@ static int debug;
 #define EMPEG_PRODUCT_ID		0x0001
 
 /* function prototypes for an empeg-car player */
-static int  empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
-						struct file *filp);
+static int  empeg_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void empeg_close(struct usb_serial_port *port);
 static int  empeg_write(struct tty_struct *tty, struct usb_serial_port *port,
 						const unsigned char *buf,
@@ -142,8 +141,7 @@ static int		bytes_out;
 /******************************************************************************
  * Empeg specific driver functions
  ******************************************************************************/
-static int empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static int empeg_open(struct tty_struct *tty,struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	int result = 0;
@@ -151,7 +149,7 @@ static int empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
 	dbg("%s - port %d", __func__, port->number);
 
 	/* Force default termio settings */
-	empeg_set_termios(tty, port, NULL) ;
+	empeg_set_termios(tty, port, NULL);
 
 	bytes_in = 0;
 	bytes_out = 0;
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 8fec5d4455c9..76a17f915eef 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -747,8 +747,7 @@ static int  ftdi_sio_probe(struct usb_serial *serial,
 					const struct usb_device_id *id);
 static int  ftdi_sio_port_probe(struct usb_serial_port *port);
 static int  ftdi_sio_port_remove(struct usb_serial_port *port);
-static int  ftdi_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static int  ftdi_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void ftdi_close(struct usb_serial_port *port);
 static void ftdi_dtr_rts(struct usb_serial_port *port, int on);
 static int  ftdi_write(struct tty_struct *tty, struct usb_serial_port *port,
@@ -1680,8 +1679,7 @@ static int ftdi_sio_port_remove(struct usb_serial_port *port)
 	return 0;
 }
 
-static int ftdi_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int ftdi_open(struct tty_struct *tty, struct usb_serial_port *port)
 { /* ftdi_open */
 	struct usb_device *dev = port->serial->dev;
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index 8839f1c70b7f..20432d345529 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -933,8 +933,7 @@ static int garmin_init_session(struct usb_serial_port *port)
 
 
-static int garmin_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int garmin_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	unsigned long flags;
 	int status = 0;
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index ce57f6a32bdf..d9398e9f30ce 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -114,8 +114,7 @@ void usb_serial_generic_deregister(void)
 #endif
 }
 
-int usb_serial_generic_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+int usb_serial_generic_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	int result = 0;
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index 0191693625d6..dc0f832657e6 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -205,8 +205,7 @@ static void edge_bulk_out_data_callback(struct urb *urb);
 static void edge_bulk_out_cmd_callback(struct urb *urb);
 
 /* function prototypes for the usbserial callbacks */
-static int edge_open(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp);
+static int edge_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void edge_close(struct usb_serial_port *port);
 static int edge_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
@@ -852,8 +851,7 @@ static void edge_bulk_out_cmd_callback(struct urb *urb)
  *	If successful, we return 0
  *	Otherwise we return a negative error number.
  *****************************************************************************/
-static int edge_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int edge_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
 	struct usb_serial *serial;
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index e8bc42f92e79..d4cc0f7af400 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -1831,8 +1831,7 @@ static void edge_bulk_out_callback(struct urb *urb)
 	tty_kref_put(tty);
 }
 
-static int edge_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int edge_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
 	struct edgeport_serial *edge_serial;
diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c
index 2545d45ce16f..24fcc64b837d 100644
--- a/drivers/usb/serial/ipaq.c
+++ b/drivers/usb/serial/ipaq.c
@@ -75,7 +75,7 @@ static int initial_wait;
 
 /* Function prototypes for an ipaq */
 static int  ipaq_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+			struct usb_serial_port *port);
 static void ipaq_close(struct usb_serial_port *port);
 static int  ipaq_calc_num_ports(struct usb_serial *serial);
 static int  ipaq_startup(struct usb_serial *serial);
@@ -587,7 +587,7 @@ static int		bytes_in;
 static int		bytes_out;
 
 static int ipaq_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+			struct usb_serial_port *port)
 {
 	struct usb_serial	*serial = port->serial;
 	struct ipaq_private	*priv;
@@ -628,11 +628,6 @@ static int ipaq_open(struct tty_struct *tty,
 		priv->free_len += PACKET_SIZE;
 	}
 
-	if (tty) {
-		/* FIXME: These two are bogus */
-		tty->raw = 1;
-		tty->real_raw = 1;
-	}
 	/*
 	 * Lose the small buffers usbserial provides. Make larger ones.
 	 */
diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c
index 29ad038b9c8d..727d323f092a 100644
--- a/drivers/usb/serial/ipw.c
+++ b/drivers/usb/serial/ipw.c
@@ -193,8 +193,7 @@ static void ipw_read_bulk_callback(struct urb *urb)
 	return;
 }
 
-static int ipw_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int ipw_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_device *dev = port->serial->dev;
 	u8 buf_flow_static[16] = IPW_BYTES_FLOWINIT;
diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c
index 66009b6b763a..95d8d26b9a44 100644
--- a/drivers/usb/serial/ir-usb.c
+++ b/drivers/usb/serial/ir-usb.c
@@ -86,8 +86,7 @@ static int buffer_size;
 static int xbof = -1;
 
 static int  ir_startup (struct usb_serial *serial);
-static int  ir_open(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filep);
+static int  ir_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void ir_close(struct usb_serial_port *port);
 static int  ir_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
@@ -296,8 +295,7 @@ static int ir_startup(struct usb_serial *serial)
 	return 0;
 }
 
-static int ir_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int ir_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	char *buffer;
 	int result = 0;
diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index 96873a7a32b0..f3f9be0469c5 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -1018,8 +1018,7 @@ static void iuu_close(struct usb_serial_port *port)
 	}
 }
 
-static int iuu_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int iuu_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	u8 *buf;
@@ -1077,7 +1076,7 @@ static int iuu_open(struct tty_struct *tty,
 		tty->termios->c_iflag = 0;
 		priv->termios_initialized = 1;
 		priv->poll = 0;
-	 }
+	}
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	/* initialize writebuf */
diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c
index 2594b8743d3f..f8c4b07033ff 100644
--- a/drivers/usb/serial/keyspan.c
+++ b/drivers/usb/serial/keyspan.c
@@ -1209,8 +1209,7 @@ static int keyspan_write_room(struct tty_struct *tty)
 }
 
 
-static int keyspan_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int keyspan_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct keyspan_port_private 	*p_priv;
 	struct keyspan_serial_private 	*s_priv;
diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h
index 3107ed15af64..30771e5b3973 100644
--- a/drivers/usb/serial/keyspan.h
+++ b/drivers/usb/serial/keyspan.h
@@ -36,8 +36,7 @@
 
 /* Function prototypes for Keyspan serial converter */
 static int  keyspan_open		(struct tty_struct *tty,
-					 struct usb_serial_port *port,
-					 struct file *filp);
+					 struct usb_serial_port *port);
 static void keyspan_close		(struct usb_serial_port *port);
 static void keyspan_dtr_rts		(struct usb_serial_port *port, int on);
 static int  keyspan_startup		(struct usb_serial *serial);
diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c
index d0b12e40c2b1..257c16cc6b2a 100644
--- a/drivers/usb/serial/keyspan_pda.c
+++ b/drivers/usb/serial/keyspan_pda.c
@@ -681,7 +681,7 @@ static int keyspan_pda_carrier_raised(struct usb_serial_port *port)
 
 
 static int keyspan_pda_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+					struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	unsigned char room;
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index 0f44bb8e8d4f..a61673133d7d 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -75,8 +75,7 @@ static int debug;
 static int  klsi_105_startup(struct usb_serial *serial);
 static void klsi_105_disconnect(struct usb_serial *serial);
 static void klsi_105_release(struct usb_serial *serial);
-static int  klsi_105_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static int  klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void klsi_105_close(struct usb_serial_port *port);
 static int  klsi_105_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
@@ -358,8 +357,7 @@ static void klsi_105_release(struct usb_serial *serial)
 	}
 } /* klsi_105_release */
 
-static int  klsi_105_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int  klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct klsi_105_private *priv = usb_get_serial_port_data(port);
 	int retval = 0;
@@ -371,10 +369,6 @@ static int  klsi_105_open(struct tty_struct *tty,
 
 	dbg("%s port %d", __func__, port->number);
 
-	/* force low_latency on so that our tty_push actually forces
-	 * the data through
-	 * tty->low_latency = 1; */
-
 	/* Do a defined restart:
 	 * Set up sane default baud rate and send the 'READ_ON'
 	 * vendor command.
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index 6db0e561f680..97901578343a 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -70,8 +70,7 @@ static int debug;
 /* Function prototypes */
 static int  kobil_startup(struct usb_serial *serial);
 static void kobil_release(struct usb_serial *serial);
-static int  kobil_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static int  kobil_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void kobil_close(struct usb_serial_port *port);
 static int  kobil_write(struct tty_struct *tty, struct usb_serial_port *port,
 			 const unsigned char *buf, int count);
@@ -211,8 +210,7 @@ static void kobil_release(struct usb_serial *serial)
 }
 
 
-static int kobil_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int kobil_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	int result = 0;
 	struct kobil_private *priv;
diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c
index d8825e159aa5..d501aefa2628 100644
--- a/drivers/usb/serial/mct_u232.c
+++ b/drivers/usb/serial/mct_u232.c
@@ -93,8 +93,7 @@ static int debug;
  */
 static int  mct_u232_startup(struct usb_serial *serial);
 static void mct_u232_release(struct usb_serial *serial);
-static int  mct_u232_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static int  mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void mct_u232_close(struct usb_serial_port *port);
 static void mct_u232_dtr_rts(struct usb_serial_port *port, int on);
 static void mct_u232_read_int_callback(struct urb *urb);
@@ -421,8 +420,7 @@ static void mct_u232_release(struct usb_serial *serial)
 	}
 } /* mct_u232_release */
 
-static int  mct_u232_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int  mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct mct_u232_private *priv = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 4342a8a0eac9..763e32a44be0 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -319,8 +319,7 @@ static int send_mos_cmd(struct usb_serial *serial, __u8 request, __u16 value,
 	return status;
 }
 
-static int mos7720_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int mos7720_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial;
 	struct usb_serial_port *port0;
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index b93f0f992d9f..f11abf52be7d 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -824,8 +824,7 @@ static int mos7840_serial_probe(struct usb_serial *serial,
  *	Otherwise we return a negative error number.
  *****************************************************************************/
 
-static int mos7840_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int mos7840_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	int response;
 	int j;
diff --git a/drivers/usb/serial/navman.c b/drivers/usb/serial/navman.c
index f5f3751a888c..5ceaa4c6be09 100644
--- a/drivers/usb/serial/navman.c
+++ b/drivers/usb/serial/navman.c
@@ -80,8 +80,7 @@ exit:
 			__func__, result);
 }
 
-static int navman_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int navman_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	int result = 0;
 
diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c
index 56857ddbd70b..062265038bf0 100644
--- a/drivers/usb/serial/omninet.c
+++ b/drivers/usb/serial/omninet.c
@@ -64,8 +64,7 @@ static int debug;
 #define BT_IGNITIONPRO_ID	0x2000
 
 /* function prototypes */
-static int  omninet_open(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp);
+static int  omninet_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void omninet_close(struct usb_serial_port *port);
 static void omninet_read_bulk_callback(struct urb *urb);
 static void omninet_write_bulk_callback(struct urb *urb);
@@ -163,8 +162,7 @@ static int omninet_attach(struct usb_serial *serial)
 	return 0;
 }
 
-static int omninet_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int omninet_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial	*serial = port->serial;
 	struct usb_serial_port	*wport;
diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index 336bba79ad32..1085a577c5c1 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c
@@ -144,8 +144,7 @@ exit:
 	spin_unlock(&priv->lock);
 }
 
-static int opticon_open(struct tty_struct *tty, struct usb_serial_port *port,
-			struct file *filp)
+static int opticon_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct opticon_private *priv = usb_get_serial_data(port->serial);
 	unsigned long flags;
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index c784ddbe7b61..fe47051dbef2 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -45,8 +45,7 @@
 /* Function prototypes */
 static int  option_probe(struct usb_serial *serial,
 			const struct usb_device_id *id);
-static int  option_open(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp);
+static int  option_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void option_close(struct usb_serial_port *port);
 static void option_dtr_rts(struct usb_serial_port *port, int on);
 
@@ -961,8 +960,7 @@ static int option_chars_in_buffer(struct tty_struct *tty)
 	return data_len;
 }
 
-static int option_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int option_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct option_port_private *portdata;
 	int i, err;
diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c
index 3cece27325e7..e90f88a3b040 100644
--- a/drivers/usb/serial/oti6858.c
+++ b/drivers/usb/serial/oti6858.c
@@ -141,8 +141,7 @@ struct oti6858_control_pkt {
 	  && ((a)->frame_fmt == (priv)->pending_setup.frame_fmt))
 
 /* function prototypes */
-static int oti6858_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static int oti6858_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void oti6858_close(struct usb_serial_port *port);
 static void oti6858_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
@@ -566,8 +565,7 @@ static void oti6858_set_termios(struct tty_struct *tty,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-static int oti6858_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int oti6858_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct oti6858_private *priv = usb_get_serial_port_data(port);
 	struct ktermios tmp_termios;
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 3e86815b2705..a63ea99936f7 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -691,8 +691,7 @@ static void pl2303_close(struct usb_serial_port *port)
 
 }
 
-static int pl2303_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int pl2303_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct ktermios tmp_termios;
 	struct usb_serial *serial = port->serial;
@@ -714,8 +713,6 @@ static int pl2303_open(struct tty_struct *tty,
 	if (tty)
 		pl2303_set_termios(tty, port, &tmp_termios);
 
-	/* FIXME: need to assert RTS and DTR if CRTSCTS off */
-
 	dbg("%s - submitting read urb", __func__);
 	port->read_urb->dev = serial->dev;
 	result = usb_submit_urb(port->read_urb, GFP_KERNEL);
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index f48d05e0acc1..55391bbe1230 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -734,8 +734,7 @@ static void sierra_close(struct usb_serial_port *port)
 	}
 }
 
-static int sierra_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int sierra_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct sierra_port_private *portdata;
 	struct usb_serial *serial = port->serial;
diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c
index 3c249d8e8b8e..8b312a05a353 100644
--- a/drivers/usb/serial/spcp8x5.c
+++ b/drivers/usb/serial/spcp8x5.c
@@ -623,8 +623,7 @@ static void spcp8x5_set_termios(struct tty_struct *tty,
 
 /* open the serial port. do some usb system call. set termios and get the line
  * status of the device. then submit the read urb */
-static int spcp8x5_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int spcp8x5_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct ktermios tmp_termios;
 	struct usb_serial *serial = port->serial;
@@ -658,8 +657,6 @@ static int spcp8x5_open(struct tty_struct *tty,
 	priv->line_status = status & 0xf0 ;
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* FIXME: need to assert RTS and DTR if CRTSCTS off */
-
 	dbg("%s - submitting read urb", __func__);
 	port->read_urb->dev = serial->dev;
 	ret = usb_submit_urb(port->read_urb, GFP_KERNEL);
diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c
index 6157fac9366b..cb7e95f9fcbf 100644
--- a/drivers/usb/serial/symbolserial.c
+++ b/drivers/usb/serial/symbolserial.c
@@ -124,8 +124,7 @@ exit:
 	spin_unlock(&priv->lock);
 }
 
-static int symbol_open(struct tty_struct *tty, struct usb_serial_port *port,
-			struct file *filp)
+static int symbol_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct symbol_private *priv = usb_get_serial_data(port->serial);
 	unsigned long flags;
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index 3bc609fe2242..1e9dc8821698 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -98,8 +98,7 @@ struct ti_device {
 
 static int ti_startup(struct usb_serial *serial);
 static void ti_release(struct usb_serial *serial);
-static int ti_open(struct tty_struct *tty, struct usb_serial_port *port,
-		struct file *file);
+static int ti_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void ti_close(struct usb_serial_port *port);
 static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *data, int count);
@@ -492,8 +491,7 @@ static void ti_release(struct usb_serial *serial)
 }
 
 
-static int ti_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *file)
+static int ti_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct ti_port *tport = usb_get_serial_port_data(port);
 	struct ti_device *tdev;
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 31c7a0939b98..3dda6841e724 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -244,7 +244,7 @@ static int serial_open (struct tty_struct *tty, struct file *filp)
 
 		/* only call the device specific open if this
 		 * is the first time the port is opened */
-		retval = serial->type->open(tty, port, filp);
+		retval = serial->type->open(tty, port);
 		if (retval)
 			goto bailout_interface_put;
 		mutex_unlock(&serial->disc_mutex);
diff --git a/drivers/usb/serial/usb_debug.c b/drivers/usb/serial/usb_debug.c
index 614800972dc3..7b5bfc4edd3d 100644
--- a/drivers/usb/serial/usb_debug.c
+++ b/drivers/usb/serial/usb_debug.c
@@ -43,11 +43,10 @@ static struct usb_driver debug_driver = {
 	.no_dynamic_id = 	1,
 };
 
-static int usb_debug_open(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp)
+static int usb_debug_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	port->bulk_out_size = USB_DEBUG_MAX_PACKET_SIZE;
-	return usb_serial_generic_open(tty, port, filp);
+	return usb_serial_generic_open(tty, port);
 }
 
 /* This HW really does not support a serial break, so one will be
diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c
index f5d0f64dcc52..1aa5d20a5d99 100644
--- a/drivers/usb/serial/visor.c
+++ b/drivers/usb/serial/visor.c
@@ -36,8 +36,7 @@
 #define DRIVER_DESC "USB HandSpring Visor / Palm OS driver"
 
 /* function prototypes for a handspring visor */
-static int  visor_open(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp);
+static int  visor_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void visor_close(struct usb_serial_port *port);
 static int  visor_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
@@ -273,8 +272,7 @@ static int stats;
 /******************************************************************************
  * Handspring Visor specific driver functions
  ******************************************************************************/
-static int visor_open(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp)
+static int visor_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct visor_private *priv = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index 8d126dd7a02e..81f2ae505966 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -146,7 +146,7 @@ static int  whiteheat_firmware_attach(struct usb_serial *serial);
 static int  whiteheat_attach(struct usb_serial *serial);
 static void whiteheat_release(struct usb_serial *serial);
 static int  whiteheat_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+			struct usb_serial_port *port);
 static void whiteheat_close(struct usb_serial_port *port);
 static int  whiteheat_write(struct tty_struct *tty,
 			struct usb_serial_port *port,
@@ -659,8 +659,7 @@ static void whiteheat_release(struct usb_serial *serial)
 	return;
 }
 
-static int whiteheat_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static int whiteheat_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	int		retval = 0;
 
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 0ec50ba62139..1cbaf4a6b449 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -238,9 +238,8 @@ struct usb_serial_driver {
 	int (*resume)(struct usb_serial *serial);
 
 	/* serial function calls */
-	/* Called by console with tty = NULL and by tty */
-	int  (*open)(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+	/* Called by console and by the tty layer */
+	int  (*open)(struct tty_struct *tty, struct usb_serial_port *port);
 	void (*close)(struct usb_serial_port *port);
 	int  (*write)(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
@@ -300,7 +299,7 @@ static inline void usb_serial_console_disconnect(struct usb_serial *serial) {}
 extern struct usb_serial *usb_serial_get_by_index(unsigned int minor);
 extern void usb_serial_put(struct usb_serial *serial);
 extern int usb_serial_generic_open(struct tty_struct *tty,
-		struct usb_serial_port *port, struct file *filp);
+	struct usb_serial_port *port);
 extern int usb_serial_generic_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
 extern void usb_serial_generic_close(struct usb_serial_port *port);
-- 
cgit v1.2.3-71-gd317


From ebd2c8f6d2ec4012c267ecb95e72a57b8355a705 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:28 -0700
Subject: serial: kill off uart_info

We moved this into uart_state, now move the fields out of the separate
structure and kill it off.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/mn10300/kernel/asm-offsets.c       |   6 +-
 drivers/serial/21285.c                  |   8 +-
 drivers/serial/8250.c                   |  10 +-
 drivers/serial/amba-pl010.c             |   6 +-
 drivers/serial/amba-pl011.c             |   6 +-
 drivers/serial/atmel_serial.c           |  14 +-
 drivers/serial/bfin_5xx.c               |  20 +--
 drivers/serial/bfin_sport_uart.c        |   6 +-
 drivers/serial/clps711x.c               |   4 +-
 drivers/serial/cpm_uart/cpm_uart_core.c |   2 +-
 drivers/serial/dz.c                     |   6 +-
 drivers/serial/icom.c                   |  16 +-
 drivers/serial/imx.c                    |  16 +-
 drivers/serial/ioc3_serial.c            |  52 +++---
 drivers/serial/ioc4_serial.c            |  66 ++++----
 drivers/serial/ip22zilog.c              |  14 +-
 drivers/serial/jsm/jsm_neo.c            |   2 +-
 drivers/serial/jsm/jsm_tty.c            |  20 +--
 drivers/serial/m32r_sio.c               |   6 +-
 drivers/serial/max3100.c                |  16 +-
 drivers/serial/mcf.c                    |   4 +-
 drivers/serial/mpc52xx_uart.c           |   4 +-
 drivers/serial/mpsc.c                   |   4 +-
 drivers/serial/msm_serial.c             |   6 +-
 drivers/serial/mux.c                    |   4 +-
 drivers/serial/netx-serial.c            |   6 +-
 drivers/serial/nwpserial.c              |   4 +-
 drivers/serial/pmac_zilog.c             |  12 +-
 drivers/serial/pnx8xxx_uart.c           |   8 +-
 drivers/serial/pxa.c                    |   6 +-
 drivers/serial/sa1100.c                 |   8 +-
 drivers/serial/samsung.c                |   8 +-
 drivers/serial/sb1250-duart.c           |   6 +-
 drivers/serial/sc26xx.c                 |  10 +-
 drivers/serial/serial_core.c            | 286 ++++++++++++++++----------------
 drivers/serial/serial_ks8695.c          |   6 +-
 drivers/serial/serial_lh7a40x.c         |   6 +-
 drivers/serial/serial_txx9.c            |   4 +-
 drivers/serial/sh-sci.c                 |  10 +-
 drivers/serial/sn_console.c             |  22 +--
 drivers/serial/sunhv.c                  |   8 +-
 drivers/serial/sunsab.c                 |  10 +-
 drivers/serial/sunsu.c                  |   6 +-
 drivers/serial/sunzilog.c               |  14 +-
 drivers/serial/timbuart.c               |  10 +-
 drivers/serial/uartlite.c               |   6 +-
 drivers/serial/ucc_uart.c               |   4 +-
 drivers/serial/vr41xx_siu.c             |   6 +-
 drivers/serial/zs.c                     |   6 +-
 include/linux/serial_core.h             |  64 ++++---
 50 files changed, 422 insertions(+), 432 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c
index 2646fcbd7d89..82b40079ad76 100644
--- a/arch/mn10300/kernel/asm-offsets.c
+++ b/arch/mn10300/kernel/asm-offsets.c
@@ -95,7 +95,7 @@ void foo(void)
 	OFFSET(__iobase,		mn10300_serial_port, _iobase);
 
 	DEFINE(__UART_XMIT_SIZE,	UART_XMIT_SIZE);
-	OFFSET(__xmit_buffer,		uart_info, xmit.buf);
-	OFFSET(__xmit_head,		uart_info, xmit.head);
-	OFFSET(__xmit_tail,		uart_info, xmit.tail);
+	OFFSET(__xmit_buffer,		uart_state, xmit.buf);
+	OFFSET(__xmit_head,		uart_state, xmit.head);
+	OFFSET(__xmit_tail,		uart_state, xmit.tail);
 }
diff --git a/drivers/serial/21285.c b/drivers/serial/21285.c
index cb6d85d7ff43..1e3d19397a59 100644
--- a/drivers/serial/21285.c
+++ b/drivers/serial/21285.c
@@ -86,7 +86,7 @@ static void serial21285_enable_ms(struct uart_port *port)
 static irqreturn_t serial21285_rx_chars(int irq, void *dev_id)
 {
 	struct uart_port *port = dev_id;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	unsigned int status, ch, flag, rxs, max_count = 256;
 
 	status = *CSR_UARTFLG;
@@ -124,7 +124,7 @@ static irqreturn_t serial21285_rx_chars(int irq, void *dev_id)
 static irqreturn_t serial21285_tx_chars(int irq, void *dev_id)
 {
 	struct uart_port *port = dev_id;
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 	int count = 256;
 
 	if (port->x_char) {
@@ -235,8 +235,8 @@ serial21285_set_termios(struct uart_port *port, struct ktermios *termios,
 	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16); 
 	quot = uart_get_divisor(port, baud);
 
-	if (port->info && port->info->port.tty) {
-		struct tty_struct *tty = port->info->port.tty;
+	if (port->state && port->state->port.tty) {
+		struct tty_struct *tty = port->state->port.tty;
 		unsigned int b = port->uartclk / (16 * quot);
 		tty_encode_baud_rate(tty, b, b);
 	}
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 1fd4894d9b51..e415c5eca599 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1382,7 +1382,7 @@ static void serial8250_enable_ms(struct uart_port *port)
 static void
 receive_chars(struct uart_8250_port *up, unsigned int *status)
 {
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	unsigned char ch, lsr = *status;
 	int max_count = 256;
 	char flag;
@@ -1457,7 +1457,7 @@ ignore_char:
 
 static void transmit_chars(struct uart_8250_port *up)
 {
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 	int count;
 
 	if (up->port.x_char) {
@@ -1500,7 +1500,7 @@ static unsigned int check_modem_status(struct uart_8250_port *up)
 	status |= up->msr_saved_flags;
 	up->msr_saved_flags = 0;
 	if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI &&
-	    up->port.info != NULL) {
+	    up->port.state != NULL) {
 		if (status & UART_MSR_TERI)
 			up->port.icount.rng++;
 		if (status & UART_MSR_DDSR)
@@ -1510,7 +1510,7 @@ static unsigned int check_modem_status(struct uart_8250_port *up)
 		if (status & UART_MSR_DCTS)
 			uart_handle_cts_change(&up->port, status & UART_MSR_CTS);
 
-		wake_up_interruptible(&up->port.info->delta_msr_wait);
+		wake_up_interruptible(&up->port.state->delta_msr_wait);
 	}
 
 	return status;
@@ -1764,7 +1764,7 @@ static void serial8250_backup_timeout(unsigned long data)
 	up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
 	spin_unlock_irqrestore(&up->port.lock, flags);
 	if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) &&
-	    (!uart_circ_empty(&up->port.info->xmit) || up->port.x_char) &&
+	    (!uart_circ_empty(&up->port.state->xmit) || up->port.x_char) &&
 	    (lsr & UART_LSR_THRE)) {
 		iir &= ~(UART_IIR_ID | UART_IIR_NO_INT);
 		iir |= UART_IIR_THRI;
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index 58a4879c7e48..39032413d4a1 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -117,7 +117,7 @@ static void pl010_enable_ms(struct uart_port *port)
 
 static void pl010_rx_chars(struct uart_amba_port *uap)
 {
-	struct tty_struct *tty = uap->port.info->port.tty;
+	struct tty_struct *tty = uap->port.state->port.tty;
 	unsigned int status, ch, flag, rsr, max_count = 256;
 
 	status = readb(uap->port.membase + UART01x_FR);
@@ -172,7 +172,7 @@ static void pl010_rx_chars(struct uart_amba_port *uap)
 
 static void pl010_tx_chars(struct uart_amba_port *uap)
 {
-	struct circ_buf *xmit = &uap->port.info->xmit;
+	struct circ_buf *xmit = &uap->port.state->xmit;
 	int count;
 
 	if (uap->port.x_char) {
@@ -225,7 +225,7 @@ static void pl010_modem_status(struct uart_amba_port *uap)
 	if (delta & UART01x_FR_CTS)
 		uart_handle_cts_change(&uap->port, status & UART01x_FR_CTS);
 
-	wake_up_interruptible(&uap->port.info->delta_msr_wait);
+	wake_up_interruptible(&uap->port.state->delta_msr_wait);
 }
 
 static irqreturn_t pl010_int(int irq, void *dev_id)
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 72ba0c6d3551..ef82a34baf0f 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -124,7 +124,7 @@ static void pl011_enable_ms(struct uart_port *port)
 
 static void pl011_rx_chars(struct uart_amba_port *uap)
 {
-	struct tty_struct *tty = uap->port.info->port.tty;
+	struct tty_struct *tty = uap->port.state->port.tty;
 	unsigned int status, ch, flag, max_count = 256;
 
 	status = readw(uap->port.membase + UART01x_FR);
@@ -175,7 +175,7 @@ static void pl011_rx_chars(struct uart_amba_port *uap)
 
 static void pl011_tx_chars(struct uart_amba_port *uap)
 {
-	struct circ_buf *xmit = &uap->port.info->xmit;
+	struct circ_buf *xmit = &uap->port.state->xmit;
 	int count;
 
 	if (uap->port.x_char) {
@@ -226,7 +226,7 @@ static void pl011_modem_status(struct uart_amba_port *uap)
 	if (delta & UART01x_FR_CTS)
 		uart_handle_cts_change(&uap->port, status & UART01x_FR_CTS);
 
-	wake_up_interruptible(&uap->port.info->delta_msr_wait);
+	wake_up_interruptible(&uap->port.state->delta_msr_wait);
 }
 
 static irqreturn_t pl011_int(int irq, void *dev_id)
diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c
index 607d43a31048..963e3c12af41 100644
--- a/drivers/serial/atmel_serial.c
+++ b/drivers/serial/atmel_serial.c
@@ -427,7 +427,7 @@ static void atmel_rx_chars(struct uart_port *port)
  */
 static void atmel_tx_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	if (port->x_char && UART_GET_CSR(port) & ATMEL_US_TXRDY) {
 		UART_PUT_CHAR(port, port->x_char);
@@ -560,7 +560,7 @@ static irqreturn_t atmel_interrupt(int irq, void *dev_id)
 static void atmel_tx_dma(struct uart_port *port)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 	struct atmel_dma_buffer *pdc = &atmel_port->pdc_tx;
 	int count;
 
@@ -663,14 +663,14 @@ static void atmel_rx_from_ring(struct uart_port *port)
 	 * uart_start(), which takes the lock.
 	 */
 	spin_unlock(&port->lock);
-	tty_flip_buffer_push(port->info->port.tty);
+	tty_flip_buffer_push(port->state->port.tty);
 	spin_lock(&port->lock);
 }
 
 static void atmel_rx_from_dma(struct uart_port *port)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	struct atmel_dma_buffer *pdc;
 	int rx_idx = atmel_port->pdc_rx_idx;
 	unsigned int head;
@@ -776,7 +776,7 @@ static void atmel_tasklet_func(unsigned long data)
 		if (status_change & ATMEL_US_CTS)
 			uart_handle_cts_change(port, !(status & ATMEL_US_CTS));
 
-		wake_up_interruptible(&port->info->delta_msr_wait);
+		wake_up_interruptible(&port->state->delta_msr_wait);
 
 		atmel_port->irq_status_prev = status;
 	}
@@ -795,7 +795,7 @@ static void atmel_tasklet_func(unsigned long data)
 static int atmel_startup(struct uart_port *port)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	int retval;
 
 	/*
@@ -854,7 +854,7 @@ static int atmel_startup(struct uart_port *port)
 	}
 	if (atmel_use_dma_tx(port)) {
 		struct atmel_dma_buffer *pdc = &atmel_port->pdc_tx;
-		struct circ_buf *xmit = &port->info->xmit;
+		struct circ_buf *xmit = &port->state->xmit;
 
 		pdc->buf = xmit->buf;
 		pdc->dma_addr = dma_map_single(port->dev,
diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 4fff4e524034..50abb7e557f4 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -144,7 +144,7 @@ static void bfin_serial_stop_tx(struct uart_port *port)
 {
 	struct bfin_serial_port *uart = (struct bfin_serial_port *)port;
 #ifdef CONFIG_SERIAL_BFIN_DMA
-	struct circ_buf *xmit = &uart->port.info->xmit;
+	struct circ_buf *xmit = &uart->port.state->xmit;
 #endif
 
 	while (!(UART_GET_LSR(uart) & TEMT))
@@ -171,7 +171,7 @@ static void bfin_serial_stop_tx(struct uart_port *port)
 static void bfin_serial_start_tx(struct uart_port *port)
 {
 	struct bfin_serial_port *uart = (struct bfin_serial_port *)port;
-	struct tty_struct *tty = uart->port.info->port.tty;
+	struct tty_struct *tty = uart->port.state->port.tty;
 
 #ifdef CONFIG_SERIAL_BFIN_HARD_CTSRTS
 	if (uart->scts && !(bfin_serial_get_mctrl(&uart->port) & TIOCM_CTS)) {
@@ -243,10 +243,10 @@ static void bfin_serial_rx_chars(struct bfin_serial_port *uart)
 			return;
 		}
 
-	if (!uart->port.info || !uart->port.info->port.tty)
+	if (!uart->port.state || !uart->port.state->port.tty)
 		return;
 #endif
-	tty = uart->port.info->port.tty;
+	tty = uart->port.state->port.tty;
 
 	if (ANOMALY_05000363) {
 		/* The BF533 (and BF561) family of processors have a nice anomaly
@@ -331,7 +331,7 @@ static void bfin_serial_rx_chars(struct bfin_serial_port *uart)
 
 static void bfin_serial_tx_chars(struct bfin_serial_port *uart)
 {
-	struct circ_buf *xmit = &uart->port.info->xmit;
+	struct circ_buf *xmit = &uart->port.state->xmit;
 
 	if (uart_circ_empty(xmit) || uart_tx_stopped(&uart->port)) {
 #ifdef CONFIG_BF54x
@@ -398,7 +398,7 @@ static irqreturn_t bfin_serial_tx_int(int irq, void *dev_id)
 #ifdef CONFIG_SERIAL_BFIN_DMA
 static void bfin_serial_dma_tx_chars(struct bfin_serial_port *uart)
 {
-	struct circ_buf *xmit = &uart->port.info->xmit;
+	struct circ_buf *xmit = &uart->port.state->xmit;
 
 	uart->tx_done = 0;
 
@@ -436,7 +436,7 @@ static void bfin_serial_dma_tx_chars(struct bfin_serial_port *uart)
 
 static void bfin_serial_dma_rx_chars(struct bfin_serial_port *uart)
 {
-	struct tty_struct *tty = uart->port.info->port.tty;
+	struct tty_struct *tty = uart->port.state->port.tty;
 	int i, flg, status;
 
 	status = UART_GET_LSR(uart);
@@ -529,7 +529,7 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 static irqreturn_t bfin_serial_dma_tx_int(int irq, void *dev_id)
 {
 	struct bfin_serial_port *uart = dev_id;
-	struct circ_buf *xmit = &uart->port.info->xmit;
+	struct circ_buf *xmit = &uart->port.state->xmit;
 
 #ifdef CONFIG_SERIAL_BFIN_HARD_CTSRTS
 	if (uart->scts && !(bfin_serial_get_mctrl(&uart->port)&TIOCM_CTS)) {
@@ -965,10 +965,10 @@ static void bfin_serial_set_ldisc(struct uart_port *port)
 	int line = port->line;
 	unsigned short val;
 
-	if (line >= port->info->port.tty->driver->num)
+	if (line >= port->state->port.tty->driver->num)
 		return;
 
-	switch (port->info->port.tty->termios->c_line) {
+	switch (port->state->port.tty->termios->c_line) {
 	case N_IRDA:
 		val = UART_GET_GCTL(&bfin_serial_ports[line]);
 		val |= (IREN | RPOLC);
diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index c108b1a0ce98..088bb35475f1 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -178,7 +178,7 @@ static int sport_uart_setup(struct sport_uart_port *up, int sclk, int baud_rate)
 static irqreturn_t sport_uart_rx_irq(int irq, void *dev_id)
 {
 	struct sport_uart_port *up = dev_id;
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	unsigned int ch;
 
 	do {
@@ -205,7 +205,7 @@ static irqreturn_t sport_uart_tx_irq(int irq, void *dev_id)
 static irqreturn_t sport_uart_err_irq(int irq, void *dev_id)
 {
 	struct sport_uart_port *up = dev_id;
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	unsigned int stat = SPORT_GET_STAT(up);
 
 	/* Overflow in RX FIFO */
@@ -290,7 +290,7 @@ fail1:
 
 static void sport_uart_tx_chars(struct sport_uart_port *up)
 {
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 
 	if (SPORT_GET_STAT(up) & TXF)
 		return;
diff --git a/drivers/serial/clps711x.c b/drivers/serial/clps711x.c
index 80e76426131d..b6acd19b458e 100644
--- a/drivers/serial/clps711x.c
+++ b/drivers/serial/clps711x.c
@@ -93,7 +93,7 @@ static void clps711xuart_enable_ms(struct uart_port *port)
 static irqreturn_t clps711xuart_int_rx(int irq, void *dev_id)
 {
 	struct uart_port *port = dev_id;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	unsigned int status, ch, flg;
 
 	status = clps_readl(SYSFLG(port));
@@ -147,7 +147,7 @@ static irqreturn_t clps711xuart_int_rx(int irq, void *dev_id)
 static irqreturn_t clps711xuart_int_tx(int irq, void *dev_id)
 {
 	struct uart_port *port = dev_id;
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 	int count;
 
 	if (port->x_char) {
diff --git a/drivers/serial/cpm_uart/cpm_uart_core.c b/drivers/serial/cpm_uart/cpm_uart_core.c
index f8df0681e160..8d349b23249a 100644
--- a/drivers/serial/cpm_uart/cpm_uart_core.c
+++ b/drivers/serial/cpm_uart/cpm_uart_core.c
@@ -244,7 +244,7 @@ static void cpm_uart_int_rx(struct uart_port *port)
 	int i;
 	unsigned char ch;
 	u8 *cp;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	struct uart_cpm_port *pinfo = (struct uart_cpm_port *)port;
 	cbd_t __iomem *bdp;
 	u16 status;
diff --git a/drivers/serial/dz.c b/drivers/serial/dz.c
index 6042b87797a1..57421d776329 100644
--- a/drivers/serial/dz.c
+++ b/drivers/serial/dz.c
@@ -197,7 +197,7 @@ static inline void dz_receive_chars(struct dz_mux *mux)
 	while ((status = dz_in(dport, DZ_RBUF)) & DZ_DVAL) {
 		dport = &mux->dport[LINE(status)];
 		uport = &dport->port;
-		tty = uport->info->port.tty;	/* point to the proper dev */
+		tty = uport->state->port.tty;	/* point to the proper dev */
 
 		ch = UCHAR(status);		/* grab the char */
 		flag = TTY_NORMAL;
@@ -249,7 +249,7 @@ static inline void dz_receive_chars(struct dz_mux *mux)
 	}
 	for (i = 0; i < DZ_NB_PORT; i++)
 		if (lines_rx[i])
-			tty_flip_buffer_push(mux->dport[i].port.info->port.tty);
+			tty_flip_buffer_push(mux->dport[i].port.state->port.tty);
 }
 
 /*
@@ -268,7 +268,7 @@ static inline void dz_transmit_chars(struct dz_mux *mux)
 
 	status = dz_in(dport, DZ_CSR);
 	dport = &mux->dport[LINE(status)];
-	xmit = &dport->port.info->xmit;
+	xmit = &dport->port.state->xmit;
 
 	if (dport->port.x_char) {		/* XON/XOFF chars */
 		dz_out(dport, DZ_TDR, dport->port.x_char);
diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index 060f4e3d54c5..f86c47e08a06 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -627,7 +627,7 @@ static int icom_write(struct uart_port *port)
 	unsigned long data_count;
 	unsigned char cmdReg;
 	unsigned long offset;
-	int temp_tail = port->info->xmit.tail;
+	int temp_tail = port->state->xmit.tail;
 
 	trace(ICOM_PORT, "WRITE", 0);
 
@@ -638,11 +638,11 @@ static int icom_write(struct uart_port *port)
 	}
 
 	data_count = 0;
-	while ((port->info->xmit.head != temp_tail) &&
+	while ((port->state->xmit.head != temp_tail) &&
 	       (data_count <= XMIT_BUFF_SZ)) {
 
 		ICOM_PORT->xmit_buf[data_count++] =
-		    port->info->xmit.buf[temp_tail];
+		    port->state->xmit.buf[temp_tail];
 
 		temp_tail++;
 		temp_tail &= (UART_XMIT_SIZE - 1);
@@ -694,7 +694,7 @@ static inline void check_modem_status(struct icom_port *icom_port)
 			uart_handle_cts_change(&icom_port->uart_port,
 					       delta_status & ICOM_CTS);
 
-		wake_up_interruptible(&icom_port->uart_port.info->
+		wake_up_interruptible(&icom_port->uart_port.state->
 				      delta_msr_wait);
 		old_status = status;
 	}
@@ -718,10 +718,10 @@ static void xmit_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 		icom_port->uart_port.icount.tx += count;
 
 		for (i=0; i<count &&
-			!uart_circ_empty(&icom_port->uart_port.info->xmit); i++) {
+			!uart_circ_empty(&icom_port->uart_port.state->xmit); i++) {
 
-			icom_port->uart_port.info->xmit.tail++;
-			icom_port->uart_port.info->xmit.tail &=
+			icom_port->uart_port.state->xmit.tail++;
+			icom_port->uart_port.state->xmit.tail &=
 				(UART_XMIT_SIZE - 1);
 		}
 
@@ -735,7 +735,7 @@ static void xmit_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 static void recv_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 {
 	short int count, rcv_buff;
-	struct tty_struct *tty = icom_port->uart_port.info->port.tty;
+	struct tty_struct *tty = icom_port->uart_port.state->port.tty;
 	unsigned short int status;
 	struct uart_icount *icount;
 	unsigned long offset;
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 7485afd0df4c..1febeafcb97a 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -224,7 +224,7 @@ static void imx_mctrl_check(struct imx_port *sport)
 	if (changed & TIOCM_CTS)
 		uart_handle_cts_change(&sport->port, status & TIOCM_CTS);
 
-	wake_up_interruptible(&sport->port.info->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->delta_msr_wait);
 }
 
 /*
@@ -236,7 +236,7 @@ static void imx_timeout(unsigned long data)
 	struct imx_port *sport = (struct imx_port *)data;
 	unsigned long flags;
 
-	if (sport->port.info) {
+	if (sport->port.state) {
 		spin_lock_irqsave(&sport->port.lock, flags);
 		imx_mctrl_check(sport);
 		spin_unlock_irqrestore(&sport->port.lock, flags);
@@ -323,7 +323,7 @@ static void imx_enable_ms(struct uart_port *port)
 
 static inline void imx_transmit_buffer(struct imx_port *sport)
 {
-	struct circ_buf *xmit = &sport->port.info->xmit;
+	struct circ_buf *xmit = &sport->port.state->xmit;
 
 	while (!(readl(sport->port.membase + UTS) & UTS_TXFULL)) {
 		/* send xmit->buf[xmit->tail]
@@ -388,7 +388,7 @@ static irqreturn_t imx_rtsint(int irq, void *dev_id)
 
 	writel(USR1_RTSD, sport->port.membase + USR1);
 	uart_handle_cts_change(&sport->port, !!val);
-	wake_up_interruptible(&sport->port.info->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->delta_msr_wait);
 
 	spin_unlock_irqrestore(&sport->port.lock, flags);
 	return IRQ_HANDLED;
@@ -397,7 +397,7 @@ static irqreturn_t imx_rtsint(int irq, void *dev_id)
 static irqreturn_t imx_txint(int irq, void *dev_id)
 {
 	struct imx_port *sport = dev_id;
-	struct circ_buf *xmit = &sport->port.info->xmit;
+	struct circ_buf *xmit = &sport->port.state->xmit;
 	unsigned long flags;
 
 	spin_lock_irqsave(&sport->port.lock,flags);
@@ -427,7 +427,7 @@ static irqreturn_t imx_rxint(int irq, void *dev_id)
 {
 	struct imx_port *sport = dev_id;
 	unsigned int rx,flg,ignored = 0;
-	struct tty_struct *tty = sport->port.info->port.tty;
+	struct tty_struct *tty = sport->port.state->port.tty;
 	unsigned long flags, temp;
 
 	spin_lock_irqsave(&sport->port.lock,flags);
@@ -900,11 +900,11 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	rational_best_approximation(16 * div * baud, sport->port.uartclk,
 		1 << 16, 1 << 16, &num, &denom);
 
-	if (port->info && port->info->port.tty) {
+	if (port->state && port->state->port.tty) {
 		tdiv64 = sport->port.uartclk;
 		tdiv64 *= num;
 		do_div(tdiv64, denom * 16 * div);
-		tty_encode_baud_rate(sport->port.info->port.tty,
+		tty_encode_baud_rate(sport->port.state->port.tty,
 				(speed_t)tdiv64, (speed_t)tdiv64);
 	}
 
diff --git a/drivers/serial/ioc3_serial.c b/drivers/serial/ioc3_serial.c
index ae3699d77dd0..de4ab1bfee8d 100644
--- a/drivers/serial/ioc3_serial.c
+++ b/drivers/serial/ioc3_serial.c
@@ -897,25 +897,25 @@ static void transmit_chars(struct uart_port *the_port)
 	char *start;
 	struct tty_struct *tty;
 	struct ioc3_port *port = get_ioc3_port(the_port);
-	struct uart_info *info;
+	struct uart_state *state;
 
 	if (!the_port)
 		return;
 	if (!port)
 		return;
 
-	info = the_port->info;
-	tty = info->port.tty;
+	state = the_port->state;
+	tty = state->port.tty;
 
-	if (uart_circ_empty(&info->xmit) || uart_tx_stopped(the_port)) {
+	if (uart_circ_empty(&state->xmit) || uart_tx_stopped(the_port)) {
 		/* Nothing to do or hw stopped */
 		set_notification(port, N_ALL_OUTPUT, 0);
 		return;
 	}
 
-	head = info->xmit.head;
-	tail = info->xmit.tail;
-	start = (char *)&info->xmit.buf[tail];
+	head = state->xmit.head;
+	tail = state->xmit.tail;
+	start = (char *)&state->xmit.buf[tail];
 
 	/* write out all the data or until the end of the buffer */
 	xmit_count = (head < tail) ? (UART_XMIT_SIZE - tail) : (head - tail);
@@ -928,14 +928,14 @@ static void transmit_chars(struct uart_port *the_port)
 			/* advance the pointers */
 			tail += result;
 			tail &= UART_XMIT_SIZE - 1;
-			info->xmit.tail = tail;
-			start = (char *)&info->xmit.buf[tail];
+			state->xmit.tail = tail;
+			start = (char *)&state->xmit.buf[tail];
 		}
 	}
-	if (uart_circ_chars_pending(&info->xmit) < WAKEUP_CHARS)
+	if (uart_circ_chars_pending(&state->xmit) < WAKEUP_CHARS)
 		uart_write_wakeup(the_port);
 
-	if (uart_circ_empty(&info->xmit)) {
+	if (uart_circ_empty(&state->xmit)) {
 		set_notification(port, N_OUTPUT_LOWAT, 0);
 	} else {
 		set_notification(port, N_OUTPUT_LOWAT, 1);
@@ -956,7 +956,7 @@ ioc3_change_speed(struct uart_port *the_port,
 	unsigned int cflag;
 	int baud;
 	int new_parity = 0, new_parity_enable = 0, new_stop = 0, new_data = 8;
-	struct uart_info *info = the_port->info;
+	struct uart_state *state = the_port->state;
 
 	cflag = new_termios->c_cflag;
 
@@ -997,14 +997,14 @@ ioc3_change_speed(struct uart_port *the_port,
 
 	the_port->ignore_status_mask = N_ALL_INPUT;
 
-	info->port.tty->low_latency = 1;
+	state->port.tty->low_latency = 1;
 
-	if (I_IGNPAR(info->port.tty))
+	if (I_IGNPAR(state->port.tty))
 		the_port->ignore_status_mask &= ~(N_PARITY_ERROR
 						  | N_FRAMING_ERROR);
-	if (I_IGNBRK(info->port.tty)) {
+	if (I_IGNBRK(state->port.tty)) {
 		the_port->ignore_status_mask &= ~N_BREAK;
-		if (I_IGNPAR(info->port.tty))
+		if (I_IGNPAR(state->port.tty))
 			the_port->ignore_status_mask &= ~N_OVERRUN_ERROR;
 	}
 	if (!(cflag & CREAD)) {
@@ -1286,7 +1286,7 @@ static inline int do_read(struct uart_port *the_port, char *buf, int len)
 						uart_handle_dcd_change
 							(port->ip_port, 0);
 						wake_up_interruptible
-						    (&the_port->info->
+						    (&the_port->state->
 						     delta_msr_wait);
 					}
 
@@ -1392,21 +1392,21 @@ static int receive_chars(struct uart_port *the_port)
 	struct tty_struct *tty;
 	unsigned char ch[MAX_CHARS];
 	int read_count = 0, read_room, flip = 0;
-	struct uart_info *info = the_port->info;
+	struct uart_state *state = the_port->state;
 	struct ioc3_port *port = get_ioc3_port(the_port);
 	unsigned long pflags;
 
 	/* Make sure all the pointers are "good" ones */
-	if (!info)
+	if (!state)
 		return 0;
-	if (!info->port.tty)
+	if (!state->port.tty)
 		return 0;
 
 	if (!(port->ip_flags & INPUT_ENABLE))
 		return 0;
 
 	spin_lock_irqsave(&the_port->lock, pflags);
-	tty = info->port.tty;
+	tty = state->port.tty;
 
 	read_count = do_read(the_port, ch, MAX_CHARS);
 	if (read_count > 0) {
@@ -1491,7 +1491,7 @@ ioc3uart_intr_one(struct ioc3_submodule *is,
 				uart_handle_dcd_change(the_port,
 						shadow & SHADOW_DCD);
 				wake_up_interruptible
-				    (&the_port->info->delta_msr_wait);
+				    (&the_port->state->delta_msr_wait);
 			} else if ((port->ip_notify & N_DDCD)
 				   && !(shadow & SHADOW_DCD)) {
 				/* Flag delta DCD/no DCD */
@@ -1511,7 +1511,7 @@ ioc3uart_intr_one(struct ioc3_submodule *is,
 				uart_handle_cts_change(the_port, shadow
 						& SHADOW_CTS);
 				wake_up_interruptible
-				    (&the_port->info->delta_msr_wait);
+				    (&the_port->state->delta_msr_wait);
 			}
 		}
 
@@ -1721,14 +1721,14 @@ static void ic3_shutdown(struct uart_port *the_port)
 {
 	unsigned long port_flags;
 	struct ioc3_port *port;
-	struct uart_info *info;
+	struct uart_state *state;
 
 	port = get_ioc3_port(the_port);
 	if (!port)
 		return;
 
-	info = the_port->info;
-	wake_up_interruptible(&info->delta_msr_wait);
+	state = the_port->state;
+	wake_up_interruptible(&state->delta_msr_wait);
 
 	spin_lock_irqsave(&the_port->lock, port_flags);
 	set_notification(port, N_ALL, 0);
diff --git a/drivers/serial/ioc4_serial.c b/drivers/serial/ioc4_serial.c
index e5c58fe7e745..2055d323f15f 100644
--- a/drivers/serial/ioc4_serial.c
+++ b/drivers/serial/ioc4_serial.c
@@ -1627,25 +1627,25 @@ static void transmit_chars(struct uart_port *the_port)
 	char *start;
 	struct tty_struct *tty;
 	struct ioc4_port *port = get_ioc4_port(the_port, 0);
-	struct uart_info *info;
+	struct uart_state *state;
 
 	if (!the_port)
 		return;
 	if (!port)
 		return;
 
-	info = the_port->info;
-	tty = info->port.tty;
+	state = the_port->state;
+	tty = state->port.tty;
 
-	if (uart_circ_empty(&info->xmit) || uart_tx_stopped(the_port)) {
+	if (uart_circ_empty(&state->xmit) || uart_tx_stopped(the_port)) {
 		/* Nothing to do or hw stopped */
 		set_notification(port, N_ALL_OUTPUT, 0);
 		return;
 	}
 
-	head = info->xmit.head;
-	tail = info->xmit.tail;
-	start = (char *)&info->xmit.buf[tail];
+	head = state->xmit.head;
+	tail = state->xmit.tail;
+	start = (char *)&state->xmit.buf[tail];
 
 	/* write out all the data or until the end of the buffer */
 	xmit_count = (head < tail) ? (UART_XMIT_SIZE - tail) : (head - tail);
@@ -1658,14 +1658,14 @@ static void transmit_chars(struct uart_port *the_port)
 			/* advance the pointers */
 			tail += result;
 			tail &= UART_XMIT_SIZE - 1;
-			info->xmit.tail = tail;
-			start = (char *)&info->xmit.buf[tail];
+			state->xmit.tail = tail;
+			start = (char *)&state->xmit.buf[tail];
 		}
 	}
-	if (uart_circ_chars_pending(&info->xmit) < WAKEUP_CHARS)
+	if (uart_circ_chars_pending(&state->xmit) < WAKEUP_CHARS)
 		uart_write_wakeup(the_port);
 
-	if (uart_circ_empty(&info->xmit)) {
+	if (uart_circ_empty(&state->xmit)) {
 		set_notification(port, N_OUTPUT_LOWAT, 0);
 	} else {
 		set_notification(port, N_OUTPUT_LOWAT, 1);
@@ -1686,7 +1686,7 @@ ioc4_change_speed(struct uart_port *the_port,
 	int baud, bits;
 	unsigned cflag;
 	int new_parity = 0, new_parity_enable = 0, new_stop = 0, new_data = 8;
-	struct uart_info *info = the_port->info;
+	struct uart_state *state = the_port->state;
 
 	cflag = new_termios->c_cflag;
 
@@ -1738,14 +1738,14 @@ ioc4_change_speed(struct uart_port *the_port,
 
 	the_port->ignore_status_mask = N_ALL_INPUT;
 
-	info->port.tty->low_latency = 1;
+	state->port.tty->low_latency = 1;
 
-	if (I_IGNPAR(info->port.tty))
+	if (I_IGNPAR(state->port.tty))
 		the_port->ignore_status_mask &= ~(N_PARITY_ERROR
 						| N_FRAMING_ERROR);
-	if (I_IGNBRK(info->port.tty)) {
+	if (I_IGNBRK(state->port.tty)) {
 		the_port->ignore_status_mask &= ~N_BREAK;
-		if (I_IGNPAR(info->port.tty))
+		if (I_IGNPAR(state->port.tty))
 			the_port->ignore_status_mask &= ~N_OVERRUN_ERROR;
 	}
 	if (!(cflag & CREAD)) {
@@ -1784,7 +1784,7 @@ ioc4_change_speed(struct uart_port *the_port,
 static inline int ic4_startup_local(struct uart_port *the_port)
 {
 	struct ioc4_port *port;
-	struct uart_info *info;
+	struct uart_state *state;
 
 	if (!the_port)
 		return -1;
@@ -1793,7 +1793,7 @@ static inline int ic4_startup_local(struct uart_port *the_port)
 	if (!port)
 		return -1;
 
-	info = the_port->info;
+	state = the_port->state;
 
 	local_open(port);
 
@@ -1801,7 +1801,7 @@ static inline int ic4_startup_local(struct uart_port *the_port)
 	ioc4_set_proto(port, the_port->mapbase);
 
 	/* set the speed of the serial port */
-	ioc4_change_speed(the_port, info->port.tty->termios,
+	ioc4_change_speed(the_port, state->port.tty->termios,
 			  (struct ktermios *)0);
 
 	return 0;
@@ -1882,7 +1882,7 @@ static void handle_intr(void *arg, uint32_t sio_ir)
 				the_port = port->ip_port;
 				the_port->icount.dcd = 1;
 				wake_up_interruptible
-					    (&the_port-> info->delta_msr_wait);
+					    (&the_port->state->delta_msr_wait);
 			} else if ((port->ip_notify & N_DDCD)
 					&& !(shadow & IOC4_SHADOW_DCD)) {
 				/* Flag delta DCD/no DCD */
@@ -1904,7 +1904,7 @@ static void handle_intr(void *arg, uint32_t sio_ir)
 				the_port->icount.cts =
 					(shadow & IOC4_SHADOW_CTS) ? 1 : 0;
 				wake_up_interruptible
-					(&the_port->info->delta_msr_wait);
+					(&the_port->state->delta_msr_wait);
 			}
 		}
 
@@ -2236,7 +2236,7 @@ static inline int do_read(struct uart_port *the_port, unsigned char *buf,
 						   && port->ip_port) {
 						the_port->icount.dcd = 0;
 						wake_up_interruptible
-						    (&the_port->info->
+						    (&the_port->state->
 							delta_msr_wait);
 					}
 
@@ -2341,17 +2341,17 @@ static void receive_chars(struct uart_port *the_port)
 	unsigned char ch[IOC4_MAX_CHARS];
 	int read_count, request_count = IOC4_MAX_CHARS;
 	struct uart_icount *icount;
-	struct uart_info *info = the_port->info;
+	struct uart_state *state = the_port->state;
 	unsigned long pflags;
 
 	/* Make sure all the pointers are "good" ones */
-	if (!info)
+	if (!state)
 		return;
-	if (!info->port.tty)
+	if (!state->port.tty)
 		return;
 
 	spin_lock_irqsave(&the_port->lock, pflags);
-	tty = info->port.tty;
+	tty = state->port.tty;
 
 	request_count = tty_buffer_request_room(tty, IOC4_MAX_CHARS);
 
@@ -2430,19 +2430,19 @@ static void ic4_shutdown(struct uart_port *the_port)
 {
 	unsigned long port_flags;
 	struct ioc4_port *port;
-	struct uart_info *info;
+	struct uart_state *state;
 
 	port = get_ioc4_port(the_port, 0);
 	if (!port)
 		return;
 
-	info = the_port->info;
+	state = the_port->state;
 	port->ip_port = NULL;
 
-	wake_up_interruptible(&info->delta_msr_wait);
+	wake_up_interruptible(&state->delta_msr_wait);
 
-	if (info->port.tty)
-		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
+	if (state->port.tty)
+		set_bit(TTY_IO_ERROR, &state->port.tty->flags);
 
 	spin_lock_irqsave(&the_port->lock, port_flags);
 	set_notification(port, N_ALL, 0);
@@ -2538,7 +2538,7 @@ static int ic4_startup(struct uart_port *the_port)
 	int retval;
 	struct ioc4_port *port;
 	struct ioc4_control *control;
-	struct uart_info *info;
+	struct uart_state *state;
 	unsigned long port_flags;
 
 	if (!the_port)
@@ -2546,7 +2546,7 @@ static int ic4_startup(struct uart_port *the_port)
 	port = get_ioc4_port(the_port, 1);
 	if (!port)
 		return -ENODEV;
-	info = the_port->info;
+	state = the_port->state;
 
 	control = port->ip_control;
 	if (!control) {
diff --git a/drivers/serial/ip22zilog.c b/drivers/serial/ip22zilog.c
index 0d9acbd0bb70..2e847deb41dc 100644
--- a/drivers/serial/ip22zilog.c
+++ b/drivers/serial/ip22zilog.c
@@ -256,9 +256,9 @@ static struct tty_struct *ip22zilog_receive_chars(struct uart_ip22zilog_port *up
 	unsigned int r1;
 
 	tty = NULL;
-	if (up->port.info != NULL &&
-	    up->port.info->port.tty != NULL)
-		tty = up->port.info->port.tty;
+	if (up->port.state != NULL &&
+	    up->port.state->port.tty != NULL)
+		tty = up->port.state->port.tty;
 
 	for (;;) {
 		ch = readb(&channel->control);
@@ -354,7 +354,7 @@ static void ip22zilog_status_handle(struct uart_ip22zilog_port *up,
 			uart_handle_cts_change(&up->port,
 					       (status & CTS));
 
-		wake_up_interruptible(&up->port.info->delta_msr_wait);
+		wake_up_interruptible(&up->port.state->delta_msr_wait);
 	}
 
 	up->prev_status = status;
@@ -404,9 +404,9 @@ static void ip22zilog_transmit_chars(struct uart_ip22zilog_port *up,
 		return;
 	}
 
-	if (up->port.info == NULL)
+	if (up->port.state == NULL)
 		goto ack_tx_int;
-	xmit = &up->port.info->xmit;
+	xmit = &up->port.state->xmit;
 	if (uart_circ_empty(xmit))
 		goto ack_tx_int;
 	if (uart_tx_stopped(&up->port))
@@ -607,7 +607,7 @@ static void ip22zilog_start_tx(struct uart_port *port)
 		port->icount.tx++;
 		port->x_char = 0;
 	} else {
-		struct circ_buf *xmit = &port->info->xmit;
+		struct circ_buf *xmit = &port->state->xmit;
 
 		writeb(xmit->buf[xmit->tail], &channel->data);
 		ZSDELAY();
diff --git a/drivers/serial/jsm/jsm_neo.c b/drivers/serial/jsm/jsm_neo.c
index 9dadaa11d266..b4b124e4828f 100644
--- a/drivers/serial/jsm/jsm_neo.c
+++ b/drivers/serial/jsm/jsm_neo.c
@@ -989,7 +989,7 @@ static void neo_param(struct jsm_channel *ch)
 			{     50, B50     },
 		};
 
-		cflag = C_BAUD(ch->uart_port.info->port.tty);
+		cflag = C_BAUD(ch->uart_port.state->port.tty);
 		baud = 9600;
 		for (i = 0; i < ARRAY_SIZE(baud_rates); i++) {
 			if (baud_rates[i].cflag == cflag) {
diff --git a/drivers/serial/jsm/jsm_tty.c b/drivers/serial/jsm/jsm_tty.c
index 00f4577d2f7f..7439c0373620 100644
--- a/drivers/serial/jsm/jsm_tty.c
+++ b/drivers/serial/jsm/jsm_tty.c
@@ -147,7 +147,7 @@ static void jsm_tty_send_xchar(struct uart_port *port, char ch)
 	struct ktermios *termios;
 
 	spin_lock_irqsave(&port->lock, lock_flags);
-	termios = port->info->port.tty->termios;
+	termios = port->state->port.tty->termios;
 	if (ch == termios->c_cc[VSTART])
 		channel->ch_bd->bd_ops->send_start_character(channel);
 
@@ -245,7 +245,7 @@ static int jsm_tty_open(struct uart_port *port)
 	channel->ch_cached_lsr = 0;
 	channel->ch_stops_sent = 0;
 
-	termios = port->info->port.tty->termios;
+	termios = port->state->port.tty->termios;
 	channel->ch_c_cflag	= termios->c_cflag;
 	channel->ch_c_iflag	= termios->c_iflag;
 	channel->ch_c_oflag	= termios->c_oflag;
@@ -278,7 +278,7 @@ static void jsm_tty_close(struct uart_port *port)
 	jsm_printk(CLOSE, INFO, &channel->ch_bd->pci_dev, "start\n");
 
 	bd = channel->ch_bd;
-	ts = port->info->port.tty->termios;
+	ts = port->state->port.tty->termios;
 
 	channel->ch_flags &= ~(CH_STOPI);
 
@@ -530,7 +530,7 @@ void jsm_input(struct jsm_channel *ch)
 	if (!ch)
 		return;
 
-	tp = ch->uart_port.info->port.tty;
+	tp = ch->uart_port.state->port.tty;
 
 	bd = ch->ch_bd;
 	if(!bd)
@@ -849,7 +849,7 @@ int jsm_tty_write(struct uart_port *port)
 	u16 tail;
 	u16 tmask;
 	u32 remain;
-	int temp_tail = port->info->xmit.tail;
+	int temp_tail = port->state->xmit.tail;
 	struct jsm_channel *channel = (struct jsm_channel *)port;
 
 	tmask = WQUEUEMASK;
@@ -865,10 +865,10 @@ int jsm_tty_write(struct uart_port *port)
 	data_count = 0;
 	if (bufcount >= remain) {
 		bufcount -= remain;
-		while ((port->info->xmit.head != temp_tail) &&
+		while ((port->state->xmit.head != temp_tail) &&
 		(data_count < remain)) {
 			channel->ch_wqueue[head++] =
-			port->info->xmit.buf[temp_tail];
+			port->state->xmit.buf[temp_tail];
 
 			temp_tail++;
 			temp_tail &= (UART_XMIT_SIZE - 1);
@@ -880,10 +880,10 @@ int jsm_tty_write(struct uart_port *port)
 	data_count1 = 0;
 	if (bufcount > 0) {
 		remain = bufcount;
-		while ((port->info->xmit.head != temp_tail) &&
+		while ((port->state->xmit.head != temp_tail) &&
 			(data_count1 < remain)) {
 			channel->ch_wqueue[head++] =
-				port->info->xmit.buf[temp_tail];
+				port->state->xmit.buf[temp_tail];
 
 			temp_tail++;
 			temp_tail &= (UART_XMIT_SIZE - 1);
@@ -892,7 +892,7 @@ int jsm_tty_write(struct uart_port *port)
 		}
 	}
 
-	port->info->xmit.tail = temp_tail;
+	port->state->xmit.tail = temp_tail;
 
 	data_count += data_count1;
 	if (data_count) {
diff --git a/drivers/serial/m32r_sio.c b/drivers/serial/m32r_sio.c
index 611c97a15654..bea5c215460c 100644
--- a/drivers/serial/m32r_sio.c
+++ b/drivers/serial/m32r_sio.c
@@ -286,7 +286,7 @@ static void m32r_sio_start_tx(struct uart_port *port)
 {
 #ifdef CONFIG_SERIAL_M32R_PLDSIO
 	struct uart_sio_port *up = (struct uart_sio_port *)port;
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 
 	if (!(up->ier & UART_IER_THRI)) {
 		up->ier |= UART_IER_THRI;
@@ -325,7 +325,7 @@ static void m32r_sio_enable_ms(struct uart_port *port)
 
 static void receive_chars(struct uart_sio_port *up, int *status)
 {
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	unsigned char ch;
 	unsigned char flag;
 	int max_count = 256;
@@ -398,7 +398,7 @@ static void receive_chars(struct uart_sio_port *up, int *status)
 
 static void transmit_chars(struct uart_sio_port *up)
 {
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 	int count;
 
 	if (up->port.x_char) {
diff --git a/drivers/serial/max3100.c b/drivers/serial/max3100.c
index 9fd33e5622bd..75ab00631c41 100644
--- a/drivers/serial/max3100.c
+++ b/drivers/serial/max3100.c
@@ -184,7 +184,7 @@ static void max3100_timeout(unsigned long data)
 {
 	struct max3100_port *s = (struct max3100_port *)data;
 
-	if (s->port.info) {
+	if (s->port.state) {
 		max3100_dowork(s);
 		mod_timer(&s->timer, jiffies + s->poll_time);
 	}
@@ -261,7 +261,7 @@ static void max3100_work(struct work_struct *w)
 	int rxchars;
 	u16 tx, rx;
 	int conf, cconf, rts, crts;
-	struct circ_buf *xmit = &s->port.info->xmit;
+	struct circ_buf *xmit = &s->port.state->xmit;
 
 	dev_dbg(&s->spi->dev, "%s\n", __func__);
 
@@ -307,8 +307,8 @@ static void max3100_work(struct work_struct *w)
 			}
 		}
 
-		if (rxchars > 16 && s->port.info->port.tty != NULL) {
-			tty_flip_buffer_push(s->port.info->port.tty);
+		if (rxchars > 16 && s->port.state->port.tty != NULL) {
+			tty_flip_buffer_push(s->port.state->port.tty);
 			rxchars = 0;
 		}
 		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
@@ -320,8 +320,8 @@ static void max3100_work(struct work_struct *w)
 		  (!uart_circ_empty(xmit) &&
 		   !uart_tx_stopped(&s->port))));
 
-	if (rxchars > 0 && s->port.info->port.tty != NULL)
-		tty_flip_buffer_push(s->port.info->port.tty);
+	if (rxchars > 0 && s->port.state->port.tty != NULL)
+		tty_flip_buffer_push(s->port.state->port.tty);
 }
 
 static irqreturn_t max3100_irq(int irqno, void *dev_id)
@@ -429,7 +429,7 @@ max3100_set_termios(struct uart_port *port, struct ktermios *termios,
 	int baud = 0;
 	unsigned cflag;
 	u32 param_new, param_mask, parity = 0;
-	struct tty_struct *tty = s->port.info->port.tty;
+	struct tty_struct *tty = s->port.state->port.tty;
 
 	dev_dbg(&s->spi->dev, "%s\n", __func__);
 	if (!tty)
@@ -529,7 +529,7 @@ max3100_set_termios(struct uart_port *port, struct ktermios *termios,
 			MAX3100_STATUS_OE;
 
 	/* we are sending char from a workqueue so enable */
-	s->port.info->port.tty->low_latency = 1;
+	s->port.state->port.tty->low_latency = 1;
 
 	if (s->poll_time > 0)
 		del_timer_sync(&s->timer);
diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c
index 0eefb07bebaf..b44382442bf1 100644
--- a/drivers/serial/mcf.c
+++ b/drivers/serial/mcf.c
@@ -323,7 +323,7 @@ static void mcf_rx_chars(struct mcf_uart *pp)
 		uart_insert_char(port, status, MCFUART_USR_RXOVERRUN, ch, flag);
 	}
 
-	tty_flip_buffer_push(port->info->port.tty);
+	tty_flip_buffer_push(port->state->port.tty);
 }
 
 /****************************************************************************/
@@ -331,7 +331,7 @@ static void mcf_rx_chars(struct mcf_uart *pp)
 static void mcf_tx_chars(struct mcf_uart *pp)
 {
 	struct uart_port *port = &pp->port;
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	if (port->x_char) {
 		/* Send special char - probably flow control */
diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index abbd146c50d9..d7bcd074d383 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -745,7 +745,7 @@ static struct uart_ops mpc52xx_uart_ops = {
 static inline int
 mpc52xx_uart_int_rx_chars(struct uart_port *port)
 {
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	unsigned char ch, flag;
 	unsigned short status;
 
@@ -812,7 +812,7 @@ mpc52xx_uart_int_rx_chars(struct uart_port *port)
 static inline int
 mpc52xx_uart_int_tx_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	/* Process out of band chars */
 	if (port->x_char) {
diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 61d3ade5286c..b5496c28e60b 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -936,7 +936,7 @@ static int serial_polled;
 static int mpsc_rx_intr(struct mpsc_port_info *pi)
 {
 	struct mpsc_rx_desc *rxre;
-	struct tty_struct *tty = pi->port.info->port.tty;
+	struct tty_struct *tty = pi->port.state->port.tty;
 	u32	cmdstat, bytes_in, i;
 	int	rc = 0;
 	u8	*bp;
@@ -1109,7 +1109,7 @@ static void mpsc_setup_tx_desc(struct mpsc_port_info *pi, u32 count, u32 intr)
 
 static void mpsc_copy_tx_data(struct mpsc_port_info *pi)
 {
-	struct circ_buf *xmit = &pi->port.info->xmit;
+	struct circ_buf *xmit = &pi->port.state->xmit;
 	u8 *bp;
 	u32 i;
 
diff --git a/drivers/serial/msm_serial.c b/drivers/serial/msm_serial.c
index f7c24baa1416..ff18d50c99c1 100644
--- a/drivers/serial/msm_serial.c
+++ b/drivers/serial/msm_serial.c
@@ -88,7 +88,7 @@ static void msm_enable_ms(struct uart_port *port)
 
 static void handle_rx(struct uart_port *port)
 {
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	unsigned int sr;
 
 	/*
@@ -136,7 +136,7 @@ static void handle_rx(struct uart_port *port)
 
 static void handle_tx(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 	struct msm_port *msm_port = UART_TO_MSM(port);
 	int sent_tx;
 
@@ -169,7 +169,7 @@ static void handle_delta_cts(struct uart_port *port)
 {
 	msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR);
 	port->icount.cts++;
-	wake_up_interruptible(&port->info->delta_msr_wait);
+	wake_up_interruptible(&port->state->delta_msr_wait);
 }
 
 static irqreturn_t msm_irq(int irq, void *dev_id)
diff --git a/drivers/serial/mux.c b/drivers/serial/mux.c
index 953a5ffa9b44..7571aaa138b0 100644
--- a/drivers/serial/mux.c
+++ b/drivers/serial/mux.c
@@ -199,7 +199,7 @@ static void mux_break_ctl(struct uart_port *port, int break_state)
 static void mux_write(struct uart_port *port)
 {
 	int count;
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	if(port->x_char) {
 		UART_PUT_CHAR(port, port->x_char);
@@ -243,7 +243,7 @@ static void mux_write(struct uart_port *port)
 static void mux_read(struct uart_port *port)
 {
 	int data;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	__u32 start_count = port->icount.rx;
 
 	while(1) {
diff --git a/drivers/serial/netx-serial.c b/drivers/serial/netx-serial.c
index 3e5dda8518b7..7735c9f35fa0 100644
--- a/drivers/serial/netx-serial.c
+++ b/drivers/serial/netx-serial.c
@@ -140,7 +140,7 @@ static void netx_enable_ms(struct uart_port *port)
 
 static inline void netx_transmit_buffer(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	if (port->x_char) {
 		writel(port->x_char, port->membase + UART_DR);
@@ -185,7 +185,7 @@ static unsigned int netx_tx_empty(struct uart_port *port)
 
 static void netx_txint(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
 		netx_stop_tx(port);
@@ -201,7 +201,7 @@ static void netx_txint(struct uart_port *port)
 static void netx_rxint(struct uart_port *port)
 {
 	unsigned char rx, flg, status;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 
 	while (!(readl(port->membase + UART_FR) & FR_RXFE)) {
 		rx = readl(port->membase + UART_DR);
diff --git a/drivers/serial/nwpserial.c b/drivers/serial/nwpserial.c
index 9e150b19d726..e1ab8ec0a4a6 100644
--- a/drivers/serial/nwpserial.c
+++ b/drivers/serial/nwpserial.c
@@ -126,7 +126,7 @@ static void nwpserial_config_port(struct uart_port *port, int flags)
 static irqreturn_t nwpserial_interrupt(int irq, void *dev_id)
 {
 	struct nwpserial_port *up = dev_id;
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	irqreturn_t ret;
 	unsigned int iir;
 	unsigned char ch;
@@ -261,7 +261,7 @@ static void nwpserial_start_tx(struct uart_port *port)
 	struct nwpserial_port *up;
 	struct circ_buf *xmit;
 	up = container_of(port, struct nwpserial_port, port);
-	xmit  = &up->port.info->xmit;
+	xmit  = &up->port.state->xmit;
 
 	if (port->x_char) {
 		nwpserial_putchar(up, up->port.x_char);
diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index 9c1243fbd512..ab4c85ba3549 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -242,12 +242,12 @@ static struct tty_struct *pmz_receive_chars(struct uart_pmac_port *uap)
 	}
 
 	/* Sanity check, make sure the old bug is no longer happening */
-	if (uap->port.info == NULL || uap->port.info->port.tty == NULL) {
+	if (uap->port.state == NULL || uap->port.state->port.tty == NULL) {
 		WARN_ON(1);
 		(void)read_zsdata(uap);
 		return NULL;
 	}
-	tty = uap->port.info->port.tty;
+	tty = uap->port.state->port.tty;
 
 	while (1) {
 		error = 0;
@@ -369,7 +369,7 @@ static void pmz_status_handle(struct uart_pmac_port *uap)
 			uart_handle_cts_change(&uap->port,
 					       !(status & CTS));
 
-		wake_up_interruptible(&uap->port.info->delta_msr_wait);
+		wake_up_interruptible(&uap->port.state->delta_msr_wait);
 	}
 
 	if (status & BRK_ABRT)
@@ -420,9 +420,9 @@ static void pmz_transmit_chars(struct uart_pmac_port *uap)
 		return;
 	}
 
-	if (uap->port.info == NULL)
+	if (uap->port.state == NULL)
 		goto ack_tx_int;
-	xmit = &uap->port.info->xmit;
+	xmit = &uap->port.state->xmit;
 	if (uart_circ_empty(xmit)) {
 		uart_write_wakeup(&uap->port);
 		goto ack_tx_int;
@@ -655,7 +655,7 @@ static void pmz_start_tx(struct uart_port *port)
 		port->icount.tx++;
 		port->x_char = 0;
 	} else {
-		struct circ_buf *xmit = &port->info->xmit;
+		struct circ_buf *xmit = &port->state->xmit;
 
 		write_zsdata(uap, xmit->buf[xmit->tail]);
 		zssync(uap);
diff --git a/drivers/serial/pnx8xxx_uart.c b/drivers/serial/pnx8xxx_uart.c
index 1bb8f1b45767..2da747635275 100644
--- a/drivers/serial/pnx8xxx_uart.c
+++ b/drivers/serial/pnx8xxx_uart.c
@@ -100,7 +100,7 @@ static void pnx8xxx_mctrl_check(struct pnx8xxx_port *sport)
 	if (changed & TIOCM_CTS)
 		uart_handle_cts_change(&sport->port, status & TIOCM_CTS);
 
-	wake_up_interruptible(&sport->port.info->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->delta_msr_wait);
 }
 
 /*
@@ -112,7 +112,7 @@ static void pnx8xxx_timeout(unsigned long data)
 	struct pnx8xxx_port *sport = (struct pnx8xxx_port *)data;
 	unsigned long flags;
 
-	if (sport->port.info) {
+	if (sport->port.state) {
 		spin_lock_irqsave(&sport->port.lock, flags);
 		pnx8xxx_mctrl_check(sport);
 		spin_unlock_irqrestore(&sport->port.lock, flags);
@@ -181,7 +181,7 @@ static void pnx8xxx_enable_ms(struct uart_port *port)
 
 static void pnx8xxx_rx_chars(struct pnx8xxx_port *sport)
 {
-	struct tty_struct *tty = sport->port.info->port.tty;
+	struct tty_struct *tty = sport->port.state->port.tty;
 	unsigned int status, ch, flg;
 
 	status = FIFO_TO_SM(serial_in(sport, PNX8XXX_FIFO)) |
@@ -243,7 +243,7 @@ static void pnx8xxx_rx_chars(struct pnx8xxx_port *sport)
 
 static void pnx8xxx_tx_chars(struct pnx8xxx_port *sport)
 {
-	struct circ_buf *xmit = &sport->port.info->xmit;
+	struct circ_buf *xmit = &sport->port.state->xmit;
 
 	if (sport->port.x_char) {
 		serial_out(sport, PNX8XXX_FIFO, sport->port.x_char);
diff --git a/drivers/serial/pxa.c b/drivers/serial/pxa.c
index a48a8a13d87b..ad48919c0415 100644
--- a/drivers/serial/pxa.c
+++ b/drivers/serial/pxa.c
@@ -96,7 +96,7 @@ static void serial_pxa_stop_rx(struct uart_port *port)
 
 static inline void receive_chars(struct uart_pxa_port *up, int *status)
 {
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	unsigned int ch, flag;
 	int max_count = 256;
 
@@ -161,7 +161,7 @@ static inline void receive_chars(struct uart_pxa_port *up, int *status)
 
 static void transmit_chars(struct uart_pxa_port *up)
 {
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 	int count;
 
 	if (up->port.x_char) {
@@ -220,7 +220,7 @@ static inline void check_modem_status(struct uart_pxa_port *up)
 	if (status & UART_MSR_DCTS)
 		uart_handle_cts_change(&up->port, status & UART_MSR_CTS);
 
-	wake_up_interruptible(&up->port.info->delta_msr_wait);
+	wake_up_interruptible(&up->port.state->delta_msr_wait);
 }
 
 /*
diff --git a/drivers/serial/sa1100.c b/drivers/serial/sa1100.c
index 94530f01521e..61ef3ae24927 100644
--- a/drivers/serial/sa1100.c
+++ b/drivers/serial/sa1100.c
@@ -117,7 +117,7 @@ static void sa1100_mctrl_check(struct sa1100_port *sport)
 	if (changed & TIOCM_CTS)
 		uart_handle_cts_change(&sport->port, status & TIOCM_CTS);
 
-	wake_up_interruptible(&sport->port.info->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->delta_msr_wait);
 }
 
 /*
@@ -129,7 +129,7 @@ static void sa1100_timeout(unsigned long data)
 	struct sa1100_port *sport = (struct sa1100_port *)data;
 	unsigned long flags;
 
-	if (sport->port.info) {
+	if (sport->port.state) {
 		spin_lock_irqsave(&sport->port.lock, flags);
 		sa1100_mctrl_check(sport);
 		spin_unlock_irqrestore(&sport->port.lock, flags);
@@ -189,7 +189,7 @@ static void sa1100_enable_ms(struct uart_port *port)
 static void
 sa1100_rx_chars(struct sa1100_port *sport)
 {
-	struct tty_struct *tty = sport->port.info->port.tty;
+	struct tty_struct *tty = sport->port.state->port.tty;
 	unsigned int status, ch, flg;
 
 	status = UTSR1_TO_SM(UART_GET_UTSR1(sport)) |
@@ -239,7 +239,7 @@ sa1100_rx_chars(struct sa1100_port *sport)
 
 static void sa1100_tx_chars(struct sa1100_port *sport)
 {
-	struct circ_buf *xmit = &sport->port.info->xmit;
+	struct circ_buf *xmit = &sport->port.state->xmit;
 
 	if (sport->port.x_char) {
 		UART_PUT_CHAR(sport, sport->port.x_char);
diff --git a/drivers/serial/samsung.c b/drivers/serial/samsung.c
index c8851a0db63a..1523e8d9ae77 100644
--- a/drivers/serial/samsung.c
+++ b/drivers/serial/samsung.c
@@ -196,7 +196,7 @@ s3c24xx_serial_rx_chars(int irq, void *dev_id)
 {
 	struct s3c24xx_uart_port *ourport = dev_id;
 	struct uart_port *port = &ourport->port;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	unsigned int ufcon, ch, flag, ufstat, uerstat;
 	int max_count = 64;
 
@@ -281,7 +281,7 @@ static irqreturn_t s3c24xx_serial_tx_chars(int irq, void *id)
 {
 	struct s3c24xx_uart_port *ourport = id;
 	struct uart_port *port = &ourport->port;
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 	int count = 256;
 
 	if (port->x_char) {
@@ -992,10 +992,10 @@ static int s3c24xx_serial_cpufreq_transition(struct notifier_block *nb,
 		struct ktermios *termios;
 		struct tty_struct *tty;
 
-		if (uport->info == NULL)
+		if (uport->state == NULL)
 			goto exit;
 
-		tty = uport->info->port.tty;
+		tty = uport->state->port.tty;
 
 		if (tty == NULL)
 			goto exit;
diff --git a/drivers/serial/sb1250-duart.c b/drivers/serial/sb1250-duart.c
index 319e8b83f6be..fa5f303b36d3 100644
--- a/drivers/serial/sb1250-duart.c
+++ b/drivers/serial/sb1250-duart.c
@@ -384,13 +384,13 @@ static void sbd_receive_chars(struct sbd_port *sport)
 		uart_insert_char(uport, status, M_DUART_OVRUN_ERR, ch, flag);
 	}
 
-	tty_flip_buffer_push(uport->info->port.tty);
+	tty_flip_buffer_push(uport->state->port.tty);
 }
 
 static void sbd_transmit_chars(struct sbd_port *sport)
 {
 	struct uart_port *uport = &sport->port;
-	struct circ_buf *xmit = &sport->port.info->xmit;
+	struct circ_buf *xmit = &sport->port.state->xmit;
 	unsigned int mask;
 	int stop_tx;
 
@@ -440,7 +440,7 @@ static void sbd_status_handle(struct sbd_port *sport)
 
 	if (delta & ((M_DUART_IN_PIN2_VAL | M_DUART_IN_PIN0_VAL) <<
 		     S_DUART_IN_PIN_CHNG))
-		wake_up_interruptible(&uport->info->delta_msr_wait);
+		wake_up_interruptible(&uport->state->delta_msr_wait);
 }
 
 static irqreturn_t sbd_interrupt(int irq, void *dev_id)
diff --git a/drivers/serial/sc26xx.c b/drivers/serial/sc26xx.c
index e0be11ceaa25..75038ad2b242 100644
--- a/drivers/serial/sc26xx.c
+++ b/drivers/serial/sc26xx.c
@@ -140,8 +140,8 @@ static struct tty_struct *receive_chars(struct uart_port *port)
 	char flag;
 	u8 status;
 
-	if (port->info != NULL)		/* Unopened serial console */
-		tty = port->info->port.tty;
+	if (port->state != NULL)		/* Unopened serial console */
+		tty = port->state->port.tty;
 
 	while (limit-- > 0) {
 		status = READ_SC_PORT(port, SR);
@@ -190,10 +190,10 @@ static void transmit_chars(struct uart_port *port)
 {
 	struct circ_buf *xmit;
 
-	if (!port->info)
+	if (!port->state)
 		return;
 
-	xmit = &port->info->xmit;
+	xmit = &port->state->xmit;
 	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
 		sc26xx_disable_irq(port, IMR_TXRDY);
 		return;
@@ -316,7 +316,7 @@ static void sc26xx_stop_tx(struct uart_port *port)
 /* port->lock held by caller.  */
 static void sc26xx_start_tx(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	while (!uart_circ_empty(xmit)) {
 		if (!(READ_SC_PORT(port, SR) & SR_TXRDY)) {
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index b0bb29d804ae..ea53b6f224b0 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -52,7 +52,7 @@ static struct lock_class_key port_lock_key;
 
 #define HIGH_BITS_OFFSET	((sizeof(long)-sizeof(int))*8)
 
-#define uart_users(state)	((state)->count + (state)->info.port.blocked_open)
+#define uart_users(state)	((state)->count + (state)->port.blocked_open)
 
 #ifdef CONFIG_SERIAL_CORE_CONSOLE
 #define uart_console(port)	((port)->cons && (port)->cons->index == (port)->line)
@@ -71,19 +71,19 @@ static void uart_change_pm(struct uart_state *state, int pm_state);
  */
 void uart_write_wakeup(struct uart_port *port)
 {
-	struct uart_info *info = port->info;
+	struct uart_state *state = port->state;
 	/*
 	 * This means you called this function _after_ the port was
 	 * closed.  No cookie for you.
 	 */
-	BUG_ON(!info);
-	tasklet_schedule(&info->tlet);
+	BUG_ON(!state);
+	tasklet_schedule(&state->tlet);
 }
 
 static void uart_stop(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned long flags;
 
 	spin_lock_irqsave(&port->lock, flags);
@@ -94,9 +94,9 @@ static void uart_stop(struct tty_struct *tty)
 static void __uart_start(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 
-	if (!uart_circ_empty(&state->info.xmit) && state->info.xmit.buf &&
+	if (!uart_circ_empty(&state->xmit) && state->xmit.buf &&
 	    !tty->stopped && !tty->hw_stopped)
 		port->ops->start_tx(port);
 }
@@ -104,7 +104,7 @@ static void __uart_start(struct tty_struct *tty)
 static void uart_start(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned long flags;
 
 	spin_lock_irqsave(&port->lock, flags);
@@ -115,7 +115,7 @@ static void uart_start(struct tty_struct *tty)
 static void uart_tasklet_action(unsigned long data)
 {
 	struct uart_state *state = (struct uart_state *)data;
-	tty_wakeup(state->info.port.tty);
+	tty_wakeup(state->port.tty);
 }
 
 static inline void
@@ -141,12 +141,11 @@ uart_update_mctrl(struct uart_port *port, unsigned int set, unsigned int clear)
  */
 static int uart_startup(struct uart_state *state, int init_hw)
 {
-	struct uart_info *info = &state->info;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned long page;
 	int retval = 0;
 
-	if (info->flags & UIF_INITIALIZED)
+	if (state->flags & UIF_INITIALIZED)
 		return 0;
 
 	/*
@@ -154,7 +153,7 @@ static int uart_startup(struct uart_state *state, int init_hw)
 	 * once we have successfully opened the port.  Also set
 	 * up the tty->alt_speed kludge
 	 */
-	set_bit(TTY_IO_ERROR, &info->port.tty->flags);
+	set_bit(TTY_IO_ERROR, &state->port.tty->flags);
 
 	if (port->type == PORT_UNKNOWN)
 		return 0;
@@ -163,14 +162,14 @@ static int uart_startup(struct uart_state *state, int init_hw)
 	 * Initialise and allocate the transmit and temporary
 	 * buffer.
 	 */
-	if (!info->xmit.buf) {
+	if (!state->xmit.buf) {
 		/* This is protected by the per port mutex */
 		page = get_zeroed_page(GFP_KERNEL);
 		if (!page)
 			return -ENOMEM;
 
-		info->xmit.buf = (unsigned char *) page;
-		uart_circ_clear(&info->xmit);
+		state->xmit.buf = (unsigned char *) page;
+		uart_circ_clear(&state->xmit);
 	}
 
 	retval = port->ops->startup(port);
@@ -185,20 +184,20 @@ static int uart_startup(struct uart_state *state, int init_hw)
 			 * Setup the RTS and DTR signals once the
 			 * port is open and ready to respond.
 			 */
-			if (info->port.tty->termios->c_cflag & CBAUD)
+			if (state->port.tty->termios->c_cflag & CBAUD)
 				uart_set_mctrl(port, TIOCM_RTS | TIOCM_DTR);
 		}
 
-		if (info->flags & UIF_CTS_FLOW) {
+		if (state->flags & UIF_CTS_FLOW) {
 			spin_lock_irq(&port->lock);
 			if (!(port->ops->get_mctrl(port) & TIOCM_CTS))
-				info->port.tty->hw_stopped = 1;
+				state->port.tty->hw_stopped = 1;
 			spin_unlock_irq(&port->lock);
 		}
 
-		info->flags |= UIF_INITIALIZED;
+		state->flags |= UIF_INITIALIZED;
 
-		clear_bit(TTY_IO_ERROR, &info->port.tty->flags);
+		clear_bit(TTY_IO_ERROR, &state->port.tty->flags);
 	}
 
 	if (retval && capable(CAP_SYS_ADMIN))
@@ -214,9 +213,8 @@ static int uart_startup(struct uart_state *state, int init_hw)
  */
 static void uart_shutdown(struct uart_state *state)
 {
-	struct uart_info *info = &state->info;
-	struct uart_port *port = state->port;
-	struct tty_struct *tty = info->port.tty;
+	struct uart_port *port = state->uart_port;
+	struct tty_struct *tty = state->port.tty;
 
 	/*
 	 * Set the TTY IO error marker
@@ -224,8 +222,8 @@ static void uart_shutdown(struct uart_state *state)
 	if (tty)
 		set_bit(TTY_IO_ERROR, &tty->flags);
 
-	if (info->flags & UIF_INITIALIZED) {
-		info->flags &= ~UIF_INITIALIZED;
+	if (state->flags & UIF_INITIALIZED) {
+		state->flags &= ~UIF_INITIALIZED;
 
 		/*
 		 * Turn off DTR and RTS early.
@@ -240,7 +238,7 @@ static void uart_shutdown(struct uart_state *state)
 		 * any outstanding file descriptors should be pointing at
 		 * hung_up_tty_fops now.
 		 */
-		wake_up_interruptible(&info->delta_msr_wait);
+		wake_up_interruptible(&state->delta_msr_wait);
 
 		/*
 		 * Free the IRQ and disable the port.
@@ -256,14 +254,14 @@ static void uart_shutdown(struct uart_state *state)
 	/*
 	 * kill off our tasklet
 	 */
-	tasklet_kill(&info->tlet);
+	tasklet_kill(&state->tlet);
 
 	/*
 	 * Free the transmit buffer page.
 	 */
-	if (info->xmit.buf) {
-		free_page((unsigned long)info->xmit.buf);
-		info->xmit.buf = NULL;
+	if (state->xmit.buf) {
+		free_page((unsigned long)state->xmit.buf);
+		state->xmit.buf = NULL;
 	}
 }
 
@@ -430,8 +428,8 @@ EXPORT_SYMBOL(uart_get_divisor);
 static void
 uart_change_speed(struct uart_state *state, struct ktermios *old_termios)
 {
-	struct tty_struct *tty = state->info.port.tty;
-	struct uart_port *port = state->port;
+	struct tty_struct *tty = state->port.tty;
+	struct uart_port *port = state->uart_port;
 	struct ktermios *termios;
 
 	/*
@@ -447,14 +445,14 @@ uart_change_speed(struct uart_state *state, struct ktermios *old_termios)
 	 * Set flags based on termios cflag
 	 */
 	if (termios->c_cflag & CRTSCTS)
-		state->info.flags |= UIF_CTS_FLOW;
+		state->flags |= UIF_CTS_FLOW;
 	else
-		state->info.flags &= ~UIF_CTS_FLOW;
+		state->flags &= ~UIF_CTS_FLOW;
 
 	if (termios->c_cflag & CLOCAL)
-		state->info.flags &= ~UIF_CHECK_CD;
+		state->flags &= ~UIF_CHECK_CD;
 	else
-		state->info.flags |= UIF_CHECK_CD;
+		state->flags |= UIF_CHECK_CD;
 
 	port->ops->set_termios(port, termios, old_termios);
 }
@@ -482,7 +480,7 @@ static int uart_put_char(struct tty_struct *tty, unsigned char ch)
 {
 	struct uart_state *state = tty->driver_data;
 
-	return __uart_put_char(state->port, &state->info.xmit, ch);
+	return __uart_put_char(state->uart_port, &state->xmit, ch);
 }
 
 static void uart_flush_chars(struct tty_struct *tty)
@@ -508,8 +506,8 @@ uart_write(struct tty_struct *tty, const unsigned char *buf, int count)
 		return -EL3HLT;
 	}
 
-	port = state->port;
-	circ = &state->info.xmit;
+	port = state->uart_port;
+	circ = &state->xmit;
 
 	if (!circ->buf)
 		return 0;
@@ -539,9 +537,9 @@ static int uart_write_room(struct tty_struct *tty)
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&state->port->lock, flags);
-	ret = uart_circ_chars_free(&state->info.xmit);
-	spin_unlock_irqrestore(&state->port->lock, flags);
+	spin_lock_irqsave(&state->uart_port->lock, flags);
+	ret = uart_circ_chars_free(&state->xmit);
+	spin_unlock_irqrestore(&state->uart_port->lock, flags);
 	return ret;
 }
 
@@ -551,9 +549,9 @@ static int uart_chars_in_buffer(struct tty_struct *tty)
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&state->port->lock, flags);
-	ret = uart_circ_chars_pending(&state->info.xmit);
-	spin_unlock_irqrestore(&state->port->lock, flags);
+	spin_lock_irqsave(&state->uart_port->lock, flags);
+	ret = uart_circ_chars_pending(&state->xmit);
+	spin_unlock_irqrestore(&state->uart_port->lock, flags);
 	return ret;
 }
 
@@ -572,11 +570,11 @@ static void uart_flush_buffer(struct tty_struct *tty)
 		return;
 	}
 
-	port = state->port;
+	port = state->uart_port;
 	pr_debug("uart_flush_buffer(%d) called\n", tty->index);
 
 	spin_lock_irqsave(&port->lock, flags);
-	uart_circ_clear(&state->info.xmit);
+	uart_circ_clear(&state->xmit);
 	if (port->ops->flush_buffer)
 		port->ops->flush_buffer(port);
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -590,7 +588,7 @@ static void uart_flush_buffer(struct tty_struct *tty)
 static void uart_send_xchar(struct tty_struct *tty, char ch)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned long flags;
 
 	if (port->ops->send_xchar)
@@ -613,13 +611,13 @@ static void uart_throttle(struct tty_struct *tty)
 		uart_send_xchar(tty, STOP_CHAR(tty));
 
 	if (tty->termios->c_cflag & CRTSCTS)
-		uart_clear_mctrl(state->port, TIOCM_RTS);
+		uart_clear_mctrl(state->uart_port, TIOCM_RTS);
 }
 
 static void uart_unthrottle(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 
 	if (I_IXOFF(tty)) {
 		if (port->x_char)
@@ -635,7 +633,7 @@ static void uart_unthrottle(struct tty_struct *tty)
 static int uart_get_info(struct uart_state *state,
 			 struct serial_struct __user *retinfo)
 {
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	struct serial_struct tmp;
 
 	memset(&tmp, 0, sizeof(tmp));
@@ -674,7 +672,7 @@ static int uart_set_info(struct uart_state *state,
 			 struct serial_struct __user *newinfo)
 {
 	struct serial_struct new_serial;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned long new_port;
 	unsigned int change_irq, change_port, closing_wait;
 	unsigned int old_custom_divisor, close_delay;
@@ -840,15 +838,15 @@ static int uart_set_info(struct uart_state *state,
 	state->closing_wait    = closing_wait;
 	if (new_serial.xmit_fifo_size)
 		port->fifosize = new_serial.xmit_fifo_size;
-	if (state->info.port.tty)
-		state->info.port.tty->low_latency =
+	if (state->port.tty)
+		state->port.tty->low_latency =
 			(port->flags & UPF_LOW_LATENCY) ? 1 : 0;
 
  check_and_exit:
 	retval = 0;
 	if (port->type == PORT_UNKNOWN)
 		goto exit;
-	if (state->info.flags & UIF_INITIALIZED) {
+	if (state->flags & UIF_INITIALIZED) {
 		if (((old_flags ^ port->flags) & UPF_SPD_MASK) ||
 		    old_custom_divisor != port->custom_divisor) {
 			/*
@@ -861,7 +859,7 @@ static int uart_set_info(struct uart_state *state,
 				printk(KERN_NOTICE
 				       "%s sets custom speed on %s. This "
 				       "is deprecated.\n", current->comm,
-				       tty_name(state->info.port.tty, buf));
+				       tty_name(state->port.tty, buf));
 			}
 			uart_change_speed(state, NULL);
 		}
@@ -880,7 +878,7 @@ static int uart_set_info(struct uart_state *state,
 static int uart_get_lsr_info(struct uart_state *state,
 			     unsigned int __user *value)
 {
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned int result;
 
 	result = port->ops->tx_empty(port);
@@ -892,8 +890,8 @@ static int uart_get_lsr_info(struct uart_state *state,
 	 * interrupt happens).
 	 */
 	if (port->x_char ||
-	    ((uart_circ_chars_pending(&state->info.xmit) > 0) &&
-	     !state->info.port.tty->stopped && !state->info.port.tty->hw_stopped))
+	    ((uart_circ_chars_pending(&state->xmit) > 0) &&
+	     !state->port.tty->stopped && !state->port.tty->hw_stopped))
 		result &= ~TIOCSER_TEMT;
 
 	return put_user(result, value);
@@ -902,7 +900,7 @@ static int uart_get_lsr_info(struct uart_state *state,
 static int uart_tiocmget(struct tty_struct *tty, struct file *file)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	int result = -EIO;
 
 	mutex_lock(&state->mutex);
@@ -924,7 +922,7 @@ uart_tiocmset(struct tty_struct *tty, struct file *file,
 	      unsigned int set, unsigned int clear)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	int ret = -EIO;
 
 	mutex_lock(&state->mutex);
@@ -940,7 +938,7 @@ uart_tiocmset(struct tty_struct *tty, struct file *file,
 static int uart_break_ctl(struct tty_struct *tty, int break_state)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 
 	mutex_lock(&state->mutex);
 
@@ -953,7 +951,7 @@ static int uart_break_ctl(struct tty_struct *tty, int break_state)
 
 static int uart_do_autoconfig(struct uart_state *state)
 {
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	int flags, ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1003,7 +1001,7 @@ static int uart_do_autoconfig(struct uart_state *state)
 static int
 uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 {
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	DECLARE_WAITQUEUE(wait, current);
 	struct uart_icount cprev, cnow;
 	int ret;
@@ -1020,7 +1018,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	port->ops->enable_ms(port);
 	spin_unlock_irq(&port->lock);
 
-	add_wait_queue(&state->info.delta_msr_wait, &wait);
+	add_wait_queue(&state->delta_msr_wait, &wait);
 	for (;;) {
 		spin_lock_irq(&port->lock);
 		memcpy(&cnow, &port->icount, sizeof(struct uart_icount));
@@ -1048,7 +1046,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(&state->info.delta_msr_wait, &wait);
+	remove_wait_queue(&state->delta_msr_wait, &wait);
 
 	return ret;
 }
@@ -1064,7 +1062,7 @@ static int uart_get_count(struct uart_state *state,
 {
 	struct serial_icounter_struct icount;
 	struct uart_icount cnow;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 
 	spin_lock_irq(&port->lock);
 	memcpy(&cnow, &port->icount, sizeof(struct uart_icount));
@@ -1160,7 +1158,7 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 		break;
 
 	default: {
-		struct uart_port *port = state->port;
+		struct uart_port *port = state->uart_port;
 		if (port->ops->ioctl)
 			ret = port->ops->ioctl(port, cmd, arg);
 		break;
@@ -1175,7 +1173,7 @@ out:
 static void uart_set_ldisc(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 
 	if (port->ops->set_ldisc)
 		port->ops->set_ldisc(port);
@@ -1207,7 +1205,7 @@ static void uart_set_termios(struct tty_struct *tty,
 
 	/* Handle transition to B0 status */
 	if ((old_termios->c_cflag & CBAUD) && !(cflag & CBAUD))
-		uart_clear_mctrl(state->port, TIOCM_RTS | TIOCM_DTR);
+		uart_clear_mctrl(state->uart_port, TIOCM_RTS | TIOCM_DTR);
 
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
@@ -1215,25 +1213,25 @@ static void uart_set_termios(struct tty_struct *tty,
 		if (!(cflag & CRTSCTS) ||
 		    !test_bit(TTY_THROTTLED, &tty->flags))
 			mask |= TIOCM_RTS;
-		uart_set_mctrl(state->port, mask);
+		uart_set_mctrl(state->uart_port, mask);
 	}
 
 	/* Handle turning off CRTSCTS */
 	if ((old_termios->c_cflag & CRTSCTS) && !(cflag & CRTSCTS)) {
-		spin_lock_irqsave(&state->port->lock, flags);
+		spin_lock_irqsave(&state->uart_port->lock, flags);
 		tty->hw_stopped = 0;
 		__uart_start(tty);
-		spin_unlock_irqrestore(&state->port->lock, flags);
+		spin_unlock_irqrestore(&state->uart_port->lock, flags);
 	}
 
 	/* Handle turning on CRTSCTS */
 	if (!(old_termios->c_cflag & CRTSCTS) && (cflag & CRTSCTS)) {
-		spin_lock_irqsave(&state->port->lock, flags);
-		if (!(state->port->ops->get_mctrl(state->port) & TIOCM_CTS)) {
+		spin_lock_irqsave(&state->uart_port->lock, flags);
+		if (!(state->uart_port->ops->get_mctrl(state->uart_port) & TIOCM_CTS)) {
 			tty->hw_stopped = 1;
-			state->port->ops->stop_tx(state->port);
+			state->uart_port->ops->stop_tx(state->uart_port);
 		}
-		spin_unlock_irqrestore(&state->port->lock, flags);
+		spin_unlock_irqrestore(&state->uart_port->lock, flags);
 	}
 #if 0
 	/*
@@ -1244,7 +1242,7 @@ static void uart_set_termios(struct tty_struct *tty,
 	 */
 	if (!(old_termios->c_cflag & CLOCAL) &&
 	    (tty->termios->c_cflag & CLOCAL))
-		wake_up_interruptible(&info->port.open_wait);
+		wake_up_interruptible(&state->uart_port.open_wait);
 #endif
 }
 
@@ -1260,10 +1258,10 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 
 	BUG_ON(!kernel_locked());
 
-	if (!state || !state->port)
+	if (!state || !state->uart_port)
 		return;
 
-	port = state->port;
+	port = state->uart_port;
 
 	pr_debug("uart_close(%d) called\n", port->line);
 
@@ -1306,7 +1304,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	 * At this point, we stop accepting input.  To do this, we
 	 * disable the receive line status interrupts.
 	 */
-	if (state->info.flags & UIF_INITIALIZED) {
+	if (state->flags & UIF_INITIALIZED) {
 		unsigned long flags;
 		spin_lock_irqsave(&port->lock, flags);
 		port->ops->stop_rx(port);
@@ -1325,9 +1323,9 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	tty_ldisc_flush(tty);
 
 	tty->closing = 0;
-	state->info.port.tty = NULL;
+	state->port.tty = NULL;
 
-	if (state->info.port.blocked_open) {
+	if (state->port.blocked_open) {
 		if (state->close_delay)
 			msleep_interruptible(state->close_delay);
 	} else if (!uart_console(port)) {
@@ -1337,8 +1335,8 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Wake up anyone trying to open this port.
 	 */
-	state->info.flags &= ~UIF_NORMAL_ACTIVE;
-	wake_up_interruptible(&state->info.port.open_wait);
+	state->flags &= ~UIF_NORMAL_ACTIVE;
+	wake_up_interruptible(&state->port.open_wait);
 
  done:
 	mutex_unlock(&state->mutex);
@@ -1347,7 +1345,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned long char_time, expire;
 
 	if (port->type == PORT_UNKNOWN || port->fifosize == 0)
@@ -1412,20 +1410,19 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 static void uart_hangup(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
-	struct uart_info *info = &state->info;
 
 	BUG_ON(!kernel_locked());
-	pr_debug("uart_hangup(%d)\n", state->port->line);
+	pr_debug("uart_hangup(%d)\n", state->uart_port->line);
 
 	mutex_lock(&state->mutex);
-	if (info->flags & UIF_NORMAL_ACTIVE) {
+	if (state->flags & UIF_NORMAL_ACTIVE) {
 		uart_flush_buffer(tty);
 		uart_shutdown(state);
 		state->count = 0;
-		info->flags &= ~UIF_NORMAL_ACTIVE;
-		info->port.tty = NULL;
-		wake_up_interruptible(&info->port.open_wait);
-		wake_up_interruptible(&info->delta_msr_wait);
+		state->flags &= ~UIF_NORMAL_ACTIVE;
+		state->port.tty = NULL;
+		wake_up_interruptible(&state->port.open_wait);
+		wake_up_interruptible(&state->delta_msr_wait);
 	}
 	mutex_unlock(&state->mutex);
 }
@@ -1438,8 +1435,8 @@ static void uart_hangup(struct tty_struct *tty)
  */
 static void uart_update_termios(struct uart_state *state)
 {
-	struct tty_struct *tty = state->info.port.tty;
-	struct uart_port *port = state->port;
+	struct tty_struct *tty = state->port.tty;
+	struct uart_port *port = state->uart_port;
 
 	if (uart_console(port) && port->cons->cflag) {
 		tty->termios->c_cflag = port->cons->cflag;
@@ -1473,27 +1470,26 @@ static int
 uart_block_til_ready(struct file *filp, struct uart_state *state)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	struct uart_info *info = &state->info;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	unsigned int mctrl;
 
-	info->port.blocked_open++;
+	state->port.blocked_open++;
 	state->count--;
 
-	add_wait_queue(&info->port.open_wait, &wait);
+	add_wait_queue(&state->port.open_wait, &wait);
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		/*
 		 * If we have been hung up, tell userspace/restart open.
 		 */
-		if (tty_hung_up_p(filp) || info->port.tty == NULL)
+		if (tty_hung_up_p(filp) || state->port.tty == NULL)
 			break;
 
 		/*
 		 * If the port has been closed, tell userspace/restart open.
 		 */
-		if (!(info->flags & UIF_INITIALIZED))
+		if (!(state->flags & UIF_INITIALIZED))
 			break;
 
 		/*
@@ -1506,8 +1502,8 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 		 * have set TTY_IO_ERROR for a non-existant port.
 		 */
 		if ((filp->f_flags & O_NONBLOCK) ||
-		    (info->port.tty->termios->c_cflag & CLOCAL) ||
-		    (info->port.tty->flags & (1 << TTY_IO_ERROR)))
+		    (state->port.tty->termios->c_cflag & CLOCAL) ||
+		    (state->port.tty->flags & (1 << TTY_IO_ERROR)))
 			break;
 
 		/*
@@ -1515,7 +1511,7 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 		 * not set RTS here - we want to make sure we catch
 		 * the data from the modem.
 		 */
-		if (info->port.tty->termios->c_cflag & CBAUD)
+		if (state->port.tty->termios->c_cflag & CBAUD)
 			uart_set_mctrl(port, TIOCM_DTR);
 
 		/*
@@ -1537,15 +1533,15 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 			break;
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&info->port.open_wait, &wait);
+	remove_wait_queue(&state->port.open_wait, &wait);
 
 	state->count++;
-	info->port.blocked_open--;
+	state->port.blocked_open--;
 
 	if (signal_pending(current))
 		return -ERESTARTSYS;
 
-	if (!info->port.tty || tty_hung_up_p(filp))
+	if (!state->port.tty || tty_hung_up_p(filp))
 		return -EAGAIN;
 
 	return 0;
@@ -1563,7 +1559,7 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 	}
 
 	state->count++;
-	if (!state->port || state->port->flags & UPF_DEAD) {
+	if (!state->uart_port || state->uart_port->flags & UPF_DEAD) {
 		ret = -ENXIO;
 		goto err_unlock;
 	}
@@ -1606,10 +1602,11 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 
 	/*
 	 * We take the semaphore inside uart_get to guarantee that we won't
-	 * be re-entered while allocating the info structure, or while we
+	 * be re-entered while allocating the state structure, or while we
 	 * request any IRQs that the driver may need.  This also has the nice
 	 * side-effect that it delays the action of uart_hangup, so we can
-	 * guarantee that info->port.tty will always contain something reasonable.
+	 * guarantee that state->port.tty will always contain something
+	 * reasonable.
 	 */
 	state = uart_get(drv, line);
 	if (IS_ERR(state)) {
@@ -1623,10 +1620,10 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	 * Any failures from here onwards should not touch the count.
 	 */
 	tty->driver_data = state;
-	state->port->info = &state->info;
-	tty->low_latency = (state->port->flags & UPF_LOW_LATENCY) ? 1 : 0;
+	state->uart_port->state = state;
+	tty->low_latency = (state->uart_port->flags & UPF_LOW_LATENCY) ? 1 : 0;
 	tty->alt_speed = 0;
-	state->info.port.tty = tty;
+	state->port.tty = tty;
 
 	/*
 	 * If the port is in the middle of closing, bail out now.
@@ -1659,8 +1656,8 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * If this is the first open to succeed, adjust things to suit.
 	 */
-	if (retval == 0 && !(state->info.flags & UIF_NORMAL_ACTIVE)) {
-		state->info.flags |= UIF_NORMAL_ACTIVE;
+	if (retval == 0 && !(state->flags & UIF_NORMAL_ACTIVE)) {
+		state->flags |= UIF_NORMAL_ACTIVE;
 
 		uart_update_termios(state);
 	}
@@ -1688,7 +1685,7 @@ static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i)
 {
 	struct uart_state *state = drv->state + i;
 	int pm_state;
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 	char stat_buf[32];
 	unsigned int status;
 	int mmio;
@@ -1958,7 +1955,7 @@ EXPORT_SYMBOL_GPL(uart_set_options);
 
 static void uart_change_pm(struct uart_state *state, int pm_state)
 {
-	struct uart_port *port = state->port;
+	struct uart_port *port = state->uart_port;
 
 	if (state->pm_state != pm_state) {
 		if (port->ops->pm)
@@ -2005,11 +2002,11 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 	}
 	port->suspended = 1;
 
-	if (state->info.flags & UIF_INITIALIZED) {
+	if (state->flags & UIF_INITIALIZED) {
 		const struct uart_ops *ops = port->ops;
 		int tries;
 
-		state->info.flags = (state->info.flags & ~UIF_INITIALIZED)
+		state->flags = (state->flags & ~UIF_INITIALIZED)
 				     | UIF_SUSPENDED;
 
 		spin_lock_irq(&port->lock);
@@ -2084,15 +2081,15 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 		/*
 		 * If that's unset, use the tty termios setting.
 		 */
-		if (state->info.port.tty && termios.c_cflag == 0)
-			termios = *state->info.port.tty->termios;
+		if (state->port.tty && termios.c_cflag == 0)
+			termios = *state->port.tty->termios;
 
 		uart_change_pm(state, 0);
 		port->ops->set_termios(port, &termios, NULL);
 		console_start(port->cons);
 	}
 
-	if (state->info.flags & UIF_SUSPENDED) {
+	if (state->flags & UIF_SUSPENDED) {
 		const struct uart_ops *ops = port->ops;
 		int ret;
 
@@ -2107,7 +2104,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 			ops->set_mctrl(port, port->mctrl);
 			ops->start_tx(port);
 			spin_unlock_irq(&port->lock);
-			state->info.flags |= UIF_INITIALIZED;
+			state->flags |= UIF_INITIALIZED;
 		} else {
 			/*
 			 * Failed to resume - maybe hardware went away?
@@ -2117,7 +2114,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 			uart_shutdown(state);
 		}
 
-		state->info.flags &= ~UIF_SUSPENDED;
+		state->flags &= ~UIF_SUSPENDED;
 	}
 
 	mutex_unlock(&state->mutex);
@@ -2232,10 +2229,10 @@ static int uart_poll_init(struct tty_driver *driver, int line, char *options)
 	int parity = 'n';
 	int flow = 'n';
 
-	if (!state || !state->port)
+	if (!state || !state->uart_port)
 		return -1;
 
-	port = state->port;
+	port = state->uart_port;
 	if (!(port->ops->poll_get_char && port->ops->poll_put_char))
 		return -1;
 
@@ -2253,10 +2250,10 @@ static int uart_poll_get_char(struct tty_driver *driver, int line)
 	struct uart_state *state = drv->state + line;
 	struct uart_port *port;
 
-	if (!state || !state->port)
+	if (!state || !state->uart_port)
 		return -1;
 
-	port = state->port;
+	port = state->uart_port;
 	return port->ops->poll_get_char(port);
 }
 
@@ -2266,10 +2263,10 @@ static void uart_poll_put_char(struct tty_driver *driver, int line, char ch)
 	struct uart_state *state = drv->state + line;
 	struct uart_port *port;
 
-	if (!state || !state->port)
+	if (!state || !state->uart_port)
 		return;
 
-	port = state->port;
+	port = state->uart_port;
 	port->ops->poll_put_char(port, ch);
 }
 #endif
@@ -2365,9 +2362,9 @@ int uart_register_driver(struct uart_driver *drv)
 		state->closing_wait    = 30000;	/* 30 seconds */
 		mutex_init(&state->mutex);
 
-		tty_port_init(&state->info.port);
-		init_waitqueue_head(&state->info.delta_msr_wait);
-		tasklet_init(&state->info.tlet, uart_tasklet_action,
+		tty_port_init(&state->port);
+		init_waitqueue_head(&state->delta_msr_wait);
+		tasklet_init(&state->tlet, uart_tasklet_action,
 			     (unsigned long)state);
 	}
 
@@ -2430,16 +2427,16 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *port)
 
 	mutex_lock(&port_mutex);
 	mutex_lock(&state->mutex);
-	if (state->port) {
+	if (state->uart_port) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	state->port = port;
+	state->uart_port = port;
 	state->pm_state = -1;
 
 	port->cons = drv->cons;
-	port->info = &state->info;
+	port->state = state;
 
 	/*
 	 * If this port is a console, then the spinlock is already
@@ -2488,13 +2485,12 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *port)
 int uart_remove_one_port(struct uart_driver *drv, struct uart_port *port)
 {
 	struct uart_state *state = drv->state + port->line;
-	struct uart_info *info;
 
 	BUG_ON(in_interrupt());
 
-	if (state->port != port)
+	if (state->uart_port != port)
 		printk(KERN_ALERT "Removing wrong port: %p != %p\n",
-			state->port, port);
+			state->uart_port, port);
 
 	mutex_lock(&port_mutex);
 
@@ -2511,9 +2507,8 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *port)
 	 */
 	tty_unregister_device(drv->tty_driver, port->line);
 
-	info = &state->info;
-	if (info && info->port.tty)
-		tty_vhangup(info->port.tty);
+	if (state->port.tty)
+		tty_vhangup(state->port.tty);
 
 	/*
 	 * Free the port IO and memory resources, if any.
@@ -2529,10 +2524,9 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *port)
 	/*
 	 * Kill the tasklet, and free resources.
 	 */
-	if (info)
-		tasklet_kill(&info->tlet);
+	tasklet_kill(&state->tlet);
 
-	state->port = NULL;
+	state->uart_port = NULL;
 	mutex_unlock(&port_mutex);
 
 	return 0;
diff --git a/drivers/serial/serial_ks8695.c b/drivers/serial/serial_ks8695.c
index 52db5cc3f900..4560b2e70685 100644
--- a/drivers/serial/serial_ks8695.c
+++ b/drivers/serial/serial_ks8695.c
@@ -154,7 +154,7 @@ static void ks8695uart_disable_ms(struct uart_port *port)
 static irqreturn_t ks8695uart_rx_chars(int irq, void *dev_id)
 {
 	struct uart_port *port = dev_id;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	unsigned int status, ch, lsr, flg, max_count = 256;
 
 	status = UART_GET_LSR(port);		/* clears pending LSR interrupts */
@@ -210,7 +210,7 @@ ignore_char:
 static irqreturn_t ks8695uart_tx_chars(int irq, void *dev_id)
 {
 	struct uart_port *port = dev_id;
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 	unsigned int count;
 
 	if (port->x_char) {
@@ -266,7 +266,7 @@ static irqreturn_t ks8695uart_modem_status(int irq, void *dev_id)
 	if (status & URMS_URTERI)
 		port->icount.rng++;
 
-	wake_up_interruptible(&port->info->delta_msr_wait);
+	wake_up_interruptible(&port->state->delta_msr_wait);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/serial/serial_lh7a40x.c b/drivers/serial/serial_lh7a40x.c
index a7bf024a8286..057fc5e8cc8d 100644
--- a/drivers/serial/serial_lh7a40x.c
+++ b/drivers/serial/serial_lh7a40x.c
@@ -138,7 +138,7 @@ static void lh7a40xuart_enable_ms (struct uart_port* port)
 
 static void lh7a40xuart_rx_chars (struct uart_port* port)
 {
-	struct tty_struct* tty = port->info->port.tty;
+	struct tty_struct* tty = port->state->port.tty;
 	int cbRxMax = 256;	/* (Gross) limit on receive */
 	unsigned int data;	/* Received data and status */
 	unsigned int flag;
@@ -184,7 +184,7 @@ static void lh7a40xuart_rx_chars (struct uart_port* port)
 
 static void lh7a40xuart_tx_chars (struct uart_port* port)
 {
-	struct circ_buf* xmit = &port->info->xmit;
+	struct circ_buf* xmit = &port->state->xmit;
 	int cbTxMax = port->fifosize;
 
 	if (port->x_char) {
@@ -241,7 +241,7 @@ static void lh7a40xuart_modem_status (struct uart_port* port)
 	if (delta & CTS)
 		uart_handle_cts_change (port, status & CTS);
 
-	wake_up_interruptible (&port->info->delta_msr_wait);
+	wake_up_interruptible (&port->state->delta_msr_wait);
 }
 
 static irqreturn_t lh7a40xuart_int (int irq, void* dev_id)
diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c
index 54dd16d66a4b..0f7cf4c453e6 100644
--- a/drivers/serial/serial_txx9.c
+++ b/drivers/serial/serial_txx9.c
@@ -272,7 +272,7 @@ static void serial_txx9_initialize(struct uart_port *port)
 static inline void
 receive_chars(struct uart_txx9_port *up, unsigned int *status)
 {
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	unsigned char ch;
 	unsigned int disr = *status;
 	int max_count = 256;
@@ -348,7 +348,7 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status)
 
 static inline void transmit_chars(struct uart_txx9_port *up)
 {
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 	int count;
 
 	if (up->port.x_char) {
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 32dc2fc50e6b..85119fb7cb50 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -361,7 +361,7 @@ static inline int sci_rxroom(struct uart_port *port)
 
 static void sci_transmit_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 	unsigned int stopped = uart_tx_stopped(port);
 	unsigned short status;
 	unsigned short ctrl;
@@ -426,7 +426,7 @@ static void sci_transmit_chars(struct uart_port *port)
 static inline void sci_receive_chars(struct uart_port *port)
 {
 	struct sci_port *sci_port = to_sci_port(port);
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	int i, count, copied = 0;
 	unsigned short status;
 	unsigned char flag;
@@ -546,7 +546,7 @@ static inline int sci_handle_errors(struct uart_port *port)
 {
 	int copied = 0;
 	unsigned short status = sci_in(port, SCxSR);
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 
 	if (status & SCxSR_ORER(port)) {
 		/* overrun error */
@@ -600,7 +600,7 @@ static inline int sci_handle_errors(struct uart_port *port)
 
 static inline int sci_handle_fifo_overrun(struct uart_port *port)
 {
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	int copied = 0;
 
 	if (port->type != PORT_SCIF)
@@ -623,7 +623,7 @@ static inline int sci_handle_breaks(struct uart_port *port)
 {
 	int copied = 0;
 	unsigned short status = sci_in(port, SCxSR);
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	struct sci_port *s = to_sci_port(port);
 
 	if (uart_handle_break(port))
diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c
index d5276c012f78..9794e0cd3dcc 100644
--- a/drivers/serial/sn_console.c
+++ b/drivers/serial/sn_console.c
@@ -469,9 +469,9 @@ sn_receive_chars(struct sn_cons_port *port, unsigned long flags)
 		return;
 	}
 
-	if (port->sc_port.info) {
+	if (port->sc_port.state) {
 		/* The serial_core stuffs are initilized, use them */
-		tty = port->sc_port.info->port.tty;
+		tty = port->sc_port.state->port.tty;
 	}
 	else {
 		/* Not registered yet - can't pass to tty layer.  */
@@ -550,9 +550,9 @@ static void sn_transmit_chars(struct sn_cons_port *port, int raw)
 
 	BUG_ON(!port->sc_is_asynch);
 
-	if (port->sc_port.info) {
+	if (port->sc_port.state) {
 		/* We're initilized, using serial core infrastructure */
-		xmit = &port->sc_port.info->xmit;
+		xmit = &port->sc_port.state->xmit;
 	} else {
 		/* Probably sn_sal_switch_to_asynch has been run but serial core isn't
 		 * initilized yet.  Just return.  Writes are going through
@@ -927,7 +927,7 @@ sn_sal_console_write(struct console *co, const char *s, unsigned count)
 	/* We can't look at the xmit buffer if we're not registered with serial core
 	 *  yet.  So only do the fancy recovery after registering
 	 */
-	if (!port->sc_port.info) {
+	if (!port->sc_port.state) {
 		/* Not yet registered with serial core - simple case */
 		puts_raw_fixed(port->sc_ops->sal_puts_raw, s, count);
 		return;
@@ -936,8 +936,8 @@ sn_sal_console_write(struct console *co, const char *s, unsigned count)
 	/* somebody really wants this output, might be an
 	 * oops, kdb, panic, etc.  make sure they get it. */
 	if (spin_is_locked(&port->sc_port.lock)) {
-		int lhead = port->sc_port.info->xmit.head;
-		int ltail = port->sc_port.info->xmit.tail;
+		int lhead = port->sc_port.state->xmit.head;
+		int ltail = port->sc_port.state->xmit.tail;
 		int counter, got_lock = 0;
 
 		/*
@@ -962,13 +962,13 @@ sn_sal_console_write(struct console *co, const char *s, unsigned count)
 				break;
 			} else {
 				/* still locked */
-				if ((lhead != port->sc_port.info->xmit.head)
+				if ((lhead != port->sc_port.state->xmit.head)
 				    || (ltail !=
-					port->sc_port.info->xmit.tail)) {
+					port->sc_port.state->xmit.tail)) {
 					lhead =
-						port->sc_port.info->xmit.head;
+						port->sc_port.state->xmit.head;
 					ltail =
-						port->sc_port.info->xmit.tail;
+						port->sc_port.state->xmit.tail;
 					counter = 0;
 				}
 			}
diff --git a/drivers/serial/sunhv.c b/drivers/serial/sunhv.c
index 1df5325faab2..d548652dee50 100644
--- a/drivers/serial/sunhv.c
+++ b/drivers/serial/sunhv.c
@@ -184,8 +184,8 @@ static struct tty_struct *receive_chars(struct uart_port *port)
 {
 	struct tty_struct *tty = NULL;
 
-	if (port->info != NULL)		/* Unopened serial console */
-		tty = port->info->port.tty;
+	if (port->state != NULL)		/* Unopened serial console */
+		tty = port->state->port.tty;
 
 	if (sunhv_ops->receive_chars(port, tty))
 		sun_do_break();
@@ -197,10 +197,10 @@ static void transmit_chars(struct uart_port *port)
 {
 	struct circ_buf *xmit;
 
-	if (!port->info)
+	if (!port->state)
 		return;
 
-	xmit = &port->info->xmit;
+	xmit = &port->state->xmit;
 	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
 		return;
 
diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c
index 0355efe115d9..7c4f2fe8e246 100644
--- a/drivers/serial/sunsab.c
+++ b/drivers/serial/sunsab.c
@@ -117,8 +117,8 @@ receive_chars(struct uart_sunsab_port *up,
 	int count = 0;
 	int i;
 
-	if (up->port.info != NULL)		/* Unopened serial console */
-		tty = up->port.info->port.tty;
+	if (up->port.state != NULL)		/* Unopened serial console */
+		tty = up->port.state->port.tty;
 
 	/* Read number of BYTES (Character + Status) available. */
 	if (stat->sreg.isr0 & SAB82532_ISR0_RPF) {
@@ -229,7 +229,7 @@ static void sunsab_tx_idle(struct uart_sunsab_port *);
 static void transmit_chars(struct uart_sunsab_port *up,
 			   union sab82532_irq_status *stat)
 {
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 	int i;
 
 	if (stat->sreg.isr1 & SAB82532_ISR1_ALLS) {
@@ -297,7 +297,7 @@ static void check_status(struct uart_sunsab_port *up,
 		up->port.icount.dsr++;
 	}
 
-	wake_up_interruptible(&up->port.info->delta_msr_wait);
+	wake_up_interruptible(&up->port.state->delta_msr_wait);
 }
 
 static irqreturn_t sunsab_interrupt(int irq, void *dev_id)
@@ -429,7 +429,7 @@ static void sunsab_tx_idle(struct uart_sunsab_port *up)
 static void sunsab_start_tx(struct uart_port *port)
 {
 	struct uart_sunsab_port *up = (struct uart_sunsab_port *) port;
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 	int i;
 
 	up->interrupt_mask1 &= ~(SAB82532_IMR1_ALLS|SAB82532_IMR1_XPR);
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 47c6837850b1..5a32365b58ad 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -311,7 +311,7 @@ static void sunsu_enable_ms(struct uart_port *port)
 static struct tty_struct *
 receive_chars(struct uart_sunsu_port *up, unsigned char *status)
 {
-	struct tty_struct *tty = up->port.info->port.tty;
+	struct tty_struct *tty = up->port.state->port.tty;
 	unsigned char ch, flag;
 	int max_count = 256;
 	int saw_console_brk = 0;
@@ -389,7 +389,7 @@ receive_chars(struct uart_sunsu_port *up, unsigned char *status)
 
 static void transmit_chars(struct uart_sunsu_port *up)
 {
-	struct circ_buf *xmit = &up->port.info->xmit;
+	struct circ_buf *xmit = &up->port.state->xmit;
 	int count;
 
 	if (up->port.x_char) {
@@ -441,7 +441,7 @@ static void check_modem_status(struct uart_sunsu_port *up)
 	if (status & UART_MSR_DCTS)
 		uart_handle_cts_change(&up->port, status & UART_MSR_CTS);
 
-	wake_up_interruptible(&up->port.info->delta_msr_wait);
+	wake_up_interruptible(&up->port.state->delta_msr_wait);
 }
 
 static irqreturn_t sunsu_serial_interrupt(int irq, void *dev_id)
diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c
index e09d3cebb4fb..055034d12b1c 100644
--- a/drivers/serial/sunzilog.c
+++ b/drivers/serial/sunzilog.c
@@ -328,9 +328,9 @@ sunzilog_receive_chars(struct uart_sunzilog_port *up,
 	unsigned char ch, r1, flag;
 
 	tty = NULL;
-	if (up->port.info != NULL &&		/* Unopened serial console */
-	    up->port.info->port.tty != NULL)	/* Keyboard || mouse */
-		tty = up->port.info->port.tty;
+	if (up->port.state != NULL &&		/* Unopened serial console */
+	    up->port.state->port.tty != NULL)	/* Keyboard || mouse */
+		tty = up->port.state->port.tty;
 
 	for (;;) {
 
@@ -451,7 +451,7 @@ static void sunzilog_status_handle(struct uart_sunzilog_port *up,
 			uart_handle_cts_change(&up->port,
 					       (status & CTS));
 
-		wake_up_interruptible(&up->port.info->delta_msr_wait);
+		wake_up_interruptible(&up->port.state->delta_msr_wait);
 	}
 
 	up->prev_status = status;
@@ -501,9 +501,9 @@ static void sunzilog_transmit_chars(struct uart_sunzilog_port *up,
 		return;
 	}
 
-	if (up->port.info == NULL)
+	if (up->port.state == NULL)
 		goto ack_tx_int;
-	xmit = &up->port.info->xmit;
+	xmit = &up->port.state->xmit;
 	if (uart_circ_empty(xmit))
 		goto ack_tx_int;
 
@@ -705,7 +705,7 @@ static void sunzilog_start_tx(struct uart_port *port)
 		port->icount.tx++;
 		port->x_char = 0;
 	} else {
-		struct circ_buf *xmit = &port->info->xmit;
+		struct circ_buf *xmit = &port->state->xmit;
 
 		writeb(xmit->buf[xmit->tail], &channel->data);
 		ZSDELAY();
diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c
index 063a313b755c..3d40be6f389f 100644
--- a/drivers/serial/timbuart.c
+++ b/drivers/serial/timbuart.c
@@ -77,7 +77,7 @@ static void timbuart_flush_buffer(struct uart_port *port)
 
 static void timbuart_rx_chars(struct uart_port *port)
 {
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 
 	while (ioread32(port->membase + TIMBUART_ISR) & RXDP) {
 		u8 ch = ioread8(port->membase + TIMBUART_RXFIFO);
@@ -86,7 +86,7 @@ static void timbuart_rx_chars(struct uart_port *port)
 	}
 
 	spin_unlock(&port->lock);
-	tty_flip_buffer_push(port->info->port.tty);
+	tty_flip_buffer_push(port->state->port.tty);
 	spin_lock(&port->lock);
 
 	dev_dbg(port->dev, "%s - total read %d bytes\n",
@@ -95,7 +95,7 @@ static void timbuart_rx_chars(struct uart_port *port)
 
 static void timbuart_tx_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	while (!(ioread32(port->membase + TIMBUART_ISR) & TXBF) &&
 		!uart_circ_empty(xmit)) {
@@ -118,7 +118,7 @@ static void timbuart_handle_tx_port(struct uart_port *port, u32 isr, u32 *ier)
 {
 	struct timbuart_port *uart =
 		container_of(port, struct timbuart_port, port);
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
 		return;
@@ -231,7 +231,7 @@ static void timbuart_mctrl_check(struct uart_port *port, u32 isr, u32 *ier)
 		iowrite32(CTS_DELTA, port->membase + TIMBUART_ISR);
 		cts = timbuart_get_mctrl(port);
 		uart_handle_cts_change(port, cts & TIOCM_CTS);
-		wake_up_interruptible(&port->info->delta_msr_wait);
+		wake_up_interruptible(&port->state->delta_msr_wait);
 	}
 
 	*ier |= CTS_DELTA;
diff --git a/drivers/serial/uartlite.c b/drivers/serial/uartlite.c
index 3317148a4b93..5d3a573d6994 100644
--- a/drivers/serial/uartlite.c
+++ b/drivers/serial/uartlite.c
@@ -75,7 +75,7 @@ static struct uart_port ulite_ports[ULITE_NR_UARTS];
 
 static int ulite_receive(struct uart_port *port, int stat)
 {
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	unsigned char ch = 0;
 	char flag = TTY_NORMAL;
 
@@ -125,7 +125,7 @@ static int ulite_receive(struct uart_port *port, int stat)
 
 static int ulite_transmit(struct uart_port *port, int stat)
 {
-	struct circ_buf *xmit  = &port->info->xmit;
+	struct circ_buf *xmit  = &port->state->xmit;
 
 	if (stat & ULITE_STATUS_TXFULL)
 		return 0;
@@ -162,7 +162,7 @@ static irqreturn_t ulite_isr(int irq, void *dev_id)
 		busy |= ulite_transmit(port, stat);
 	} while (busy);
 
-	tty_flip_buffer_push(port->info->port.tty);
+	tty_flip_buffer_push(port->state->port.tty);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/serial/ucc_uart.c b/drivers/serial/ucc_uart.c
index e945e780b5c9..0c08f286a2ef 100644
--- a/drivers/serial/ucc_uart.c
+++ b/drivers/serial/ucc_uart.c
@@ -327,7 +327,7 @@ static int qe_uart_tx_pump(struct uart_qe_port *qe_port)
 	unsigned char *p;
 	unsigned int count;
 	struct uart_port *port = &qe_port->port;
-	struct circ_buf *xmit = &port->info->xmit;
+	struct circ_buf *xmit = &port->state->xmit;
 
 	bdp = qe_port->rx_cur;
 
@@ -466,7 +466,7 @@ static void qe_uart_int_rx(struct uart_qe_port *qe_port)
 	int i;
 	unsigned char ch, *cp;
 	struct uart_port *port = &qe_port->port;
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	struct qe_bd *bdp;
 	u16 status;
 	unsigned int flg;
diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c
index dac550e57c29..cf4410e6d53b 100644
--- a/drivers/serial/vr41xx_siu.c
+++ b/drivers/serial/vr41xx_siu.c
@@ -318,7 +318,7 @@ static inline void receive_chars(struct uart_port *port, uint8_t *status)
 	char flag;
 	int max_count = RX_MAX_COUNT;
 
-	tty = port->info->port.tty;
+	tty = port->state->port.tty;
 	lsr = *status;
 
 	do {
@@ -386,7 +386,7 @@ static inline void check_modem_status(struct uart_port *port)
 	if (msr & UART_MSR_DCTS)
 		uart_handle_cts_change(port, msr & UART_MSR_CTS);
 
-	wake_up_interruptible(&port->info->delta_msr_wait);
+	wake_up_interruptible(&port->state->delta_msr_wait);
 }
 
 static inline void transmit_chars(struct uart_port *port)
@@ -394,7 +394,7 @@ static inline void transmit_chars(struct uart_port *port)
 	struct circ_buf *xmit;
 	int max_count = TX_MAX_COUNT;
 
-	xmit = &port->info->xmit;
+	xmit = &port->state->xmit;
 
 	if (port->x_char) {
 		siu_write(port, UART_TX, port->x_char);
diff --git a/drivers/serial/zs.c b/drivers/serial/zs.c
index d8c2809b1ab6..b9c9fb9198d6 100644
--- a/drivers/serial/zs.c
+++ b/drivers/serial/zs.c
@@ -602,12 +602,12 @@ static void zs_receive_chars(struct zs_port *zport)
 		uart_insert_char(uport, status, Rx_OVR, ch, flag);
 	}
 
-	tty_flip_buffer_push(uport->info->port.tty);
+	tty_flip_buffer_push(uport->state->port.tty);
 }
 
 static void zs_raw_transmit_chars(struct zs_port *zport)
 {
-	struct circ_buf *xmit = &zport->port.info->xmit;
+	struct circ_buf *xmit = &zport->port.state->xmit;
 
 	/* XON/XOFF chars.  */
 	if (zport->port.x_char) {
@@ -686,7 +686,7 @@ static void zs_status_handle(struct zs_port *zport, struct zs_port *zport_a)
 			uport->icount.rng++;
 
 		if (delta)
-			wake_up_interruptible(&uport->info->delta_msr_wait);
+			wake_up_interruptible(&uport->state->delta_msr_wait);
 
 		spin_lock(&scc->zlock);
 	}
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 3cd255f0b211..c1542703fbab 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -186,7 +186,6 @@
 #include <linux/sysrq.h>
 
 struct uart_port;
-struct uart_info;
 struct serial_struct;
 struct device;
 
@@ -284,7 +283,7 @@ struct uart_port {
 
 	unsigned int		read_status_mask;	/* driver specific */
 	unsigned int		ignore_status_mask;	/* driver specific */
-	struct uart_info	*info;			/* pointer to parent info */
+	struct uart_state	*state;			/* pointer to parent state */
 	struct uart_icount	icount;			/* statistics */
 
 	struct console		*cons;			/* struct console, if any */
@@ -343,8 +342,22 @@ struct uart_port {
  */
 typedef unsigned int __bitwise__ uif_t;
 
-struct uart_info {
+
+/*
+ * This is the state information which is persistent across opens.
+ * The low level driver must not to touch any elements contained
+ * within.
+ */
+struct uart_state {
 	struct tty_port		port;
+	unsigned int		close_delay;		/* msec */
+	unsigned int		closing_wait;		/* msec */
+
+#define USF_CLOSING_WAIT_INF	(0)
+#define USF_CLOSING_WAIT_NONE	(~0U)
+
+	int			count;
+	int			pm_state;
 	struct circ_buf		xmit;
 	uif_t			flags;
 
@@ -362,24 +375,7 @@ struct uart_info {
 
 	struct tasklet_struct	tlet;
 	wait_queue_head_t	delta_msr_wait;
-};
-
-/*
- * This is the state information which is persistent across opens.
- * The low level driver must not to touch any elements contained
- * within.
- */
-struct uart_state {
-	unsigned int		close_delay;		/* msec */
-	unsigned int		closing_wait;		/* msec */
-
-#define USF_CLOSING_WAIT_INF	(0)
-#define USF_CLOSING_WAIT_NONE	(~0U)
-
-	int			count;
-	int			pm_state;
-	struct uart_info	info;
-	struct uart_port	*port;
+	struct uart_port	*uart_port;
 
 	struct mutex		mutex;
 };
@@ -462,7 +458,7 @@ int uart_resume_port(struct uart_driver *reg, struct uart_port *port);
 
 static inline int uart_tx_stopped(struct uart_port *port)
 {
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 	if(tty->stopped || tty->hw_stopped)
 		return 1;
 	return 0;
@@ -477,7 +473,7 @@ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
 #ifdef SUPPORT_SYSRQ
 	if (port->sysrq) {
 		if (ch && time_before(jiffies, port->sysrq)) {
-			handle_sysrq(ch, port->info->port.tty);
+			handle_sysrq(ch, port->state->port.tty);
 			port->sysrq = 0;
 			return 1;
 		}
@@ -495,7 +491,7 @@ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
  */
 static inline int uart_handle_break(struct uart_port *port)
 {
-	struct uart_info *info = port->info;
+	struct uart_state *state = port->state;
 #ifdef SUPPORT_SYSRQ
 	if (port->cons && port->cons->index == port->line) {
 		if (!port->sysrq) {
@@ -506,7 +502,7 @@ static inline int uart_handle_break(struct uart_port *port)
 	}
 #endif
 	if (port->flags & UPF_SAK)
-		do_SAK(info->port.tty);
+		do_SAK(state->port.tty);
 	return 0;
 }
 
@@ -518,7 +514,7 @@ static inline int uart_handle_break(struct uart_port *port)
 static inline void
 uart_handle_dcd_change(struct uart_port *port, unsigned int status)
 {
-	struct uart_info *info = port->info;
+	struct uart_state *state = port->state;
 
 	port->icount.dcd++;
 
@@ -527,11 +523,11 @@ uart_handle_dcd_change(struct uart_port *port, unsigned int status)
 		hardpps();
 #endif
 
-	if (info->flags & UIF_CHECK_CD) {
+	if (state->flags & UIF_CHECK_CD) {
 		if (status)
-			wake_up_interruptible(&info->port.open_wait);
-		else if (info->port.tty)
-			tty_hangup(info->port.tty);
+			wake_up_interruptible(&state->port.open_wait);
+		else if (state->port.tty)
+			tty_hangup(state->port.tty);
 	}
 }
 
@@ -543,12 +539,12 @@ uart_handle_dcd_change(struct uart_port *port, unsigned int status)
 static inline void
 uart_handle_cts_change(struct uart_port *port, unsigned int status)
 {
-	struct uart_info *info = port->info;
-	struct tty_struct *tty = info->port.tty;
+	struct uart_state *state = port->state;
+	struct tty_struct *tty = state->port.tty;
 
 	port->icount.cts++;
 
-	if (info->flags & UIF_CTS_FLOW) {
+	if (state->flags & UIF_CTS_FLOW) {
 		if (tty->hw_stopped) {
 			if (status) {
 				tty->hw_stopped = 0;
@@ -570,7 +566,7 @@ static inline void
 uart_insert_char(struct uart_port *port, unsigned int status,
 		 unsigned int overrun, unsigned int ch, unsigned int flag)
 {
-	struct tty_struct *tty = port->info->port.tty;
+	struct tty_struct *tty = port->state->port.tty;
 
 	if ((status & port->ignore_status_mask & ~overrun) == 0)
 		tty_insert_flip_char(tty, ch, flag);
-- 
cgit v1.2.3-71-gd317


From 5e99df561fc830730d63672d795a0b02ef8cdd6f Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:28 -0700
Subject: serial: Fold closing_* fields into the tty_port ones

Remove some more serial specific use

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/serial_core.c | 26 +++++++++++++-------------
 include/linux/serial_core.h  |  2 --
 include/linux/tty.h          |  4 ++--
 3 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index ea53b6f224b0..2d63b13e2f7e 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -651,10 +651,10 @@ static int uart_get_info(struct uart_state *state,
 	tmp.flags	    = port->flags;
 	tmp.xmit_fifo_size  = port->fifosize;
 	tmp.baud_base	    = port->uartclk / 16;
-	tmp.close_delay	    = state->close_delay / 10;
-	tmp.closing_wait    = state->closing_wait == USF_CLOSING_WAIT_NONE ?
+	tmp.close_delay	    = state->port.close_delay / 10;
+	tmp.closing_wait    = state->port.closing_wait == USF_CLOSING_WAIT_NONE ?
 				ASYNC_CLOSING_WAIT_NONE :
-				state->closing_wait / 10;
+				state->port.closing_wait / 10;
 	tmp.custom_divisor  = port->custom_divisor;
 	tmp.hub6	    = port->hub6;
 	tmp.io_type         = port->iotype;
@@ -724,8 +724,8 @@ static int uart_set_info(struct uart_state *state,
 		retval = -EPERM;
 		if (change_irq || change_port ||
 		    (new_serial.baud_base != port->uartclk / 16) ||
-		    (close_delay != state->close_delay) ||
-		    (closing_wait != state->closing_wait) ||
+		    (close_delay != state->port.close_delay) ||
+		    (closing_wait != state->port.closing_wait) ||
 		    (new_serial.xmit_fifo_size &&
 		     new_serial.xmit_fifo_size != port->fifosize) ||
 		    (((new_flags ^ old_flags) & ~UPF_USR_MASK) != 0))
@@ -834,8 +834,8 @@ static int uart_set_info(struct uart_state *state,
 	port->flags            = (port->flags & ~UPF_CHANGE_MASK) |
 				 (new_flags & UPF_CHANGE_MASK);
 	port->custom_divisor   = new_serial.custom_divisor;
-	state->close_delay     = close_delay;
-	state->closing_wait    = closing_wait;
+	state->port.close_delay     = close_delay;
+	state->port.closing_wait    = closing_wait;
 	if (new_serial.xmit_fifo_size)
 		port->fifosize = new_serial.xmit_fifo_size;
 	if (state->port.tty)
@@ -1297,8 +1297,8 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	 */
 	tty->closing = 1;
 
-	if (state->closing_wait != USF_CLOSING_WAIT_NONE)
-		tty_wait_until_sent(tty, msecs_to_jiffies(state->closing_wait));
+	if (state->port.closing_wait != USF_CLOSING_WAIT_NONE)
+		tty_wait_until_sent(tty, msecs_to_jiffies(state->port.closing_wait));
 
 	/*
 	 * At this point, we stop accepting input.  To do this, we
@@ -1326,8 +1326,8 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	state->port.tty = NULL;
 
 	if (state->port.blocked_open) {
-		if (state->close_delay)
-			msleep_interruptible(state->close_delay);
+		if (state->port.close_delay)
+			msleep_interruptible(state->port.close_delay);
 	} else if (!uart_console(port)) {
 		uart_change_pm(state, 3);
 	}
@@ -2358,11 +2358,11 @@ int uart_register_driver(struct uart_driver *drv)
 	for (i = 0; i < drv->nr; i++) {
 		struct uart_state *state = drv->state + i;
 
-		state->close_delay     = 500;	/* .5 seconds */
-		state->closing_wait    = 30000;	/* 30 seconds */
 		mutex_init(&state->mutex);
 
 		tty_port_init(&state->port);
+		state->port.close_delay     = 500;	/* .5 seconds */
+		state->port.closing_wait    = 30000;	/* 30 seconds */
 		init_waitqueue_head(&state->delta_msr_wait);
 		tasklet_init(&state->tlet, uart_tasklet_action,
 			     (unsigned long)state);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index c1542703fbab..fd11d4d5a571 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -350,8 +350,6 @@ typedef unsigned int __bitwise__ uif_t;
  */
 struct uart_state {
 	struct tty_port		port;
-	unsigned int		close_delay;		/* msec */
-	unsigned int		closing_wait;		/* msec */
 
 #define USF_CLOSING_WAIT_INF	(0)
 #define USF_CLOSING_WAIT_NONE	(~0U)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index ecb3d1ba3017..9fdc3d84baad 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -206,8 +206,8 @@ struct tty_port {
 	unsigned long		flags;		/* TTY flags ASY_*/
 	struct mutex		mutex;		/* Locking */
 	unsigned char		*xmit_buf;	/* Optional buffer */
-	int			close_delay;	/* Close port delay */
-	int			closing_wait;	/* Delay for output */
+	unsigned int		close_delay;	/* Close port delay */
+	unsigned int		closing_wait;	/* Delay for output */
 	int			drain_delay;	/* Set to zero if no pure time
 						   based drain is needed else
 						   set to size of fifo */
-- 
cgit v1.2.3-71-gd317


From 91312cdb4fcd832341e425f74f49938e0503c929 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:29 -0700
Subject: serial: move count into the tty_port version

Remove more stuff from the serial special case code

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/serial_core.c | 38 ++++++++++++++++++++------------------
 include/linux/serial_core.h  |  1 -
 2 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 0ffefb3f1d8f..4af3364dd811 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -52,7 +52,7 @@ static struct lock_class_key port_lock_key;
 
 #define HIGH_BITS_OFFSET	((sizeof(long)-sizeof(int))*8)
 
-#define uart_users(state)	((state)->count + (state)->port.blocked_open)
+#define uart_users(state)	((state)->port.count + (state)->port.blocked_open)
 
 #ifdef CONFIG_SERIAL_CORE_CONSOLE
 #define uart_console(port)	((port)->cons && (port)->cons->index == (port)->line)
@@ -694,7 +694,7 @@ static int uart_set_info(struct uart_state *state,
 			USF_CLOSING_WAIT_NONE : new_serial.closing_wait * 10;
 
 	/*
-	 * This semaphore protects state->count.  It is also
+	 * This semaphore protects port->count.  It is also
 	 * very useful to prevent opens.  Also, take the
 	 * port configuration semaphore to make sure that a
 	 * module insertion/removal doesn't change anything
@@ -1272,24 +1272,24 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	if (tty_hung_up_p(filp))
 		goto done;
 
-	if ((tty->count == 1) && (state->count != 1)) {
+	if ((tty->count == 1) && (port->count != 1)) {
 		/*
 		 * Uh, oh.  tty->count is 1, which means that the tty
-		 * structure will be freed.  state->count should always
+		 * structure will be freed.  port->count should always
 		 * be one in these conditions.  If it's greater than
 		 * one, we've got real problems, since it means the
 		 * serial port won't be shutdown.
 		 */
 		printk(KERN_ERR "uart_close: bad serial port count; tty->count is 1, "
-		       "state->count is %d\n", state->count);
-		state->count = 1;
+		       "port->count is %d\n", port->count);
+		port->count = 1;
 	}
-	if (--state->count < 0) {
+	if (--port->count < 0) {
 		printk(KERN_ERR "uart_close: bad serial port count for %s: %d\n",
-		       tty->name, state->count);
-		state->count = 0;
+		       tty->name, port->count);
+		port->count = 0;
 	}
-	if (state->count)
+	if (port->count)
 		goto done;
 
 	/*
@@ -1421,7 +1421,7 @@ static void uart_hangup(struct tty_struct *tty)
 	if (state->flags & UIF_NORMAL_ACTIVE) {
 		uart_flush_buffer(tty);
 		uart_shutdown(state);
-		state->count = 0;
+		port->count = 0;
 		state->flags &= ~UIF_NORMAL_ACTIVE;
 		port->tty = NULL;
 		wake_up_interruptible(&port->open_wait);
@@ -1478,7 +1478,7 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 	unsigned int mctrl;
 
 	port->blocked_open++;
-	state->count--;
+	port->count--;
 
 	add_wait_queue(&port->open_wait, &wait);
 	while (1) {
@@ -1539,7 +1539,7 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&port->open_wait, &wait);
 
-	state->count++;
+	port->count++;
 	port->blocked_open--;
 
 	if (signal_pending(current))
@@ -1562,7 +1562,7 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 		goto err;
 	}
 
-	state->count++;
+	state->port.count++;
 	if (!state->uart_port || state->uart_port->flags & UPF_DEAD) {
 		ret = -ENXIO;
 		goto err_unlock;
@@ -1570,7 +1570,7 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 	return state;
 
  err_unlock:
-	state->count--;
+	state->port.count--;
 	mutex_unlock(&state->mutex);
  err:
 	return ERR_PTR(ret);
@@ -1590,6 +1590,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 {
 	struct uart_driver *drv = (struct uart_driver *)tty->driver->driver_state;
 	struct uart_state *state;
+	struct tty_port *port;
 	int retval, line = tty->index;
 
 	BUG_ON(!kernel_locked());
@@ -1617,6 +1618,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 		retval = PTR_ERR(state);
 		goto fail;
 	}
+	port = &state->port;
 
 	/*
 	 * Once we set tty->driver_data here, we are guaranteed that
@@ -1627,14 +1629,14 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	state->uart_port->state = state;
 	tty->low_latency = (state->uart_port->flags & UPF_LOW_LATENCY) ? 1 : 0;
 	tty->alt_speed = 0;
-	state->port.tty = tty;
+	port->tty = tty;
 
 	/*
 	 * If the port is in the middle of closing, bail out now.
 	 */
 	if (tty_hung_up_p(filp)) {
 		retval = -EAGAIN;
-		state->count--;
+		port->count--;
 		mutex_unlock(&state->mutex);
 		goto fail;
 	}
@@ -1642,7 +1644,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Make sure the device is in D0 state.
 	 */
-	if (state->count == 1)
+	if (port->count == 1)
 		uart_change_pm(state, 0);
 
 	/*
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index fd11d4d5a571..63ad90966db2 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -354,7 +354,6 @@ struct uart_state {
 #define USF_CLOSING_WAIT_INF	(0)
 #define USF_CLOSING_WAIT_NONE	(~0U)
 
-	int			count;
 	int			pm_state;
 	struct circ_buf		xmit;
 	uif_t			flags;
-- 
cgit v1.2.3-71-gd317


From ccce6debb62d94964e3878f978a56b0f3e32d94f Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:30 -0700
Subject: serial: move the flags into the tty_port field

Fortunately the serial layer was designed to use the same flag values but
with different names. It has its own SUSPENDED flag which is a free slot in
the ASYNC flags so we allocate it in the ASYNC flags instead.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/serial_core.c | 153 ++++++++++++++++++++++---------------------
 include/linux/serial.h       |   2 +
 include/linux/serial_core.h  |  48 ++++++--------
 3 files changed, 98 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 4af3364dd811..744dec9301fb 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -29,10 +29,10 @@
 #include <linux/console.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/serial_core.h>
 #include <linux/smp_lock.h>
 #include <linux/device.h>
 #include <linux/serial.h> /* for serial_state and serial_icounter_struct */
+#include <linux/serial_core.h>
 #include <linux/delay.h>
 #include <linux/mutex.h>
 
@@ -146,7 +146,7 @@ static int uart_startup(struct uart_state *state, int init_hw)
 	unsigned long page;
 	int retval = 0;
 
-	if (state->flags & UIF_INITIALIZED)
+	if (port->flags & ASYNC_INITIALIZED)
 		return 0;
 
 	/*
@@ -189,14 +189,14 @@ static int uart_startup(struct uart_state *state, int init_hw)
 				uart_set_mctrl(uport, TIOCM_RTS | TIOCM_DTR);
 		}
 
-		if (state->flags & UIF_CTS_FLOW) {
+		if (port->flags & ASYNC_CTS_FLOW) {
 			spin_lock_irq(&uport->lock);
 			if (!(uport->ops->get_mctrl(uport) & TIOCM_CTS))
 				port->tty->hw_stopped = 1;
 			spin_unlock_irq(&uport->lock);
 		}
 
-		state->flags |= UIF_INITIALIZED;
+		set_bit(ASYNCB_INITIALIZED, &port->flags);
 
 		clear_bit(TTY_IO_ERROR, &port->tty->flags);
 	}
@@ -214,7 +214,7 @@ static int uart_startup(struct uart_state *state, int init_hw)
  */
 static void uart_shutdown(struct uart_state *state)
 {
-	struct uart_port *port = state->uart_port;
+	struct uart_port *uport = state->uart_port;
 	struct tty_struct *tty = state->port.tty;
 
 	/*
@@ -223,14 +223,12 @@ static void uart_shutdown(struct uart_state *state)
 	if (tty)
 		set_bit(TTY_IO_ERROR, &tty->flags);
 
-	if (state->flags & UIF_INITIALIZED) {
-		state->flags &= ~UIF_INITIALIZED;
-
+	if (test_and_clear_bit(ASYNCB_INITIALIZED, &state->port.flags)) {
 		/*
 		 * Turn off DTR and RTS early.
 		 */
 		if (!tty || (tty->termios->c_cflag & HUPCL))
-			uart_clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
+			uart_clear_mctrl(uport, TIOCM_DTR | TIOCM_RTS);
 
 		/*
 		 * clear delta_msr_wait queue to avoid mem leaks: we may free
@@ -244,12 +242,12 @@ static void uart_shutdown(struct uart_state *state)
 		/*
 		 * Free the IRQ and disable the port.
 		 */
-		port->ops->shutdown(port);
+		uport->ops->shutdown(uport);
 
 		/*
 		 * Ensure that the IRQ handler isn't running on another CPU.
 		 */
-		synchronize_irq(port->irq);
+		synchronize_irq(uport->irq);
 	}
 
 	/*
@@ -429,15 +427,16 @@ EXPORT_SYMBOL(uart_get_divisor);
 static void
 uart_change_speed(struct uart_state *state, struct ktermios *old_termios)
 {
-	struct tty_struct *tty = state->port.tty;
-	struct uart_port *port = state->uart_port;
+	struct tty_port *port = &state->port;
+	struct tty_struct *tty = port->tty;
+	struct uart_port *uport = state->uart_port;
 	struct ktermios *termios;
 
 	/*
 	 * If we have no tty, termios, or the port does not exist,
 	 * then we can't set the parameters for this port.
 	 */
-	if (!tty || !tty->termios || port->type == PORT_UNKNOWN)
+	if (!tty || !tty->termios || uport->type == PORT_UNKNOWN)
 		return;
 
 	termios = tty->termios;
@@ -446,16 +445,16 @@ uart_change_speed(struct uart_state *state, struct ktermios *old_termios)
 	 * Set flags based on termios cflag
 	 */
 	if (termios->c_cflag & CRTSCTS)
-		state->flags |= UIF_CTS_FLOW;
+		set_bit(ASYNCB_CTS_FLOW, &port->flags);
 	else
-		state->flags &= ~UIF_CTS_FLOW;
+		clear_bit(ASYNCB_CTS_FLOW, &port->flags);
 
 	if (termios->c_cflag & CLOCAL)
-		state->flags &= ~UIF_CHECK_CD;
+		clear_bit(ASYNCB_CHECK_CD, &port->flags);
 	else
-		state->flags |= UIF_CHECK_CD;
+		set_bit(ASYNCB_CHECK_CD, &port->flags);
 
-	port->ops->set_termios(port, termios, old_termios);
+	uport->ops->set_termios(uport, termios, old_termios);
 }
 
 static inline int
@@ -848,7 +847,7 @@ static int uart_set_info(struct uart_state *state,
 	retval = 0;
 	if (uport->type == PORT_UNKNOWN)
 		goto exit;
-	if (state->flags & UIF_INITIALIZED) {
+	if (port->flags & ASYNC_INITIALIZED) {
 		if (((old_flags ^ uport->flags) & UPF_SPD_MASK) ||
 		    old_custom_divisor != uport->custom_divisor) {
 			/*
@@ -1306,7 +1305,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	 * At this point, we stop accepting input.  To do this, we
 	 * disable the receive line status interrupts.
 	 */
-	if (state->flags & UIF_INITIALIZED) {
+	if (port->flags & ASYNC_INITIALIZED) {
 		unsigned long flags;
 		spin_lock_irqsave(&port->lock, flags);
 		uport->ops->stop_rx(uport);
@@ -1337,7 +1336,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Wake up anyone trying to open this port.
 	 */
-	state->flags &= ~UIF_NORMAL_ACTIVE;
+	clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags);
 	wake_up_interruptible(&port->open_wait);
 
 done:
@@ -1418,11 +1417,11 @@ static void uart_hangup(struct tty_struct *tty)
 	pr_debug("uart_hangup(%d)\n", state->uart_port->line);
 
 	mutex_lock(&state->mutex);
-	if (state->flags & UIF_NORMAL_ACTIVE) {
+	if (port->flags & ASYNC_NORMAL_ACTIVE) {
 		uart_flush_buffer(tty);
 		uart_shutdown(state);
 		port->count = 0;
-		state->flags &= ~UIF_NORMAL_ACTIVE;
+		clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags);
 		port->tty = NULL;
 		wake_up_interruptible(&port->open_wait);
 		wake_up_interruptible(&state->delta_msr_wait);
@@ -1493,7 +1492,7 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 		/*
 		 * If the port has been closed, tell userspace/restart open.
 		 */
-		if (!(state->flags & UIF_INITIALIZED))
+		if (!(port->flags & ASYNC_INITIALIZED))
 			break;
 
 		/*
@@ -1662,8 +1661,8 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * If this is the first open to succeed, adjust things to suit.
 	 */
-	if (retval == 0 && !(state->flags & UIF_NORMAL_ACTIVE)) {
-		state->flags |= UIF_NORMAL_ACTIVE;
+	if (retval == 0 && !(port->flags & ASYNC_NORMAL_ACTIVE)) {
+		set_bit(ASYNCB_NORMAL_ACTIVE, &port->flags);
 
 		uart_update_termios(state);
 	}
@@ -1985,63 +1984,64 @@ static int serial_match_port(struct device *dev, void *data)
 	return dev->devt == devt; /* Actually, only one tty per port */
 }
 
-int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
+int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 {
-	struct uart_state *state = drv->state + port->line;
+	struct uart_state *state = drv->state + uport->line;
+	struct tty_port *port = &state->port;
 	struct device *tty_dev;
-	struct uart_match match = {port, drv};
+	struct uart_match match = {uport, drv};
 
 	mutex_lock(&state->mutex);
 
-	if (!console_suspend_enabled && uart_console(port)) {
+	if (!console_suspend_enabled && uart_console(uport)) {
 		/* we're going to avoid suspending serial console */
 		mutex_unlock(&state->mutex);
 		return 0;
 	}
 
-	tty_dev = device_find_child(port->dev, &match, serial_match_port);
+	tty_dev = device_find_child(uport->dev, &match, serial_match_port);
 	if (device_may_wakeup(tty_dev)) {
-		enable_irq_wake(port->irq);
+		enable_irq_wake(uport->irq);
 		put_device(tty_dev);
 		mutex_unlock(&state->mutex);
 		return 0;
 	}
-	port->suspended = 1;
+	uport->suspended = 1;
 
-	if (state->flags & UIF_INITIALIZED) {
-		const struct uart_ops *ops = port->ops;
+	if (port->flags & ASYNC_INITIALIZED) {
+		const struct uart_ops *ops = uport->ops;
 		int tries;
 
-		state->flags = (state->flags & ~UIF_INITIALIZED)
-				     | UIF_SUSPENDED;
+		set_bit(ASYNCB_SUSPENDED, &port->flags);
+		clear_bit(ASYNCB_INITIALIZED, &port->flags);
 
-		spin_lock_irq(&port->lock);
-		ops->stop_tx(port);
-		ops->set_mctrl(port, 0);
-		ops->stop_rx(port);
-		spin_unlock_irq(&port->lock);
+		spin_lock_irq(&uport->lock);
+		ops->stop_tx(uport);
+		ops->set_mctrl(uport, 0);
+		ops->stop_rx(uport);
+		spin_unlock_irq(&uport->lock);
 
 		/*
 		 * Wait for the transmitter to empty.
 		 */
-		for (tries = 3; !ops->tx_empty(port) && tries; tries--)
+		for (tries = 3; !ops->tx_empty(uport) && tries; tries--)
 			msleep(10);
 		if (!tries)
 			printk(KERN_ERR "%s%s%s%d: Unable to drain "
 					"transmitter\n",
-			       port->dev ? dev_name(port->dev) : "",
-			       port->dev ? ": " : "",
+			       uport->dev ? dev_name(uport->dev) : "",
+			       uport->dev ? ": " : "",
 			       drv->dev_name,
-			       drv->tty_driver->name_base + port->line);
+			       drv->tty_driver->name_base + uport->line);
 
-		ops->shutdown(port);
+		ops->shutdown(uport);
 	}
 
 	/*
 	 * Disable the console device before suspending.
 	 */
-	if (uart_console(port))
-		console_stop(port->cons);
+	if (uart_console(uport))
+		console_stop(uport->cons);
 
 	uart_change_pm(state, 3);
 
@@ -2050,67 +2050,68 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 	return 0;
 }
 
-int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
+int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 {
-	struct uart_state *state = drv->state + port->line;
+	struct uart_state *state = drv->state + uport->line;
+	struct tty_port *port = &state->port;
 	struct device *tty_dev;
-	struct uart_match match = {port, drv};
+	struct uart_match match = {uport, drv};
 
 	mutex_lock(&state->mutex);
 
-	if (!console_suspend_enabled && uart_console(port)) {
+	if (!console_suspend_enabled && uart_console(uport)) {
 		/* no need to resume serial console, it wasn't suspended */
 		mutex_unlock(&state->mutex);
 		return 0;
 	}
 
-	tty_dev = device_find_child(port->dev, &match, serial_match_port);
-	if (!port->suspended && device_may_wakeup(tty_dev)) {
-		disable_irq_wake(port->irq);
+	tty_dev = device_find_child(uport->dev, &match, serial_match_port);
+	if (!uport->suspended && device_may_wakeup(tty_dev)) {
+		disable_irq_wake(uport->irq);
 		mutex_unlock(&state->mutex);
 		return 0;
 	}
-	port->suspended = 0;
+	uport->suspended = 0;
 
 	/*
 	 * Re-enable the console device after suspending.
 	 */
-	if (uart_console(port)) {
+	if (uart_console(uport)) {
 		struct ktermios termios;
 
 		/*
 		 * First try to use the console cflag setting.
 		 */
 		memset(&termios, 0, sizeof(struct ktermios));
-		termios.c_cflag = port->cons->cflag;
+		termios.c_cflag = uport->cons->cflag;
 
 		/*
 		 * If that's unset, use the tty termios setting.
 		 */
-		if (state->port.tty && termios.c_cflag == 0)
-			termios = *state->port.tty->termios;
+		if (port->tty && termios.c_cflag == 0)
+			termios = *port->tty->termios;
 
 		uart_change_pm(state, 0);
-		port->ops->set_termios(port, &termios, NULL);
-		console_start(port->cons);
+		uport->ops->set_termios(uport, &termios, NULL);
+		console_start(uport->cons);
 	}
 
-	if (state->flags & UIF_SUSPENDED) {
-		const struct uart_ops *ops = port->ops;
+	if (port->flags & ASYNC_SUSPENDED) {
+		const struct uart_ops *ops = uport->ops;
 		int ret;
 
 		uart_change_pm(state, 0);
-		spin_lock_irq(&port->lock);
-		ops->set_mctrl(port, 0);
-		spin_unlock_irq(&port->lock);
-		ret = ops->startup(port);
+		spin_lock_irq(&uport->lock);
+		ops->set_mctrl(uport, 0);
+		spin_unlock_irq(&uport->lock);
+		ret = ops->startup(uport);
 		if (ret == 0) {
 			uart_change_speed(state, NULL);
-			spin_lock_irq(&port->lock);
-			ops->set_mctrl(port, port->mctrl);
-			ops->start_tx(port);
-			spin_unlock_irq(&port->lock);
-			state->flags |= UIF_INITIALIZED;
+			spin_lock_irq(&uport->lock);
+			ops->set_mctrl(uport, uport->mctrl);
+			ops->start_tx(uport);
+			spin_unlock_irq(&uport->lock);
+			set_bit(ASYNCB_INITIALIZED, &port->flags);
 		} else {
 			/*
 			 * Failed to resume - maybe hardware went away?
@@ -2120,7 +2121,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 			uart_shutdown(state);
 		}
 
-		state->flags &= ~UIF_SUSPENDED;
+		clear_bit(ASYNCB_SUSPENDED, &port->flags);
 	}
 
 	mutex_unlock(&state->mutex);
diff --git a/include/linux/serial.h b/include/linux/serial.h
index e5bb75a63802..c8613c3ff9d3 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -122,6 +122,7 @@ struct serial_uart_config {
 
 /* Internal flags used only by kernel */
 #define ASYNCB_INITIALIZED	31 /* Serial port was initialized */
+#define ASYNCB_SUSPENDED	30 /* Serial port is suspended */
 #define ASYNCB_NORMAL_ACTIVE	29 /* Normal device is active */
 #define ASYNCB_BOOT_AUTOCONF	28 /* Autoconfigure port on bootup */
 #define ASYNCB_CLOSING		27 /* Serial port is closing */
@@ -133,6 +134,7 @@ struct serial_uart_config {
 #define ASYNCB_FIRST_KERNEL	22
 
 #define ASYNC_HUP_NOTIFY	(1U << ASYNCB_HUP_NOTIFY)
+#define ASYNC_SUSPENDED		(1U << ASYNCB_SUSPENDED)
 #define ASYNC_FOURPORT		(1U << ASYNCB_FOURPORT)
 #define ASYNC_SAK		(1U << ASYNCB_SAK)
 #define ASYNC_SPLIT_TERMIOS	(1U << ASYNCB_SPLIT_TERMIOS)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 63ad90966db2..284ef7f54890 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -20,6 +20,8 @@
 #ifndef LINUX_SERIAL_CORE_H
 #define LINUX_SERIAL_CORE_H
 
+#include <linux/serial.h>
+
 /*
  * The type definitions.  These are from Ted Ts'o's serial.h
  */
@@ -356,19 +358,6 @@ struct uart_state {
 
 	int			pm_state;
 	struct circ_buf		xmit;
-	uif_t			flags;
-
-/*
- * Definitions for info->flags.  These are _private_ to serial_core, and
- * are specific to this structure.  They may be queried by low level drivers.
- *
- * FIXME: use the ASY_ definitions
- */
-#define UIF_CHECK_CD		((__force uif_t) (1 << 25))
-#define UIF_CTS_FLOW		((__force uif_t) (1 << 26))
-#define UIF_NORMAL_ACTIVE	((__force uif_t) (1 << 29))
-#define UIF_INITIALIZED		((__force uif_t) (1 << 31))
-#define UIF_SUSPENDED		((__force uif_t) (1 << 30))
 
 	struct tasklet_struct	tlet;
 	wait_queue_head_t	delta_msr_wait;
@@ -509,22 +498,23 @@ static inline int uart_handle_break(struct uart_port *port)
  *	@status: new carrier detect status, nonzero if active
  */
 static inline void
-uart_handle_dcd_change(struct uart_port *port, unsigned int status)
+uart_handle_dcd_change(struct uart_port *uport, unsigned int status)
 {
-	struct uart_state *state = port->state;
+	struct uart_state *state = uport->state;
+	struct tty_port *port = &state->port;
 
-	port->icount.dcd++;
+	uport->icount.dcd++;
 
 #ifdef CONFIG_HARD_PPS
-	if ((port->flags & UPF_HARDPPS_CD) && status)
+	if ((uport->flags & UPF_HARDPPS_CD) && status)
 		hardpps();
 #endif
 
-	if (state->flags & UIF_CHECK_CD) {
+	if (port->flags & ASYNC_CHECK_CD) {
 		if (status)
-			wake_up_interruptible(&state->port.open_wait);
-		else if (state->port.tty)
-			tty_hangup(state->port.tty);
+			wake_up_interruptible(&port->open_wait);
+		else if (port->tty)
+			tty_hangup(port->tty);
 	}
 }
 
@@ -534,24 +524,24 @@ uart_handle_dcd_change(struct uart_port *port, unsigned int status)
  *	@status: new clear to send status, nonzero if active
  */
 static inline void
-uart_handle_cts_change(struct uart_port *port, unsigned int status)
+uart_handle_cts_change(struct uart_port *uport, unsigned int status)
 {
-	struct uart_state *state = port->state;
-	struct tty_struct *tty = state->port.tty;
+	struct tty_port *port = &uport->state->port;
+	struct tty_struct *tty = port->tty;
 
-	port->icount.cts++;
+	uport->icount.cts++;
 
-	if (state->flags & UIF_CTS_FLOW) {
+	if (port->flags & ASYNC_CTS_FLOW) {
 		if (tty->hw_stopped) {
 			if (status) {
 				tty->hw_stopped = 0;
-				port->ops->start_tx(port);
-				uart_write_wakeup(port);
+				uport->ops->start_tx(uport);
+				uart_write_wakeup(uport);
 			}
 		} else {
 			if (!status) {
 				tty->hw_stopped = 1;
-				port->ops->stop_tx(port);
+				uport->ops->stop_tx(uport);
 			}
 		}
 	}
-- 
cgit v1.2.3-71-gd317


From a03006860d272eac5a8ebf23f04f54c7e1e783a5 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:30 -0700
Subject: serial: kill off uif_t

This typedef is now extinct

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/serial_core.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 284ef7f54890..f98dc78abb27 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -336,19 +336,8 @@ struct uart_port {
 	void			*private_data;		/* generic platform data pointer */
 };
 
-/*
- * This is the state information which is only valid when the port
- * is open; it may be cleared the core driver once the device has
- * been closed.  Either the low level driver or the core can modify
- * stuff here.
- */
-typedef unsigned int __bitwise__ uif_t;
-
-
 /*
  * This is the state information which is persistent across opens.
- * The low level driver must not to touch any elements contained
- * within.
  */
 struct uart_state {
 	struct tty_port		port;
-- 
cgit v1.2.3-71-gd317


From a2bceae065ed8c4f552b35c4dde4cc2db05ce9e3 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:31 -0700
Subject: serial: replace the state mutex with the tty port mutex

They cover essentially the same stuff and we can therefore fold it into the
tty_port one.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/pmac_zilog.c  |   8 +-
 drivers/serial/serial_core.c | 227 +++++++++++++++++++++++--------------------
 include/linux/serial_core.h  |   2 -
 3 files changed, 123 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index ab4c85ba3549..0dc786835dca 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -1645,7 +1645,7 @@ static int pmz_suspend(struct macio_dev *mdev, pm_message_t pm_state)
 	state = pmz_uart_reg.state + uap->port.line;
 
 	mutex_lock(&pmz_irq_mutex);
-	mutex_lock(&state->mutex);
+	mutex_lock(&state->port.mutex);
 
 	spin_lock_irqsave(&uap->port.lock, flags);
 
@@ -1676,7 +1676,7 @@ static int pmz_suspend(struct macio_dev *mdev, pm_message_t pm_state)
 	/* Shut the chip down */
 	pmz_set_scc_power(uap, 0);
 
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&state->port.mutex);
 	mutex_unlock(&pmz_irq_mutex);
 
 	pmz_debug("suspend, switching complete\n");
@@ -1705,7 +1705,7 @@ static int pmz_resume(struct macio_dev *mdev)
 	state = pmz_uart_reg.state + uap->port.line;
 
 	mutex_lock(&pmz_irq_mutex);
-	mutex_lock(&state->mutex);
+	mutex_lock(&state->port.mutex);
 
 	spin_lock_irqsave(&uap->port.lock, flags);
 	if (!ZS_IS_OPEN(uap) && !ZS_IS_CONS(uap)) {
@@ -1737,7 +1737,7 @@ static int pmz_resume(struct macio_dev *mdev)
 	}
 
  bail:
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&state->port.mutex);
 	mutex_unlock(&pmz_irq_mutex);
 
 	/* Right now, we deal with delay by blocking here, I'll be
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 744dec9301fb..9d42e57e1971 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -633,35 +633,36 @@ static void uart_unthrottle(struct tty_struct *tty)
 static int uart_get_info(struct uart_state *state,
 			 struct serial_struct __user *retinfo)
 {
-	struct uart_port *port = state->uart_port;
+	struct uart_port *uport = state->uart_port;
+	struct tty_port *port = &state->port;
 	struct serial_struct tmp;
 
 	memset(&tmp, 0, sizeof(tmp));
 
 	/* Ensure the state we copy is consistent and no hardware changes
 	   occur as we go */
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 
-	tmp.type	    = port->type;
-	tmp.line	    = port->line;
-	tmp.port	    = port->iobase;
+	tmp.type	    = uport->type;
+	tmp.line	    = uport->line;
+	tmp.port	    = uport->iobase;
 	if (HIGH_BITS_OFFSET)
-		tmp.port_high = (long) port->iobase >> HIGH_BITS_OFFSET;
-	tmp.irq		    = port->irq;
-	tmp.flags	    = port->flags;
-	tmp.xmit_fifo_size  = port->fifosize;
-	tmp.baud_base	    = port->uartclk / 16;
-	tmp.close_delay	    = state->port.close_delay / 10;
-	tmp.closing_wait    = state->port.closing_wait == USF_CLOSING_WAIT_NONE ?
+		tmp.port_high = (long) uport->iobase >> HIGH_BITS_OFFSET;
+	tmp.irq		    = uport->irq;
+	tmp.flags	    = uport->flags;
+	tmp.xmit_fifo_size  = uport->fifosize;
+	tmp.baud_base	    = uport->uartclk / 16;
+	tmp.close_delay	    = port->close_delay / 10;
+	tmp.closing_wait    = port->closing_wait == USF_CLOSING_WAIT_NONE ?
 				ASYNC_CLOSING_WAIT_NONE :
-				state->port.closing_wait / 10;
-	tmp.custom_divisor  = port->custom_divisor;
-	tmp.hub6	    = port->hub6;
-	tmp.io_type         = port->iotype;
-	tmp.iomem_reg_shift = port->regshift;
-	tmp.iomem_base      = (void *)(unsigned long)port->mapbase;
+				port->closing_wait / 10;
+	tmp.custom_divisor  = uport->custom_divisor;
+	tmp.hub6	    = uport->hub6;
+	tmp.io_type         = uport->iotype;
+	tmp.iomem_reg_shift = uport->regshift;
+	tmp.iomem_base      = (void *)(unsigned long)uport->mapbase;
 
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 
 	if (copy_to_user(retinfo, &tmp, sizeof(*retinfo)))
 		return -EFAULT;
@@ -699,7 +700,7 @@ static int uart_set_info(struct uart_state *state,
 	 * module insertion/removal doesn't change anything
 	 * under us.
 	 */
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 
 	change_irq  = !(uport->flags & UPF_FIXED_PORT)
 		&& new_serial.irq != uport->irq;
@@ -867,7 +868,7 @@ static int uart_set_info(struct uart_state *state,
 	} else
 		retval = uart_startup(state, 1);
  exit:
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 	return retval;
 }
 
@@ -902,10 +903,11 @@ static int uart_get_lsr_info(struct uart_state *state,
 static int uart_tiocmget(struct tty_struct *tty, struct file *file)
 {
 	struct uart_state *state = tty->driver_data;
+	struct tty_port *port = &state->port;
 	struct uart_port *uport = state->uart_port;
 	int result = -EIO;
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 	if ((!file || !tty_hung_up_p(file)) &&
 	    !(tty->flags & (1 << TTY_IO_ERROR))) {
 		result = uport->mctrl;
@@ -914,7 +916,7 @@ static int uart_tiocmget(struct tty_struct *tty, struct file *file)
 		result |= uport->ops->get_mctrl(uport);
 		spin_unlock_irq(&uport->lock);
 	}
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 
 	return result;
 }
@@ -925,35 +927,38 @@ uart_tiocmset(struct tty_struct *tty, struct file *file,
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *uport = state->uart_port;
+	struct tty_port *port = &state->port;
 	int ret = -EIO;
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 	if ((!file || !tty_hung_up_p(file)) &&
 	    !(tty->flags & (1 << TTY_IO_ERROR))) {
 		uart_update_mctrl(uport, set, clear);
 		ret = 0;
 	}
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 	return ret;
 }
 
 static int uart_break_ctl(struct tty_struct *tty, int break_state)
 {
 	struct uart_state *state = tty->driver_data;
+	struct tty_port *port = &state->port;
 	struct uart_port *uport = state->uart_port;
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 
 	if (uport->type != PORT_UNKNOWN)
 		uport->ops->break_ctl(uport, break_state);
 
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 	return 0;
 }
 
 static int uart_do_autoconfig(struct uart_state *state)
 {
 	struct uart_port *uport = state->uart_port;
+	struct tty_port *port = &state->port;
 	int flags, ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -964,7 +969,7 @@ static int uart_do_autoconfig(struct uart_state *state)
 	 * changing, and hence any extra opens of the port while
 	 * we're auto-configuring.
 	 */
-	if (mutex_lock_interruptible(&state->mutex))
+	if (mutex_lock_interruptible(&port->mutex))
 		return -ERESTARTSYS;
 
 	ret = -EBUSY;
@@ -990,7 +995,7 @@ static int uart_do_autoconfig(struct uart_state *state)
 
 		ret = uart_startup(state, 1);
 	}
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 	return ret;
 }
 
@@ -1093,6 +1098,7 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 	   unsigned long arg)
 {
 	struct uart_state *state = tty->driver_data;
+	struct tty_port *port = &state->port;
 	void __user *uarg = (void __user *)arg;
 	int ret = -ENOIOCTLCMD;
 
@@ -1143,7 +1149,7 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 	if (ret != -ENOIOCTLCMD)
 		goto out;
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 
 	if (tty_hung_up_p(filp)) {
 		ret = -EIO;
@@ -1167,7 +1173,7 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 	}
 	}
 out_up:
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 out:
 	return ret;
 }
@@ -1266,7 +1272,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 
 	pr_debug("uart_close(%d) called\n", uport->line);
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 
 	if (tty_hung_up_p(filp))
 		goto done;
@@ -1340,7 +1346,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	wake_up_interruptible(&port->open_wait);
 
 done:
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 }
 
 static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
@@ -1416,7 +1422,7 @@ static void uart_hangup(struct tty_struct *tty)
 	BUG_ON(!kernel_locked());
 	pr_debug("uart_hangup(%d)\n", state->uart_port->line);
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 	if (port->flags & ASYNC_NORMAL_ACTIVE) {
 		uart_flush_buffer(tty);
 		uart_shutdown(state);
@@ -1426,7 +1432,7 @@ static void uart_hangup(struct tty_struct *tty)
 		wake_up_interruptible(&port->open_wait);
 		wake_up_interruptible(&state->delta_msr_wait);
 	}
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 }
 
 /*
@@ -1528,9 +1534,9 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 		if (mctrl & TIOCM_CAR)
 			break;
 
-		mutex_unlock(&state->mutex);
+		mutex_unlock(&port->mutex);
 		schedule();
-		mutex_lock(&state->mutex);
+		mutex_lock(&port->mutex);
 
 		if (signal_pending(current))
 			break;
@@ -1553,15 +1559,17 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 static struct uart_state *uart_get(struct uart_driver *drv, int line)
 {
 	struct uart_state *state;
+	struct tty_port *port;
 	int ret = 0;
 
 	state = drv->state + line;
-	if (mutex_lock_interruptible(&state->mutex)) {
+	port = &state->port;
+	if (mutex_lock_interruptible(&port->mutex)) {
 		ret = -ERESTARTSYS;
 		goto err;
 	}
 
-	state->port.count++;
+	port->count++;
 	if (!state->uart_port || state->uart_port->flags & UPF_DEAD) {
 		ret = -ENXIO;
 		goto err_unlock;
@@ -1569,8 +1577,8 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 	return state;
 
  err_unlock:
-	state->port.count--;
-	mutex_unlock(&state->mutex);
+	port->count--;
+	mutex_unlock(&port->mutex);
  err:
 	return ERR_PTR(ret);
 }
@@ -1636,7 +1644,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	if (tty_hung_up_p(filp)) {
 		retval = -EAGAIN;
 		port->count--;
-		mutex_unlock(&state->mutex);
+		mutex_unlock(&port->mutex);
 		goto fail;
 	}
 
@@ -1656,7 +1664,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	 */
 	if (retval == 0)
 		retval = uart_block_til_ready(filp, state);
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 
 	/*
 	 * If this is the first open to succeed, adjust things to suit.
@@ -1667,7 +1675,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 		uart_update_termios(state);
 	}
 
- fail:
+fail:
 	return retval;
 }
 
@@ -1689,57 +1697,58 @@ static const char *uart_type(struct uart_port *port)
 static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i)
 {
 	struct uart_state *state = drv->state + i;
+	struct tty_port *port = &state->port;
 	int pm_state;
-	struct uart_port *port = state->uart_port;
+	struct uart_port *uport = state->uart_port;
 	char stat_buf[32];
 	unsigned int status;
 	int mmio;
 
-	if (!port)
+	if (!uport)
 		return;
 
-	mmio = port->iotype >= UPIO_MEM;
+	mmio = uport->iotype >= UPIO_MEM;
 	seq_printf(m, "%d: uart:%s %s%08llX irq:%d",
-			port->line, uart_type(port),
+			uport->line, uart_type(uport),
 			mmio ? "mmio:0x" : "port:",
-			mmio ? (unsigned long long)port->mapbase
-			     : (unsigned long long) port->iobase,
-			port->irq);
+			mmio ? (unsigned long long)uport->mapbase
+			     : (unsigned long long)uport->iobase,
+			uport->irq);
 
-	if (port->type == PORT_UNKNOWN) {
+	if (uport->type == PORT_UNKNOWN) {
 		seq_putc(m, '\n');
 		return;
 	}
 
 	if (capable(CAP_SYS_ADMIN)) {
-		mutex_lock(&state->mutex);
+		mutex_lock(&port->mutex);
 		pm_state = state->pm_state;
 		if (pm_state)
 			uart_change_pm(state, 0);
-		spin_lock_irq(&port->lock);
-		status = port->ops->get_mctrl(port);
-		spin_unlock_irq(&port->lock);
+		spin_lock_irq(&uport->lock);
+		status = uport->ops->get_mctrl(uport);
+		spin_unlock_irq(&uport->lock);
 		if (pm_state)
 			uart_change_pm(state, pm_state);
-		mutex_unlock(&state->mutex);
+		mutex_unlock(&port->mutex);
 
 		seq_printf(m, " tx:%d rx:%d",
-				port->icount.tx, port->icount.rx);
-		if (port->icount.frame)
+				uport->icount.tx, uport->icount.rx);
+		if (uport->icount.frame)
 			seq_printf(m, " fe:%d",
-				port->icount.frame);
-		if (port->icount.parity)
+				uport->icount.frame);
+		if (uport->icount.parity)
 			seq_printf(m, " pe:%d",
-				port->icount.parity);
-		if (port->icount.brk)
+				uport->icount.parity);
+		if (uport->icount.brk)
 			seq_printf(m, " brk:%d",
-				port->icount.brk);
-		if (port->icount.overrun)
+				uport->icount.brk);
+		if (uport->icount.overrun)
 			seq_printf(m, " oe:%d",
-				port->icount.overrun);
+				uport->icount.overrun);
 
 #define INFOBIT(bit, str) \
-	if (port->mctrl & (bit)) \
+	if (uport->mctrl & (bit)) \
 		strncat(stat_buf, (str), sizeof(stat_buf) - \
 			strlen(stat_buf) - 2)
 #define STATBIT(bit, str) \
@@ -1991,11 +2000,11 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 	struct device *tty_dev;
 	struct uart_match match = {uport, drv};
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 
 	if (!console_suspend_enabled && uart_console(uport)) {
 		/* we're going to avoid suspending serial console */
-		mutex_unlock(&state->mutex);
+		mutex_unlock(&port->mutex);
 		return 0;
 	}
 
@@ -2003,7 +2012,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 	if (device_may_wakeup(tty_dev)) {
 		enable_irq_wake(uport->irq);
 		put_device(tty_dev);
-		mutex_unlock(&state->mutex);
+		mutex_unlock(&port->mutex);
 		return 0;
 	}
 	uport->suspended = 1;
@@ -2045,7 +2054,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 
 	uart_change_pm(state, 3);
 
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 
 	return 0;
 }
@@ -2057,18 +2066,18 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 	struct device *tty_dev;
 	struct uart_match match = {uport, drv};
 
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 
 	if (!console_suspend_enabled && uart_console(uport)) {
 		/* no need to resume serial console, it wasn't suspended */
-		mutex_unlock(&state->mutex);
+		mutex_unlock(&port->mutex);
 		return 0;
 	}
 
 	tty_dev = device_find_child(uport->dev, &match, serial_match_port);
 	if (!uport->suspended && device_may_wakeup(tty_dev)) {
 		disable_irq_wake(uport->irq);
-		mutex_unlock(&state->mutex);
+		mutex_unlock(&port->mutex);
 		return 0;
 	}
 	uport->suspended = 0;
@@ -2124,7 +2133,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 		clear_bit(ASYNCB_SUSPENDED, &port->flags);
 	}
 
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 
 	return 0;
 }
@@ -2364,12 +2373,11 @@ int uart_register_driver(struct uart_driver *drv)
 	 */
 	for (i = 0; i < drv->nr; i++) {
 		struct uart_state *state = drv->state + i;
+		struct tty_port *port = &state->port;
 
-		mutex_init(&state->mutex);
-
-		tty_port_init(&state->port);
-		state->port.close_delay     = 500;	/* .5 seconds */
-		state->port.closing_wait    = 30000;	/* 30 seconds */
+		tty_port_init(port);
+		port->close_delay     = 500;	/* .5 seconds */
+		port->closing_wait    = 30000;	/* 30 seconds */
 		init_waitqueue_head(&state->delta_msr_wait);
 		tasklet_init(&state->tlet, uart_tasklet_action,
 			     (unsigned long)state);
@@ -2419,62 +2427,64 @@ struct tty_driver *uart_console_device(struct console *co, int *index)
  *	level uart drivers to expand uart_port, rather than having yet
  *	more levels of structures.
  */
-int uart_add_one_port(struct uart_driver *drv, struct uart_port *port)
+int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport)
 {
 	struct uart_state *state;
+	struct tty_port *port;
 	int ret = 0;
 	struct device *tty_dev;
 
 	BUG_ON(in_interrupt());
 
-	if (port->line >= drv->nr)
+	if (uport->line >= drv->nr)
 		return -EINVAL;
 
-	state = drv->state + port->line;
+	state = drv->state + uport->line;
+	port = &state->port;
 
 	mutex_lock(&port_mutex);
-	mutex_lock(&state->mutex);
+	mutex_lock(&port->mutex);
 	if (state->uart_port) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	state->uart_port = port;
+	state->uart_port = uport;
 	state->pm_state = -1;
 
-	port->cons = drv->cons;
-	port->state = state;
+	uport->cons = drv->cons;
+	uport->state = state;
 
 	/*
 	 * If this port is a console, then the spinlock is already
 	 * initialised.
 	 */
-	if (!(uart_console(port) && (port->cons->flags & CON_ENABLED))) {
-		spin_lock_init(&port->lock);
-		lockdep_set_class(&port->lock, &port_lock_key);
+	if (!(uart_console(uport) && (uport->cons->flags & CON_ENABLED))) {
+		spin_lock_init(&uport->lock);
+		lockdep_set_class(&uport->lock, &port_lock_key);
 	}
 
-	uart_configure_port(drv, state, port);
+	uart_configure_port(drv, state, uport);
 
 	/*
 	 * Register the port whether it's detected or not.  This allows
 	 * setserial to be used to alter this ports parameters.
 	 */
-	tty_dev = tty_register_device(drv->tty_driver, port->line, port->dev);
+	tty_dev = tty_register_device(drv->tty_driver, uport->line, uport->dev);
 	if (likely(!IS_ERR(tty_dev))) {
 		device_init_wakeup(tty_dev, 1);
 		device_set_wakeup_enable(tty_dev, 0);
 	} else
 		printk(KERN_ERR "Cannot register tty device on line %d\n",
-		       port->line);
+		       uport->line);
 
 	/*
 	 * Ensure UPF_DEAD is not set.
 	 */
-	port->flags &= ~UPF_DEAD;
+	uport->flags &= ~UPF_DEAD;
 
  out:
-	mutex_unlock(&state->mutex);
+	mutex_unlock(&port->mutex);
 	mutex_unlock(&port_mutex);
 
 	return ret;
@@ -2489,15 +2499,16 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *port)
  *	core driver.  No further calls will be made to the low-level code
  *	for this port.
  */
-int uart_remove_one_port(struct uart_driver *drv, struct uart_port *port)
+int uart_remove_one_port(struct uart_driver *drv, struct uart_port *uport)
 {
-	struct uart_state *state = drv->state + port->line;
+	struct uart_state *state = drv->state + uport->line;
+	struct tty_port *port = &state->port;
 
 	BUG_ON(in_interrupt());
 
-	if (state->uart_port != port)
+	if (state->uart_port != uport)
 		printk(KERN_ALERT "Removing wrong port: %p != %p\n",
-			state->uart_port, port);
+			state->uart_port, uport);
 
 	mutex_lock(&port_mutex);
 
@@ -2505,28 +2516,28 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *port)
 	 * Mark the port "dead" - this prevents any opens from
 	 * succeeding while we shut down the port.
 	 */
-	mutex_lock(&state->mutex);
-	port->flags |= UPF_DEAD;
-	mutex_unlock(&state->mutex);
+	mutex_lock(&port->mutex);
+	uport->flags |= UPF_DEAD;
+	mutex_unlock(&port->mutex);
 
 	/*
 	 * Remove the devices from the tty layer
 	 */
-	tty_unregister_device(drv->tty_driver, port->line);
+	tty_unregister_device(drv->tty_driver, uport->line);
 
-	if (state->port.tty)
-		tty_vhangup(state->port.tty);
+	if (port->tty)
+		tty_vhangup(port->tty);
 
 	/*
 	 * Free the port IO and memory resources, if any.
 	 */
-	if (port->type != PORT_UNKNOWN)
-		port->ops->release_port(port);
+	if (uport->type != PORT_UNKNOWN)
+		uport->ops->release_port(uport);
 
 	/*
 	 * Indicate that there isn't a port here anymore.
 	 */
-	port->type = PORT_UNKNOWN;
+	uport->type = PORT_UNKNOWN;
 
 	/*
 	 * Kill the tasklet, and free resources.
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index f98dc78abb27..27767ea5fa29 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -351,8 +351,6 @@ struct uart_state {
 	struct tasklet_struct	tlet;
 	wait_queue_head_t	delta_msr_wait;
 	struct uart_port	*uart_port;
-
-	struct mutex		mutex;
 };
 
 #define UART_XMIT_SIZE	PAGE_SIZE
-- 
cgit v1.2.3-71-gd317


From bdc04e3174e18f475289fa8f4144f66686326b7e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:31 -0700
Subject: serial: move delta_msr_wait into the tty_port

This is used by various drivers not just serial and can be extracted
as commonality

Signed-off-by: Alan Cox <alan@linux.intel.com>
---
 drivers/char/cyclades.c         |  9 ++++-----
 drivers/char/esp.c              |  7 +++----
 drivers/char/mxser.c            |  8 +++-----
 drivers/char/tty_port.c         |  2 ++
 drivers/serial/8250.c           |  2 +-
 drivers/serial/amba-pl010.c     |  2 +-
 drivers/serial/amba-pl011.c     |  2 +-
 drivers/serial/atmel_serial.c   |  2 +-
 drivers/serial/icom.c           |  2 +-
 drivers/serial/imx.c            |  4 ++--
 drivers/serial/ioc3_serial.c    |  8 ++++----
 drivers/serial/ioc4_serial.c    |  8 ++++----
 drivers/serial/ip22zilog.c      |  2 +-
 drivers/serial/msm_serial.c     |  2 +-
 drivers/serial/pmac_zilog.c     |  2 +-
 drivers/serial/pnx8xxx_uart.c   |  2 +-
 drivers/serial/pxa.c            |  2 +-
 drivers/serial/sa1100.c         |  2 +-
 drivers/serial/sb1250-duart.c   |  2 +-
 drivers/serial/serial_core.c    | 18 +++++++++++-------
 drivers/serial/serial_ks8695.c  |  2 +-
 drivers/serial/serial_lh7a40x.c |  2 +-
 drivers/serial/sunsab.c         |  2 +-
 drivers/serial/sunsu.c          |  2 +-
 drivers/serial/sunzilog.c       |  2 +-
 drivers/serial/timbuart.c       |  2 +-
 drivers/serial/vr41xx_siu.c     |  2 +-
 drivers/serial/zs.c             |  2 +-
 include/linux/cyclades.h        |  1 -
 include/linux/hayesesp.h        |  1 -
 include/linux/serial_core.h     |  1 -
 include/linux/tty.h             |  1 +
 32 files changed, 54 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 70bd61b2a7d7..df5038bbcbc2 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -729,7 +729,7 @@ static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
 		if (mdm_change & CyRI)
 			info->icount.rng++;
 
-		wake_up_interruptible(&info->delta_msr_wait);
+		wake_up_interruptible(&info->port.delta_msr_wait);
 	}
 
 	if ((mdm_change & CyDCD) && (info->port.flags & ASYNC_CHECK_CD)) {
@@ -1197,7 +1197,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 			break;
 		}
 		if (delta_count)
-			wake_up_interruptible(&info->delta_msr_wait);
+			wake_up_interruptible(&info->port.delta_msr_wait);
 		if (special_count)
 			tty_schedule_flip(tty);
 		tty_kref_put(tty);
@@ -1464,7 +1464,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 		spin_lock_irqsave(&card->card_lock, flags);
 
 		/* Clear delta_msr_wait queue to avoid mem leaks. */
-		wake_up_interruptible(&info->delta_msr_wait);
+		wake_up_interruptible(&info->port.delta_msr_wait);
 
 		if (info->port.xmit_buf) {
 			unsigned char *temp;
@@ -2788,7 +2788,7 @@ cy_ioctl(struct tty_struct *tty, struct file *file,
 		/* note the counters on entry */
 		cnow = info->icount;
 		spin_unlock_irqrestore(&info->card->card_lock, flags);
-		ret_val = wait_event_interruptible(info->delta_msr_wait,
+		ret_val = wait_event_interruptible(info->port.delta_msr_wait,
 				cy_cflags_changed(info, arg, &cnow));
 		break;
 
@@ -3153,7 +3153,6 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 		info->port.close_delay = 5 * HZ / 10;
 		info->port.flags = STD_COM_FLAGS;
 		init_completion(&info->shutdown_wait);
-		init_waitqueue_head(&info->delta_msr_wait);
 
 		if (cy_is_Z(cinfo)) {
 			struct FIRM_ID *firm_id = cinfo->base_addr + ID_ADDRESS;
diff --git a/drivers/char/esp.c b/drivers/char/esp.c
index a5c59fc2b0ff..b19d43cd9542 100644
--- a/drivers/char/esp.c
+++ b/drivers/char/esp.c
@@ -572,7 +572,7 @@ static void check_modem_status(struct esp_struct *info)
 			info->icount.dcd++;
 		if (status & UART_MSR_DCTS)
 			info->icount.cts++;
-		wake_up_interruptible(&info->delta_msr_wait);
+		wake_up_interruptible(&info->port.delta_msr_wait);
 	}
 
 	if ((info->port.flags & ASYNC_CHECK_CD) && (status & UART_MSR_DDCD)) {
@@ -927,7 +927,7 @@ static void shutdown(struct esp_struct *info)
 	 * clear delta_msr_wait queue to avoid mem leaks: we may free the irq
 	 * here so the queue might never be waken up
 	 */
-	wake_up_interruptible(&info->delta_msr_wait);
+	wake_up_interruptible(&info->port.delta_msr_wait);
 	wake_up_interruptible(&info->break_wait);
 
 	/* stop a DMA transfer on the port being closed */
@@ -1800,7 +1800,7 @@ static int rs_ioctl(struct tty_struct *tty, struct file *file,
 		spin_unlock_irqrestore(&info->lock, flags);
 		while (1) {
 			/* FIXME: convert to new style wakeup */
-			interruptible_sleep_on(&info->delta_msr_wait);
+			interruptible_sleep_on(&info->port.delta_msr_wait);
 			/* see if a signal did it */
 			if (signal_pending(current))
 				return -ERESTARTSYS;
@@ -2452,7 +2452,6 @@ static int __init espserial_init(void)
 		info->config.flow_off = flow_off;
 		info->config.pio_threshold = pio_threshold;
 		info->next_port = ports;
-		init_waitqueue_head(&info->delta_msr_wait);
 		init_waitqueue_head(&info->break_wait);
 		ports = info;
 		printk(KERN_INFO "ttyP%d at 0x%04x (irq = %d) is an ESP ",
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index 30544ca5e956..37058ff7da7d 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -258,7 +258,6 @@ struct mxser_port {
 	struct mxser_mon mon_data;
 
 	spinlock_t slock;
-	wait_queue_head_t delta_msr_wait;
 };
 
 struct mxser_board {
@@ -818,7 +817,7 @@ static void mxser_check_modem_status(struct tty_struct *tty,
 	if (status & UART_MSR_DCTS)
 		port->icount.cts++;
 	port->mon_data.modem_status = status;
-	wake_up_interruptible(&port->delta_msr_wait);
+	wake_up_interruptible(&port->port.delta_msr_wait);
 
 	if ((port->port.flags & ASYNC_CHECK_CD) && (status & UART_MSR_DDCD)) {
 		if (status & UART_MSR_DCD)
@@ -973,7 +972,7 @@ static void mxser_shutdown(struct tty_struct *tty)
 	 * clear delta_msr_wait queue to avoid mem leaks: we may free the irq
 	 * here so the queue might never be waken up
 	 */
-	wake_up_interruptible(&info->delta_msr_wait);
+	wake_up_interruptible(&info->port.delta_msr_wait);
 
 	/*
 	 * Free the IRQ, if necessary
@@ -1762,7 +1761,7 @@ static int mxser_ioctl(struct tty_struct *tty, struct file *file,
 		cnow = info->icount;	/* note the counters on entry */
 		spin_unlock_irqrestore(&info->slock, flags);
 
-		return wait_event_interruptible(info->delta_msr_wait,
+		return wait_event_interruptible(info->port.delta_msr_wait,
 				mxser_cflags_changed(info, arg, &cnow));
 	/*
 	 * Get counter of input serial line interrupts (DCD,RI,DSR,CTS)
@@ -2414,7 +2413,6 @@ static int __devinit mxser_initbrd(struct mxser_board *brd,
 		info->port.close_delay = 5 * HZ / 10;
 		info->port.closing_wait = 30 * HZ;
 		info->normal_termios = mxvar_sdriver->init_termios;
-		init_waitqueue_head(&info->delta_msr_wait);
 		memset(&info->mon_data, 0, sizeof(struct mxser_mon));
 		info->err_shadow = 0;
 		spin_lock_init(&info->slock);
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 549bd0fa8bb6..c767e30a1425 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -23,6 +23,7 @@ void tty_port_init(struct tty_port *port)
 	memset(port, 0, sizeof(*port));
 	init_waitqueue_head(&port->open_wait);
 	init_waitqueue_head(&port->close_wait);
+	init_waitqueue_head(&port->delta_msr_wait);
 	mutex_init(&port->mutex);
 	spin_lock_init(&port->lock);
 	port->close_delay = (50 * HZ) / 100;
@@ -124,6 +125,7 @@ void tty_port_hangup(struct tty_port *port)
 	port->tty = NULL;
 	spin_unlock_irqrestore(&port->lock, flags);
 	wake_up_interruptible(&port->open_wait);
+	wake_up_interruptible(&port->delta_msr_wait);
 	tty_port_shutdown(port);
 }
 EXPORT_SYMBOL(tty_port_hangup);
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index e415c5eca599..2209620d2349 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1510,7 +1510,7 @@ static unsigned int check_modem_status(struct uart_8250_port *up)
 		if (status & UART_MSR_DCTS)
 			uart_handle_cts_change(&up->port, status & UART_MSR_CTS);
 
-		wake_up_interruptible(&up->port.state->delta_msr_wait);
+		wake_up_interruptible(&up->port.state->port.delta_msr_wait);
 	}
 
 	return status;
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index 39032413d4a1..429a8ae86933 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -225,7 +225,7 @@ static void pl010_modem_status(struct uart_amba_port *uap)
 	if (delta & UART01x_FR_CTS)
 		uart_handle_cts_change(&uap->port, status & UART01x_FR_CTS);
 
-	wake_up_interruptible(&uap->port.state->delta_msr_wait);
+	wake_up_interruptible(&uap->port.state->port.delta_msr_wait);
 }
 
 static irqreturn_t pl010_int(int irq, void *dev_id)
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index ef82a34baf0f..ef7adc8135dd 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -226,7 +226,7 @@ static void pl011_modem_status(struct uart_amba_port *uap)
 	if (delta & UART01x_FR_CTS)
 		uart_handle_cts_change(&uap->port, status & UART01x_FR_CTS);
 
-	wake_up_interruptible(&uap->port.state->delta_msr_wait);
+	wake_up_interruptible(&uap->port.state->port.delta_msr_wait);
 }
 
 static irqreturn_t pl011_int(int irq, void *dev_id)
diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c
index 963e3c12af41..3551c5cb7094 100644
--- a/drivers/serial/atmel_serial.c
+++ b/drivers/serial/atmel_serial.c
@@ -776,7 +776,7 @@ static void atmel_tasklet_func(unsigned long data)
 		if (status_change & ATMEL_US_CTS)
 			uart_handle_cts_change(port, !(status & ATMEL_US_CTS));
 
-		wake_up_interruptible(&port->state->delta_msr_wait);
+		wake_up_interruptible(&port->state->port.delta_msr_wait);
 
 		atmel_port->irq_status_prev = status;
 	}
diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index f86c47e08a06..2d7feecaf492 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -695,7 +695,7 @@ static inline void check_modem_status(struct icom_port *icom_port)
 					       delta_status & ICOM_CTS);
 
 		wake_up_interruptible(&icom_port->uart_port.state->
-				      delta_msr_wait);
+				      port.delta_msr_wait);
 		old_status = status;
 	}
 	spin_unlock(&icom_port->uart_port.lock);
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 1febeafcb97a..18130f11238e 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -224,7 +224,7 @@ static void imx_mctrl_check(struct imx_port *sport)
 	if (changed & TIOCM_CTS)
 		uart_handle_cts_change(&sport->port, status & TIOCM_CTS);
 
-	wake_up_interruptible(&sport->port.state->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->port.delta_msr_wait);
 }
 
 /*
@@ -388,7 +388,7 @@ static irqreturn_t imx_rtsint(int irq, void *dev_id)
 
 	writel(USR1_RTSD, sport->port.membase + USR1);
 	uart_handle_cts_change(&sport->port, !!val);
-	wake_up_interruptible(&sport->port.state->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->port.delta_msr_wait);
 
 	spin_unlock_irqrestore(&sport->port.lock, flags);
 	return IRQ_HANDLED;
diff --git a/drivers/serial/ioc3_serial.c b/drivers/serial/ioc3_serial.c
index de4ab1bfee8d..d8983dd5c4b2 100644
--- a/drivers/serial/ioc3_serial.c
+++ b/drivers/serial/ioc3_serial.c
@@ -1287,7 +1287,7 @@ static inline int do_read(struct uart_port *the_port, char *buf, int len)
 							(port->ip_port, 0);
 						wake_up_interruptible
 						    (&the_port->state->
-						     delta_msr_wait);
+						     port.delta_msr_wait);
 					}
 
 					/* If we had any data to return, we
@@ -1491,7 +1491,7 @@ ioc3uart_intr_one(struct ioc3_submodule *is,
 				uart_handle_dcd_change(the_port,
 						shadow & SHADOW_DCD);
 				wake_up_interruptible
-				    (&the_port->state->delta_msr_wait);
+				    (&the_port->state->port.delta_msr_wait);
 			} else if ((port->ip_notify & N_DDCD)
 				   && !(shadow & SHADOW_DCD)) {
 				/* Flag delta DCD/no DCD */
@@ -1511,7 +1511,7 @@ ioc3uart_intr_one(struct ioc3_submodule *is,
 				uart_handle_cts_change(the_port, shadow
 						& SHADOW_CTS);
 				wake_up_interruptible
-				    (&the_port->state->delta_msr_wait);
+				    (&the_port->state->port.delta_msr_wait);
 			}
 		}
 
@@ -1728,7 +1728,7 @@ static void ic3_shutdown(struct uart_port *the_port)
 		return;
 
 	state = the_port->state;
-	wake_up_interruptible(&state->delta_msr_wait);
+	wake_up_interruptible(&state->port.delta_msr_wait);
 
 	spin_lock_irqsave(&the_port->lock, port_flags);
 	set_notification(port, N_ALL, 0);
diff --git a/drivers/serial/ioc4_serial.c b/drivers/serial/ioc4_serial.c
index 2055d323f15f..2e02c3026d24 100644
--- a/drivers/serial/ioc4_serial.c
+++ b/drivers/serial/ioc4_serial.c
@@ -1882,7 +1882,7 @@ static void handle_intr(void *arg, uint32_t sio_ir)
 				the_port = port->ip_port;
 				the_port->icount.dcd = 1;
 				wake_up_interruptible
-					    (&the_port->state->delta_msr_wait);
+					    (&the_port->state->port.delta_msr_wait);
 			} else if ((port->ip_notify & N_DDCD)
 					&& !(shadow & IOC4_SHADOW_DCD)) {
 				/* Flag delta DCD/no DCD */
@@ -1904,7 +1904,7 @@ static void handle_intr(void *arg, uint32_t sio_ir)
 				the_port->icount.cts =
 					(shadow & IOC4_SHADOW_CTS) ? 1 : 0;
 				wake_up_interruptible
-					(&the_port->state->delta_msr_wait);
+					(&the_port->state->port.delta_msr_wait);
 			}
 		}
 
@@ -2237,7 +2237,7 @@ static inline int do_read(struct uart_port *the_port, unsigned char *buf,
 						the_port->icount.dcd = 0;
 						wake_up_interruptible
 						    (&the_port->state->
-							delta_msr_wait);
+							port.delta_msr_wait);
 					}
 
 					/* If we had any data to return, we
@@ -2439,7 +2439,7 @@ static void ic4_shutdown(struct uart_port *the_port)
 	state = the_port->state;
 	port->ip_port = NULL;
 
-	wake_up_interruptible(&state->delta_msr_wait);
+	wake_up_interruptible(&state->port.delta_msr_wait);
 
 	if (state->port.tty)
 		set_bit(TTY_IO_ERROR, &state->port.tty->flags);
diff --git a/drivers/serial/ip22zilog.c b/drivers/serial/ip22zilog.c
index 2e847deb41dc..ebff4a1d4bcc 100644
--- a/drivers/serial/ip22zilog.c
+++ b/drivers/serial/ip22zilog.c
@@ -354,7 +354,7 @@ static void ip22zilog_status_handle(struct uart_ip22zilog_port *up,
 			uart_handle_cts_change(&up->port,
 					       (status & CTS));
 
-		wake_up_interruptible(&up->port.state->delta_msr_wait);
+		wake_up_interruptible(&up->port.state->port.delta_msr_wait);
 	}
 
 	up->prev_status = status;
diff --git a/drivers/serial/msm_serial.c b/drivers/serial/msm_serial.c
index ff18d50c99c1..b05c5aa02cb4 100644
--- a/drivers/serial/msm_serial.c
+++ b/drivers/serial/msm_serial.c
@@ -169,7 +169,7 @@ static void handle_delta_cts(struct uart_port *port)
 {
 	msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR);
 	port->icount.cts++;
-	wake_up_interruptible(&port->state->delta_msr_wait);
+	wake_up_interruptible(&port->state->port.delta_msr_wait);
 }
 
 static irqreturn_t msm_irq(int irq, void *dev_id)
diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index 0dc786835dca..0700cd10b97c 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -369,7 +369,7 @@ static void pmz_status_handle(struct uart_pmac_port *uap)
 			uart_handle_cts_change(&uap->port,
 					       !(status & CTS));
 
-		wake_up_interruptible(&uap->port.state->delta_msr_wait);
+		wake_up_interruptible(&uap->port.state->port.delta_msr_wait);
 	}
 
 	if (status & BRK_ABRT)
diff --git a/drivers/serial/pnx8xxx_uart.c b/drivers/serial/pnx8xxx_uart.c
index 2da747635275..0aa75a97531c 100644
--- a/drivers/serial/pnx8xxx_uart.c
+++ b/drivers/serial/pnx8xxx_uart.c
@@ -100,7 +100,7 @@ static void pnx8xxx_mctrl_check(struct pnx8xxx_port *sport)
 	if (changed & TIOCM_CTS)
 		uart_handle_cts_change(&sport->port, status & TIOCM_CTS);
 
-	wake_up_interruptible(&sport->port.state->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->port.delta_msr_wait);
 }
 
 /*
diff --git a/drivers/serial/pxa.c b/drivers/serial/pxa.c
index ad48919c0415..6443b7ff274a 100644
--- a/drivers/serial/pxa.c
+++ b/drivers/serial/pxa.c
@@ -220,7 +220,7 @@ static inline void check_modem_status(struct uart_pxa_port *up)
 	if (status & UART_MSR_DCTS)
 		uart_handle_cts_change(&up->port, status & UART_MSR_CTS);
 
-	wake_up_interruptible(&up->port.state->delta_msr_wait);
+	wake_up_interruptible(&up->port.state->port.delta_msr_wait);
 }
 
 /*
diff --git a/drivers/serial/sa1100.c b/drivers/serial/sa1100.c
index 61ef3ae24927..7f5e26873220 100644
--- a/drivers/serial/sa1100.c
+++ b/drivers/serial/sa1100.c
@@ -117,7 +117,7 @@ static void sa1100_mctrl_check(struct sa1100_port *sport)
 	if (changed & TIOCM_CTS)
 		uart_handle_cts_change(&sport->port, status & TIOCM_CTS);
 
-	wake_up_interruptible(&sport->port.state->delta_msr_wait);
+	wake_up_interruptible(&sport->port.state->port.delta_msr_wait);
 }
 
 /*
diff --git a/drivers/serial/sb1250-duart.c b/drivers/serial/sb1250-duart.c
index fa5f303b36d3..a2f2b3254499 100644
--- a/drivers/serial/sb1250-duart.c
+++ b/drivers/serial/sb1250-duart.c
@@ -440,7 +440,7 @@ static void sbd_status_handle(struct sbd_port *sport)
 
 	if (delta & ((M_DUART_IN_PIN2_VAL | M_DUART_IN_PIN0_VAL) <<
 		     S_DUART_IN_PIN_CHNG))
-		wake_up_interruptible(&uport->state->delta_msr_wait);
+		wake_up_interruptible(&uport->state->port.delta_msr_wait);
 }
 
 static irqreturn_t sbd_interrupt(int irq, void *dev_id)
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 9d42e57e1971..e16d15343dfd 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -215,7 +215,8 @@ static int uart_startup(struct uart_state *state, int init_hw)
 static void uart_shutdown(struct uart_state *state)
 {
 	struct uart_port *uport = state->uart_port;
-	struct tty_struct *tty = state->port.tty;
+	struct tty_port *port = &state->port;
+	struct tty_struct *tty = port->tty;
 
 	/*
 	 * Set the TTY IO error marker
@@ -223,7 +224,7 @@ static void uart_shutdown(struct uart_state *state)
 	if (tty)
 		set_bit(TTY_IO_ERROR, &tty->flags);
 
-	if (test_and_clear_bit(ASYNCB_INITIALIZED, &state->port.flags)) {
+	if (test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags)) {
 		/*
 		 * Turn off DTR and RTS early.
 		 */
@@ -237,7 +238,7 @@ static void uart_shutdown(struct uart_state *state)
 		 * any outstanding file descriptors should be pointing at
 		 * hung_up_tty_fops now.
 		 */
-		wake_up_interruptible(&state->delta_msr_wait);
+		wake_up_interruptible(&port->delta_msr_wait);
 
 		/*
 		 * Free the IRQ and disable the port.
@@ -1004,11 +1005,15 @@ static int uart_do_autoconfig(struct uart_state *state)
  * - mask passed in arg for lines of interest
  *   (use |'ed TIOCM_RNG/DSR/CD/CTS for masking)
  * Caller should use TIOCGICOUNT to see which one it was
+ *
+ * FIXME: This wants extracting into a common all driver implementation
+ * of TIOCMWAIT using tty_port.
  */
 static int
 uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 {
 	struct uart_port *uport = state->uart_port;
+	struct tty_port *port = &state->port;
 	DECLARE_WAITQUEUE(wait, current);
 	struct uart_icount cprev, cnow;
 	int ret;
@@ -1025,7 +1030,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	uport->ops->enable_ms(uport);
 	spin_unlock_irq(&uport->lock);
 
-	add_wait_queue(&state->delta_msr_wait, &wait);
+	add_wait_queue(&port->delta_msr_wait, &wait);
 	for (;;) {
 		spin_lock_irq(&uport->lock);
 		memcpy(&cnow, &uport->icount, sizeof(struct uart_icount));
@@ -1053,7 +1058,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(&state->delta_msr_wait, &wait);
+	remove_wait_queue(&port->delta_msr_wait, &wait);
 
 	return ret;
 }
@@ -1430,7 +1435,7 @@ static void uart_hangup(struct tty_struct *tty)
 		clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags);
 		port->tty = NULL;
 		wake_up_interruptible(&port->open_wait);
-		wake_up_interruptible(&state->delta_msr_wait);
+		wake_up_interruptible(&port->delta_msr_wait);
 	}
 	mutex_unlock(&port->mutex);
 }
@@ -2378,7 +2383,6 @@ int uart_register_driver(struct uart_driver *drv)
 		tty_port_init(port);
 		port->close_delay     = 500;	/* .5 seconds */
 		port->closing_wait    = 30000;	/* 30 seconds */
-		init_waitqueue_head(&state->delta_msr_wait);
 		tasklet_init(&state->tlet, uart_tasklet_action,
 			     (unsigned long)state);
 	}
diff --git a/drivers/serial/serial_ks8695.c b/drivers/serial/serial_ks8695.c
index 4560b2e70685..2e71bbc04dac 100644
--- a/drivers/serial/serial_ks8695.c
+++ b/drivers/serial/serial_ks8695.c
@@ -266,7 +266,7 @@ static irqreturn_t ks8695uart_modem_status(int irq, void *dev_id)
 	if (status & URMS_URTERI)
 		port->icount.rng++;
 
-	wake_up_interruptible(&port->state->delta_msr_wait);
+	wake_up_interruptible(&port->state->port.delta_msr_wait);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/serial/serial_lh7a40x.c b/drivers/serial/serial_lh7a40x.c
index 057fc5e8cc8d..ea744707c4d6 100644
--- a/drivers/serial/serial_lh7a40x.c
+++ b/drivers/serial/serial_lh7a40x.c
@@ -241,7 +241,7 @@ static void lh7a40xuart_modem_status (struct uart_port* port)
 	if (delta & CTS)
 		uart_handle_cts_change (port, status & CTS);
 
-	wake_up_interruptible (&port->state->delta_msr_wait);
+	wake_up_interruptible (&port->state->port.delta_msr_wait);
 }
 
 static irqreturn_t lh7a40xuart_int (int irq, void* dev_id)
diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c
index 7c4f2fe8e246..d1ad34128635 100644
--- a/drivers/serial/sunsab.c
+++ b/drivers/serial/sunsab.c
@@ -297,7 +297,7 @@ static void check_status(struct uart_sunsab_port *up,
 		up->port.icount.dsr++;
 	}
 
-	wake_up_interruptible(&up->port.state->delta_msr_wait);
+	wake_up_interruptible(&up->port.state->port.delta_msr_wait);
 }
 
 static irqreturn_t sunsab_interrupt(int irq, void *dev_id)
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 5a32365b58ad..68d262b15749 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -441,7 +441,7 @@ static void check_modem_status(struct uart_sunsu_port *up)
 	if (status & UART_MSR_DCTS)
 		uart_handle_cts_change(&up->port, status & UART_MSR_CTS);
 
-	wake_up_interruptible(&up->port.state->delta_msr_wait);
+	wake_up_interruptible(&up->port.state->port.delta_msr_wait);
 }
 
 static irqreturn_t sunsu_serial_interrupt(int irq, void *dev_id)
diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c
index 055034d12b1c..ef693ae22e7f 100644
--- a/drivers/serial/sunzilog.c
+++ b/drivers/serial/sunzilog.c
@@ -451,7 +451,7 @@ static void sunzilog_status_handle(struct uart_sunzilog_port *up,
 			uart_handle_cts_change(&up->port,
 					       (status & CTS));
 
-		wake_up_interruptible(&up->port.state->delta_msr_wait);
+		wake_up_interruptible(&up->port.state->port.delta_msr_wait);
 	}
 
 	up->prev_status = status;
diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c
index 3d40be6f389f..34b31da01d09 100644
--- a/drivers/serial/timbuart.c
+++ b/drivers/serial/timbuart.c
@@ -231,7 +231,7 @@ static void timbuart_mctrl_check(struct uart_port *port, u32 isr, u32 *ier)
 		iowrite32(CTS_DELTA, port->membase + TIMBUART_ISR);
 		cts = timbuart_get_mctrl(port);
 		uart_handle_cts_change(port, cts & TIOCM_CTS);
-		wake_up_interruptible(&port->state->delta_msr_wait);
+		wake_up_interruptible(&port->state->port.delta_msr_wait);
 	}
 
 	*ier |= CTS_DELTA;
diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c
index cf4410e6d53b..3beb6ab4fa68 100644
--- a/drivers/serial/vr41xx_siu.c
+++ b/drivers/serial/vr41xx_siu.c
@@ -386,7 +386,7 @@ static inline void check_modem_status(struct uart_port *port)
 	if (msr & UART_MSR_DCTS)
 		uart_handle_cts_change(port, msr & UART_MSR_CTS);
 
-	wake_up_interruptible(&port->state->delta_msr_wait);
+	wake_up_interruptible(&port->state->port.delta_msr_wait);
 }
 
 static inline void transmit_chars(struct uart_port *port)
diff --git a/drivers/serial/zs.c b/drivers/serial/zs.c
index b9c9fb9198d6..1a7fd3e70315 100644
--- a/drivers/serial/zs.c
+++ b/drivers/serial/zs.c
@@ -686,7 +686,7 @@ static void zs_status_handle(struct zs_port *zport, struct zs_port *zport_a)
 			uport->icount.rng++;
 
 		if (delta)
-			wake_up_interruptible(&uport->state->delta_msr_wait);
+			wake_up_interruptible(&uport->state->port.delta_msr_wait);
 
 		spin_lock(&scc->zlock);
 	}
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index bbebef7713b3..a5049eaf782d 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -578,7 +578,6 @@ struct cyclades_port {
 	struct cyclades_idle_stats	idle_stats;
 	struct cyclades_icount	icount;
 	struct completion       shutdown_wait;
-	wait_queue_head_t       delta_msr_wait;
 	int throttle;
 };
 
diff --git a/include/linux/hayesesp.h b/include/linux/hayesesp.h
index 940aeb51d53f..92b08cfe4a75 100644
--- a/include/linux/hayesesp.h
+++ b/include/linux/hayesesp.h
@@ -96,7 +96,6 @@ struct esp_struct {
 	int			xmit_head;
 	int			xmit_tail;
 	int			xmit_cnt;
-	wait_queue_head_t	delta_msr_wait;
 	wait_queue_head_t	break_wait;
 	struct async_icount	icount;	/* kernel counters for the 4 input interrupts */
 	struct hayes_esp_config config; /* port configuration */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 27767ea5fa29..bcafecd3b7c1 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -349,7 +349,6 @@ struct uart_state {
 	struct circ_buf		xmit;
 
 	struct tasklet_struct	tlet;
-	wait_queue_head_t	delta_msr_wait;
 	struct uart_port	*uart_port;
 };
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 9fdc3d84baad..0daa8a72b176 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -203,6 +203,7 @@ struct tty_port {
 	int			count;		/* Usage count */
 	wait_queue_head_t	open_wait;	/* Open waiters */
 	wait_queue_head_t	close_wait;	/* Close waiters */
+	wait_queue_head_t	delta_msr_wait;	/* Modem status change */
 	unsigned long		flags;		/* TTY flags ASY_*/
 	struct mutex		mutex;		/* Locking */
 	unsigned char		*xmit_buf;	/* Optional buffer */
-- 
cgit v1.2.3-71-gd317


From b58d13a0216d4e0753668214f23e1d2c24c30f8c Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:32 -0700
Subject: serial: move port users helper

This little helper is now tty_port specific and useful generally so move it

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/serial_core.c | 6 ++----
 include/linux/tty.h          | 5 +++++
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index e16d15343dfd..3c45a8d7eb7f 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -52,8 +52,6 @@ static struct lock_class_key port_lock_key;
 
 #define HIGH_BITS_OFFSET	((sizeof(long)-sizeof(int))*8)
 
-#define uart_users(state)	((state)->port.count + (state)->port.blocked_open)
-
 #ifdef CONFIG_SERIAL_CORE_CONSOLE
 #define uart_console(port)	((port)->cons && (port)->cons->index == (port)->line)
 #else
@@ -758,7 +756,7 @@ static int uart_set_info(struct uart_state *state,
 		/*
 		 * Make sure that we are the sole user of this port.
 		 */
-		if (uart_users(state) > 1)
+		if (tty_port_users(port) > 1)
 			goto exit;
 
 		/*
@@ -974,7 +972,7 @@ static int uart_do_autoconfig(struct uart_state *state)
 		return -ERESTARTSYS;
 
 	ret = -EBUSY;
-	if (uart_users(state) == 1) {
+	if (tty_port_users(port) == 1) {
 		uart_shutdown(state);
 
 		/*
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 0daa8a72b176..a0e03309a379 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -467,6 +467,11 @@ extern int tty_port_close_start(struct tty_port *port,
 extern void tty_port_close_end(struct tty_port *port, struct tty_struct *tty);
 extern void tty_port_close(struct tty_port *port,
 				struct tty_struct *tty, struct file *filp);
+extern inline int tty_port_users(struct tty_port *port)
+{
+	return port->count + port->blocked_open;
+}
+
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
 extern int tty_set_ldisc(struct tty_struct *tty, int ldisc);
-- 
cgit v1.2.3-71-gd317


From 016af53a6de6837e5be3da68901083ea85ebb4da Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:32 -0700
Subject: serial: kill USF_CLOSING_* definitions

The serial layer for some reason uses different defines for the special
case close delays and then conditionally switches to/from the normal ones
in the ioctls.

Remove this rather pointless abstraction

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/serial_core.c | 6 +++---
 include/linux/serial_core.h  | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 3c45a8d7eb7f..69c4e20530f5 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -652,7 +652,7 @@ static int uart_get_info(struct uart_state *state,
 	tmp.xmit_fifo_size  = uport->fifosize;
 	tmp.baud_base	    = uport->uartclk / 16;
 	tmp.close_delay	    = port->close_delay / 10;
-	tmp.closing_wait    = port->closing_wait == USF_CLOSING_WAIT_NONE ?
+	tmp.closing_wait    = port->closing_wait == ASYNC_CLOSING_WAIT_NONE ?
 				ASYNC_CLOSING_WAIT_NONE :
 				port->closing_wait / 10;
 	tmp.custom_divisor  = uport->custom_divisor;
@@ -690,7 +690,7 @@ static int uart_set_info(struct uart_state *state,
 	new_serial.irq = irq_canonicalize(new_serial.irq);
 	close_delay = new_serial.close_delay * 10;
 	closing_wait = new_serial.closing_wait == ASYNC_CLOSING_WAIT_NONE ?
-			USF_CLOSING_WAIT_NONE : new_serial.closing_wait * 10;
+			ASYNC_CLOSING_WAIT_NONE : new_serial.closing_wait * 10;
 
 	/*
 	 * This semaphore protects port->count.  It is also
@@ -1307,7 +1307,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	 */
 	tty->closing = 1;
 
-	if (port->closing_wait != USF_CLOSING_WAIT_NONE)
+	if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, msecs_to_jiffies(port->closing_wait));
 
 	/*
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index bcafecd3b7c1..d58e460844dd 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -342,9 +342,6 @@ struct uart_port {
 struct uart_state {
 	struct tty_port		port;
 
-#define USF_CLOSING_WAIT_INF	(0)
-#define USF_CLOSING_WAIT_NONE	(~0U)
-
 	int			pm_state;
 	struct circ_buf		xmit;
 
-- 
cgit v1.2.3-71-gd317


From fe1ae7fdd2ee603f2d95f04e09a68f7f79045127 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 19 Sep 2009 13:13:33 -0700
Subject: tty: USB serial termios bits

Various drivers have hacks to mangle termios structures. This stems from
the fact there is no nice setup hook for configuring the termios settings
when the port is created

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/tty_io.c            |  1 +
 drivers/usb/serial/ark3116.c     | 46 ++++++++++------------------------------
 drivers/usb/serial/cypress_m8.c  | 12 +++--------
 drivers/usb/serial/empeg.c       | 12 +++--------
 drivers/usb/serial/iuu_phoenix.c | 31 ++++++++++++---------------
 drivers/usb/serial/kobil_sct.c   | 22 +++++++++----------
 drivers/usb/serial/oti6858.c     | 21 +++++++++---------
 drivers/usb/serial/spcp8x5.c     | 21 +++++++++---------
 drivers/usb/serial/usb-serial.c  | 38 ++++++++++++++++++++++++++++++++-
 drivers/usb/serial/whiteheat.c   |  6 +++---
 include/linux/usb/serial.h       |  3 +++
 11 files changed, 106 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 385cca7074da..05f443c47bbe 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1184,6 +1184,7 @@ int tty_init_termios(struct tty_struct *tty)
 	tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(tty_init_termios);
 
 /**
  *	tty_driver_install_tty() - install a tty entry in the driver
diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c
index 7ddde4ddfb4b..5d25d3e52bf6 100644
--- a/drivers/usb/serial/ark3116.c
+++ b/drivers/usb/serial/ark3116.c
@@ -35,11 +35,6 @@ static struct usb_device_id id_table [] = {
 };
 MODULE_DEVICE_TABLE(usb, id_table);
 
-struct ark3116_private {
-	spinlock_t lock;
-	u8 termios_initialized;
-};
-
 static inline void ARK3116_SND(struct usb_serial *serial, int seq,
 			       __u8 request, __u8 requesttype,
 			       __u16 value, __u16 index)
@@ -82,22 +77,11 @@ static inline void ARK3116_RCV_QUIET(struct usb_serial *serial,
 static int ark3116_attach(struct usb_serial *serial)
 {
 	char *buf;
-	struct ark3116_private *priv;
-	int i;
-
-	for (i = 0; i < serial->num_ports; ++i) {
-		priv = kzalloc(sizeof(struct ark3116_private), GFP_KERNEL);
-		if (!priv)
-			goto cleanup;
-		spin_lock_init(&priv->lock);
-
-		usb_set_serial_port_data(serial->port[i], priv);
-	}
 
 	buf = kmalloc(1, GFP_KERNEL);
 	if (!buf) {
 		dbg("error kmalloc -> out of mem?");
-		goto cleanup;
+		return -ENOMEM;
 	}
 
 	/* 3 */
@@ -149,13 +133,16 @@ static int ark3116_attach(struct usb_serial *serial)
 
 	kfree(buf);
 	return 0;
+}
 
-cleanup:
-	for (--i; i >= 0; --i) {
-		kfree(usb_get_serial_port_data(serial->port[i]));
-		usb_set_serial_port_data(serial->port[i], NULL);
-	}
-	return -ENOMEM;
+static void ark3116_init_termios(struct tty_struct *tty)
+{
+	struct ktermios *termios = tty->termios;
+	*termios = tty_std_termios;
+	termios->c_cflag = B9600 | CS8
+				      | CREAD | HUPCL | CLOCAL;
+	termios->c_ispeed = 9600;
+	termios->c_ospeed = 9600;
 }
 
 static void ark3116_set_termios(struct tty_struct *tty,
@@ -163,10 +150,8 @@ static void ark3116_set_termios(struct tty_struct *tty,
 				struct ktermios *old_termios)
 {
 	struct usb_serial *serial = port->serial;
-	struct ark3116_private *priv = usb_get_serial_port_data(port);
 	struct ktermios *termios = tty->termios;
 	unsigned int cflag = termios->c_cflag;
-	unsigned long flags;
 	int baud;
 	int ark3116_baud;
 	char *buf;
@@ -176,16 +161,6 @@ static void ark3116_set_termios(struct tty_struct *tty,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	spin_lock_irqsave(&priv->lock, flags);
-	if (!priv->termios_initialized) {
-		*termios = tty_std_termios;
-		termios->c_cflag = B9600 | CS8
-					      | CREAD | HUPCL | CLOCAL;
-		termios->c_ispeed = 9600;
-		termios->c_ospeed = 9600;
-		priv->termios_initialized = 1;
-	}
-	spin_unlock_irqrestore(&priv->lock, flags);
 
 	cflag = termios->c_cflag;
 	termios->c_cflag &= ~(CMSPAR|CRTSCTS);
@@ -454,6 +429,7 @@ static struct usb_serial_driver ark3116_device = {
 	.num_ports =		1,
 	.attach =		ark3116_attach,
 	.set_termios =		ark3116_set_termios,
+	.init_termios =		ark3116_init_termios,
 	.ioctl =		ark3116_ioctl,
 	.tiocmget =		ark3116_tiocmget,
 	.open =			ark3116_open,
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index df1adb9a4367..e0a8b715f2f2 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -657,15 +657,7 @@ static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port)
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	/* Set termios */
-	result = cypress_write(tty, port, NULL, 0);
-
-	if (result) {
-		dev_err(&port->dev,
-			"%s - failed setting the control lines - error %d\n",
-							__func__, result);
-		return result;
-	} else
-		dbg("%s - success setting the control lines", __func__);
+	cypress_send(port);
 
 	if (tty)
 		cypress_set_termios(tty, port, &priv->tmp_termios);
@@ -1003,6 +995,8 @@ static void cypress_set_termios(struct tty_struct *tty,
 	dbg("%s - port %d", __func__, port->number);
 
 	spin_lock_irqsave(&priv->lock, flags);
+	/* We can't clean this one up as we don't know the device type
+	   early enough */
 	if (!priv->termios_initialized) {
 		if (priv->chiptype == CT_EARTHMATE) {
 			*(tty->termios) = tty_std_termios;
diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c
index da559a773b51..33c9e9cf9eb2 100644
--- a/drivers/usb/serial/empeg.c
+++ b/drivers/usb/serial/empeg.c
@@ -89,8 +89,7 @@ static int  empeg_chars_in_buffer(struct tty_struct *tty);
 static void empeg_throttle(struct tty_struct *tty);
 static void empeg_unthrottle(struct tty_struct *tty);
 static int  empeg_startup(struct usb_serial *serial);
-static void empeg_set_termios(struct tty_struct *tty,
-		struct usb_serial_port *port, struct ktermios *old_termios);
+static void empeg_init_termios(struct tty_struct *tty);
 static void empeg_write_bulk_callback(struct urb *urb);
 static void empeg_read_bulk_callback(struct urb *urb);
 
@@ -122,7 +121,7 @@ static struct usb_serial_driver empeg_device = {
 	.throttle =		empeg_throttle,
 	.unthrottle =		empeg_unthrottle,
 	.attach =		empeg_startup,
-	.set_termios =		empeg_set_termios,
+	.init_termios =		empeg_init_termios,
 	.write =		empeg_write,
 	.write_room =		empeg_write_room,
 	.chars_in_buffer =	empeg_chars_in_buffer,
@@ -148,9 +147,6 @@ static int empeg_open(struct tty_struct *tty,struct usb_serial_port *port)
 
 	dbg("%s - port %d", __func__, port->number);
 
-	/* Force default termio settings */
-	empeg_set_termios(tty, port, NULL);
-
 	bytes_in = 0;
 	bytes_out = 0;
 
@@ -423,11 +419,9 @@ static int  empeg_startup(struct usb_serial *serial)
 }
 
 
-static void empeg_set_termios(struct tty_struct *tty,
-		struct usb_serial_port *port, struct ktermios *old_termios)
+static void empeg_init_termios(struct tty_struct *tty)
 {
 	struct ktermios *termios = tty->termios;
-	dbg("%s - port %d", __func__, port->number);
 
 	/*
 	 * The empeg-car player wants these particular tty settings.
diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index f3f9be0469c5..6138c1cda35f 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -71,7 +71,6 @@ struct iuu_private {
 	spinlock_t lock;	/* store irq state */
 	wait_queue_head_t delta_msr_wait;
 	u8 line_status;
-	u8 termios_initialized;
 	int tiostatus;		/* store IUART SIGNAL for tiocmget call */
 	u8 reset;		/* if 1 reset is needed */
 	int poll;		/* number of poll */
@@ -1018,13 +1017,24 @@ static void iuu_close(struct usb_serial_port *port)
 	}
 }
 
+static void iuu_init_termios(struct tty_struct *tty)
+{
+	*(tty->termios) = tty_std_termios;
+	tty->termios->c_cflag = CLOCAL | CREAD | CS8 | B9600
+				| TIOCM_CTS | CSTOPB | PARENB;
+	tty->termios->c_ispeed = 9600;
+	tty->termios->c_ospeed = 9600;
+	tty->termios->c_lflag = 0;
+	tty->termios->c_oflag = 0;
+	tty->termios->c_iflag = 0;
+}
+
 static int iuu_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	u8 *buf;
 	int result;
 	u32 actual;
-	unsigned long flags;
 	struct iuu_private *priv = usb_get_serial_port_data(port);
 
 	dbg("%s -  port %d", __func__, port->number);
@@ -1063,21 +1073,7 @@ static int iuu_open(struct tty_struct *tty, struct usb_serial_port *port)
 			  port->bulk_in_buffer, 512,
 			  NULL, NULL);
 
-	/* set the termios structure */
-	spin_lock_irqsave(&priv->lock, flags);
-	if (tty && !priv->termios_initialized) {
-		*(tty->termios) = tty_std_termios;
-		tty->termios->c_cflag = CLOCAL | CREAD | CS8 | B9600
-					| TIOCM_CTS | CSTOPB | PARENB;
-		tty->termios->c_ispeed = 9600;
-		tty->termios->c_ospeed = 9600;
-		tty->termios->c_lflag = 0;
-		tty->termios->c_oflag = 0;
-		tty->termios->c_iflag = 0;
-		priv->termios_initialized = 1;
-		priv->poll = 0;
-	}
-	spin_unlock_irqrestore(&priv->lock, flags);
+	priv->poll = 0;
 
 	/* initialize writebuf */
 #define FISH(a, b, c, d) do { \
@@ -1200,6 +1196,7 @@ static struct usb_serial_driver iuu_device = {
 	.tiocmget = iuu_tiocmget,
 	.tiocmset = iuu_tiocmset,
 	.set_termios = iuu_set_termios,
+	.init_termios = iuu_init_termios,
 	.attach = iuu_startup,
 	.release = iuu_release,
 };
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index 97901578343a..45ea694b3ae6 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -84,7 +84,7 @@ static void kobil_read_int_callback(struct urb *urb);
 static void kobil_write_callback(struct urb *purb);
 static void kobil_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
-
+static void kobil_init_termios(struct tty_struct *tty);
 
 static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(KOBIL_VENDOR_ID, KOBIL_ADAPTER_B_PRODUCT_ID) },
@@ -119,6 +119,7 @@ static struct usb_serial_driver kobil_device = {
 	.release =		kobil_release,
 	.ioctl =		kobil_ioctl,
 	.set_termios =		kobil_set_termios,
+	.init_termios =		kobil_init_termios,
 	.tiocmget =		kobil_tiocmget,
 	.tiocmset =		kobil_tiocmset,
 	.open =			kobil_open,
@@ -209,6 +210,15 @@ static void kobil_release(struct usb_serial *serial)
 		kfree(usb_get_serial_port_data(serial->port[i]));
 }
 
+static void kobil_init_termios(struct tty_struct *tty)
+{
+	/* Default to echo off and other sane device settings */
+	tty->termios->c_lflag = 0;
+	tty->termios->c_lflag &= ~(ISIG | ICANON | ECHO | IEXTEN | XCASE);
+	tty->termios->c_iflag = IGNBRK | IGNPAR | IXOFF;
+	/* do NOT translate CR to CR-NL (0x0A -> 0x0A 0x0D) */
+	tty->termios->c_oflag &= ~ONLCR;
+}
 
 static int kobil_open(struct tty_struct *tty, struct usb_serial_port *port)
 {
@@ -224,16 +234,6 @@ static int kobil_open(struct tty_struct *tty, struct usb_serial_port *port)
 	/* someone sets the dev to 0 if the close method has been called */
 	port->interrupt_in_urb->dev = port->serial->dev;
 
-	if (tty) {
-
-		/* Default to echo off and other sane device settings */
-		tty->termios->c_lflag = 0;
-		tty->termios->c_lflag &= ~(ISIG | ICANON | ECHO | IEXTEN |
-								 XCASE);
-		tty->termios->c_iflag = IGNBRK | IGNPAR | IXOFF;
-		/* do NOT translate CR to CR-NL (0x0A -> 0x0A 0x0D) */
-		tty->termios->c_oflag &= ~ONLCR;
-	}
 	/* allocate memory for transfer buffer */
 	transfer_buffer = kzalloc(transfer_buffer_length, GFP_KERNEL);
 	if (!transfer_buffer)
diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c
index e90f88a3b040..0f4a70ce3823 100644
--- a/drivers/usb/serial/oti6858.c
+++ b/drivers/usb/serial/oti6858.c
@@ -145,6 +145,7 @@ static int oti6858_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void oti6858_close(struct usb_serial_port *port);
 static void oti6858_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
+static void oti6858_init_termios(struct tty_struct *tty);
 static int oti6858_ioctl(struct tty_struct *tty, struct file *file,
 			unsigned int cmd, unsigned long arg);
 static void oti6858_read_int_callback(struct urb *urb);
@@ -185,6 +186,7 @@ static struct usb_serial_driver oti6858_device = {
 	.write =		oti6858_write,
 	.ioctl =		oti6858_ioctl,
 	.set_termios =		oti6858_set_termios,
+	.init_termios = 	oti6858_init_termios,
 	.tiocmget =		oti6858_tiocmget,
 	.tiocmset =		oti6858_tiocmset,
 	.read_bulk_callback =	oti6858_read_bulk_callback,
@@ -205,7 +207,6 @@ struct oti6858_private {
 	struct {
 		u8 read_urb_in_use;
 		u8 write_urb_in_use;
-		u8 termios_initialized;
 	} flags;
 	struct delayed_work delayed_write_work;
 
@@ -446,6 +447,14 @@ static int oti6858_chars_in_buffer(struct tty_struct *tty)
 	return chars;
 }
 
+static void oti6858_init_termios(struct tty_struct *tty)
+{
+	*(tty->termios) = tty_std_termios;
+	tty->termios->c_cflag = B38400 | CS8 | CREAD | HUPCL | CLOCAL;
+	tty->termios->c_ispeed = 38400;
+	tty->termios->c_ospeed = 38400;
+}
+
 static void oti6858_set_termios(struct tty_struct *tty,
 		struct usb_serial_port *port, struct ktermios *old_termios)
 {
@@ -463,16 +472,6 @@ static void oti6858_set_termios(struct tty_struct *tty,
 		return;
 	}
 
-	spin_lock_irqsave(&priv->lock, flags);
-	if (!priv->flags.termios_initialized) {
-		*(tty->termios) = tty_std_termios;
-		tty->termios->c_cflag = B38400 | CS8 | CREAD | HUPCL | CLOCAL;
-		tty->termios->c_ispeed = 38400;
-		tty->termios->c_ospeed = 38400;
-		priv->flags.termios_initialized = 1;
-	}
-	spin_unlock_irqrestore(&priv->lock, flags);
-
 	cflag = tty->termios->c_cflag;
 
 	spin_lock_irqsave(&priv->lock, flags);
diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c
index 8b312a05a353..61e7c40b94fb 100644
--- a/drivers/usb/serial/spcp8x5.c
+++ b/drivers/usb/serial/spcp8x5.c
@@ -299,7 +299,6 @@ struct spcp8x5_private {
 	wait_queue_head_t	delta_msr_wait;
 	u8 			line_control;
 	u8 			line_status;
-	u8 			termios_initialized;
 };
 
 /* desc : when device plug in,this function would be called.
@@ -498,6 +497,15 @@ static void spcp8x5_close(struct usb_serial_port *port)
 		dev_dbg(&port->dev, "usb_unlink_urb(read_urb) = %d\n", result);
 }
 
+static void spcp8x5_init_termios(struct tty_struct *tty)
+{
+	/* for the 1st time call this function */
+	*(tty->termios) = tty_std_termios;
+	tty->termios->c_cflag = B115200 | CS8 | CREAD | HUPCL | CLOCAL;
+	tty->termios->c_ispeed = 115200;
+	tty->termios->c_ospeed = 115200;
+}
+
 /* set the serial param for transfer. we should check if we really need to
  * transfer. if we set flow control we should do this too. */
 static void spcp8x5_set_termios(struct tty_struct *tty,
@@ -514,16 +522,6 @@ static void spcp8x5_set_termios(struct tty_struct *tty,
 	int i;
 	u8 control;
 
-	/* for the 1st time call this function */
-	spin_lock_irqsave(&priv->lock, flags);
-	if (!priv->termios_initialized) {
-		*(tty->termios) = tty_std_termios;
-		tty->termios->c_cflag = B115200 | CS8 | CREAD | HUPCL | CLOCAL;
-		tty->termios->c_ispeed = 115200;
-		tty->termios->c_ospeed = 115200;
-		priv->termios_initialized = 1;
-	}
-	spin_unlock_irqrestore(&priv->lock, flags);
 
 	/* check that they really want us to change something */
 	if (!tty_termios_hw_change(tty->termios, old_termios))
@@ -1008,6 +1006,7 @@ static struct usb_serial_driver spcp8x5_device = {
 	.carrier_raised		= spcp8x5_carrier_raised,
 	.write 			= spcp8x5_write,
 	.set_termios 		= spcp8x5_set_termios,
+	.init_termios		= spcp8x5_init_termios,
 	.ioctl 			= spcp8x5_ioctl,
 	.tiocmget 		= spcp8x5_tiocmget,
 	.tiocmset 		= spcp8x5_tiocmset,
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 3dda6841e724..80c1f4d8e910 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -721,6 +721,41 @@ static const struct tty_port_operations serial_port_ops = {
 	.dtr_rts = serial_dtr_rts,
 };
 
+/**
+ *	serial_install		-	install tty
+ *	@driver: the driver (USB in our case)
+ *	@tty: the tty being created
+ *
+ *	Create the termios objects for this tty. We use the default USB
+ *	serial ones but permit them to be overriddenby serial->type->termios.
+ *	This lets us remove all the ugly hackery
+ */
+
+static int serial_install(struct tty_driver *driver, struct tty_struct *tty)
+{
+	int idx = tty->index;
+	struct usb_serial *serial;
+	int retval;
+
+	/* If the termios setup has yet to be done */
+	if (tty->driver->termios[idx] == NULL) {
+		/* perform the standard setup */
+		retval = tty_init_termios(tty);
+		if (retval)
+			return retval;
+		/* allow the driver to update it */
+		serial = usb_serial_get_by_index(tty->index);
+		if (serial->type->init_termios)
+			serial->type->init_termios(tty);
+		usb_serial_put(serial);
+	}
+	/* Final install (we use the default method) */
+	tty_driver_kref_get(driver);
+	tty->count++;
+	driver->ttys[idx] = tty;
+	return 0;
+}
+
 int usb_serial_probe(struct usb_interface *interface,
 			       const struct usb_device_id *id)
 {
@@ -1228,7 +1263,8 @@ static const struct tty_operations serial_ops = {
 	.chars_in_buffer =	serial_chars_in_buffer,
 	.tiocmget =		serial_tiocmget,
 	.tiocmset =		serial_tiocmset,
-	.shutdown =		serial_do_free,
+	.shutdown = 		serial_do_free,
+	.install = 		serial_install,
 	.proc_fops =		&serial_proc_fops,
 };
 
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index 81f2ae505966..62424eec33ec 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -259,7 +259,7 @@ static int firm_send_command(struct usb_serial_port *port, __u8 command,
 						__u8 *data, __u8 datasize);
 static int firm_open(struct usb_serial_port *port);
 static int firm_close(struct usb_serial_port *port);
-static int firm_setup_port(struct tty_struct *tty);
+static void firm_setup_port(struct tty_struct *tty);
 static int firm_set_rts(struct usb_serial_port *port, __u8 onoff);
 static int firm_set_dtr(struct usb_serial_port *port, __u8 onoff);
 static int firm_set_break(struct usb_serial_port *port, __u8 onoff);
@@ -1210,7 +1210,7 @@ static int firm_close(struct usb_serial_port *port)
 }
 
 
-static int firm_setup_port(struct tty_struct *tty)
+static void firm_setup_port(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct whiteheat_port_settings port_settings;
@@ -1285,7 +1285,7 @@ static int firm_setup_port(struct tty_struct *tty)
 	port_settings.lloop = 0;
 
 	/* now send the message to the device */
-	return firm_send_command(port, WHITEHEAT_SETUP_PORT,
+	firm_send_command(port, WHITEHEAT_SETUP_PORT,
 			(__u8 *)&port_settings, sizeof(port_settings));
 }
 
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 1cbaf4a6b449..7b85e327af91 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -260,6 +260,9 @@ struct usb_serial_driver {
 	   be an attached tty at this point */
 	void (*dtr_rts)(struct usb_serial_port *port, int on);
 	int  (*carrier_raised)(struct usb_serial_port *port);
+	/* Called by the usb serial hooks to allow the user to rework the
+	   termios state */
+	void (*init_termios)(struct tty_struct *tty);
 	/* USB events */
 	void (*read_int_callback)(struct urb *urb);
 	void (*write_int_callback)(struct urb *urb);
-- 
cgit v1.2.3-71-gd317


From e92166517e3ca9bfb416f91e69cf0373b55b6ede Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 6 Aug 2009 15:09:28 +0200
Subject: tty: handle VT specific compat ioctls in vt driver

The VT specific compat_ioctl handlers are the only ones
in common code that require the BKL. Moving them into
the vt driver lets us remove the BKL from the other handlers
and cleans up the code.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/vt.c       |   3 +
 drivers/char/vt_ioctl.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/tty.h     |   3 +
 3 files changed, 209 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 33214d92ca4c..5dfbfa7d7606 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2910,6 +2910,9 @@ static const struct tty_operations con_ops = {
 	.flush_chars = con_flush_chars,
 	.chars_in_buffer = con_chars_in_buffer,
 	.ioctl = vt_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = vt_compat_ioctl,
+#endif
 	.stop = con_stop,
 	.start = con_start,
 	.throttle = con_throttle,
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index 0fceb8f4d6f2..30b5d128037c 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -16,6 +16,7 @@
 #include <linux/tty.h>
 #include <linux/timer.h>
 #include <linux/kernel.h>
+#include <linux/compat.h>
 #include <linux/module.h>
 #include <linux/kd.h>
 #include <linux/vt.h>
@@ -1376,6 +1377,208 @@ void vc_SAK(struct work_struct *work)
 	release_console_sem();
 }
 
+#ifdef CONFIG_COMPAT
+
+struct compat_consolefontdesc {
+	unsigned short charcount;       /* characters in font (256 or 512) */
+	unsigned short charheight;      /* scan lines per character (1-32) */
+	compat_caddr_t chardata;	/* font data in expanded form */
+};
+
+static inline int
+compat_fontx_ioctl(int cmd, struct compat_consolefontdesc __user *user_cfd,
+			 int perm, struct console_font_op *op)
+{
+	struct compat_consolefontdesc cfdarg;
+	int i;
+
+	if (copy_from_user(&cfdarg, user_cfd, sizeof(struct compat_consolefontdesc)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case PIO_FONTX:
+		if (!perm)
+			return -EPERM;
+		op->op = KD_FONT_OP_SET;
+		op->flags = KD_FONT_FLAG_OLD;
+		op->width = 8;
+		op->height = cfdarg.charheight;
+		op->charcount = cfdarg.charcount;
+		op->data = compat_ptr(cfdarg.chardata);
+		return con_font_op(vc_cons[fg_console].d, op);
+	case GIO_FONTX:
+		op->op = KD_FONT_OP_GET;
+		op->flags = KD_FONT_FLAG_OLD;
+		op->width = 8;
+		op->height = cfdarg.charheight;
+		op->charcount = cfdarg.charcount;
+		op->data = compat_ptr(cfdarg.chardata);
+		i = con_font_op(vc_cons[fg_console].d, op);
+		if (i)
+			return i;
+		cfdarg.charheight = op->height;
+		cfdarg.charcount = op->charcount;
+		if (copy_to_user(user_cfd, &cfdarg, sizeof(struct compat_consolefontdesc)))
+			return -EFAULT;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+struct compat_console_font_op {
+	compat_uint_t op;        /* operation code KD_FONT_OP_* */
+	compat_uint_t flags;     /* KD_FONT_FLAG_* */
+	compat_uint_t width, height;     /* font size */
+	compat_uint_t charcount;
+	compat_caddr_t data;    /* font data with height fixed to 32 */
+};
+
+static inline int
+compat_kdfontop_ioctl(struct compat_console_font_op __user *fontop,
+			 int perm, struct console_font_op *op, struct vc_data *vc)
+{
+	int i;
+
+	if (copy_from_user(op, fontop, sizeof(struct compat_console_font_op)))
+		return -EFAULT;
+	if (!perm && op->op != KD_FONT_OP_GET)
+		return -EPERM;
+	op->data = compat_ptr(((struct compat_console_font_op *)op)->data);
+	op->flags |= KD_FONT_FLAG_OLD;
+	i = con_font_op(vc, op);
+	if (i)
+		return i;
+	((struct compat_console_font_op *)op)->data = (unsigned long)op->data;
+	if (copy_to_user(fontop, op, sizeof(struct compat_console_font_op)))
+		return -EFAULT;
+	return 0;
+}
+
+struct compat_unimapdesc {
+	unsigned short entry_ct;
+	compat_caddr_t entries;
+};
+
+static inline int
+compat_unimap_ioctl(unsigned int cmd, struct compat_unimapdesc __user *user_ud,
+			 int perm, struct vc_data *vc)
+{
+	struct compat_unimapdesc tmp;
+	struct unipair __user *tmp_entries;
+
+	if (copy_from_user(&tmp, user_ud, sizeof tmp))
+		return -EFAULT;
+	tmp_entries = compat_ptr(tmp.entries);
+	if (tmp_entries)
+		if (!access_ok(VERIFY_WRITE, tmp_entries,
+				tmp.entry_ct*sizeof(struct unipair)))
+			return -EFAULT;
+	switch (cmd) {
+	case PIO_UNIMAP:
+		if (!perm)
+			return -EPERM;
+		return con_set_unimap(vc, tmp.entry_ct, tmp_entries);
+	case GIO_UNIMAP:
+		if (!perm && fg_console != vc->vc_num)
+			return -EPERM;
+		return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), tmp_entries);
+	}
+	return 0;
+}
+
+long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
+	     unsigned int cmd, unsigned long arg)
+{
+	struct vc_data *vc = tty->driver_data;
+	struct console_font_op op;	/* used in multiple places here */
+	struct kbd_struct *kbd;
+	unsigned int console;
+	void __user *up = (void __user *)arg;
+	int perm;
+	int ret = 0;
+
+	console = vc->vc_num;
+
+	lock_kernel();
+
+	if (!vc_cons_allocated(console)) { 	/* impossible? */
+		ret = -ENOIOCTLCMD;
+		goto out;
+	}
+
+	/*
+	 * To have permissions to do most of the vt ioctls, we either have
+	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
+	 */
+	perm = 0;
+	if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
+		perm = 1;
+
+	kbd = kbd_table + console;
+	switch (cmd) {
+	/*
+	 * these need special handlers for incompatible data structures
+	 */
+	case PIO_FONTX:
+	case GIO_FONTX:
+		ret = compat_fontx_ioctl(cmd, up, perm, &op);
+		break;
+
+	case KDFONTOP:
+		ret = compat_kdfontop_ioctl(up, perm, &op, vc);
+		break;
+
+	case PIO_UNIMAP:
+	case GIO_UNIMAP:
+		ret = do_unimap_ioctl(cmd, up, perm, vc);
+		break;
+
+	/*
+	 * all these treat 'arg' as an integer
+	 */
+	case KIOCSOUND:
+	case KDMKTONE:
+#ifdef CONFIG_X86
+	case KDADDIO:
+	case KDDELIO:
+#endif
+	case KDSETMODE:
+	case KDMAPDISP:
+	case KDUNMAPDISP:
+	case KDSKBMODE:
+	case KDSKBMETA:
+	case KDSKBLED:
+	case KDSETLED:
+	case KDSIGACCEPT:
+	case VT_ACTIVATE:
+	case VT_WAITACTIVE:
+	case VT_RELDISP:
+	case VT_DISALLOCATE:
+	case VT_RESIZE:
+	case VT_RESIZEX:
+		goto fallback;
+
+	/*
+	 * the rest has a compatible data structure behind arg,
+	 * but we have to convert it to a proper 64 bit pointer.
+	 */
+	default:
+		arg = (unsigned long)compat_ptr(arg);
+		goto fallback;
+	}
+out:
+	unlock_kernel();
+	return ret;
+
+fallback:
+	unlock_kernel();
+	return vt_ioctl(tty, file, cmd, arg);
+}
+
+
+#endif /* CONFIG_COMPAT */
+
+
 /*
  * Performs the back end of a vt switch. Called under the console
  * semaphore.
diff --git a/include/linux/tty.h b/include/linux/tty.h
index a0e03309a379..f0f43d08d8b8 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -536,5 +536,8 @@ extern int pcxe_open(struct tty_struct *tty, struct file *filp);
 extern int vt_ioctl(struct tty_struct *tty, struct file *file,
 		    unsigned int cmd, unsigned long arg);
 
+extern long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
+		     unsigned int cmd, unsigned long arg);
+
 #endif /* __KERNEL__ */
 #endif
-- 
cgit v1.2.3-71-gd317


From be2f092bfc4f6a415bb4c3e2dcbf521a1f2a0fe5 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Sat, 5 Sep 2009 01:20:43 +0900
Subject: mtd: nand: add __nand_correct_data helper function

Split nand_correct_data() into two part, a pure calculation function
and a wrapper for mtd interface.

The tmio_nand driver can implement its ecc.correct function easily
using this __nand_correct_data helper.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Acked-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Acked-by: Vimal Singh <vimalsingh@ti.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/nand_ecc.c  | 31 ++++++++++++++++++++++++-------
 include/linux/mtd/nand_ecc.h |  6 ++++++
 2 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c
index c0cb87d6d16e..db7ae9d6a296 100644
--- a/drivers/mtd/nand/nand_ecc.c
+++ b/drivers/mtd/nand/nand_ecc.c
@@ -417,22 +417,22 @@ int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf,
 EXPORT_SYMBOL(nand_calculate_ecc);
 
 /**
- * nand_correct_data - [NAND Interface] Detect and correct bit error(s)
- * @mtd:	MTD block structure
+ * __nand_correct_data - [NAND Interface] Detect and correct bit error(s)
  * @buf:	raw data read from the chip
  * @read_ecc:	ECC from the chip
  * @calc_ecc:	the ECC calculated from raw data
+ * @eccsize:	data bytes per ecc step (256 or 512)
  *
- * Detect and correct a 1 bit error for 256/512 byte block
+ * Detect and correct a 1 bit error for eccsize byte block
  */
-int nand_correct_data(struct mtd_info *mtd, unsigned char *buf,
-		      unsigned char *read_ecc, unsigned char *calc_ecc)
+int __nand_correct_data(unsigned char *buf,
+			unsigned char *read_ecc, unsigned char *calc_ecc,
+			unsigned int eccsize)
 {
 	unsigned char b0, b1, b2, bit_addr;
 	unsigned int byte_addr;
 	/* 256 or 512 bytes/ecc  */
-	const uint32_t eccsize_mult =
-			(((struct nand_chip *)mtd->priv)->ecc.size) >> 8;
+	const uint32_t eccsize_mult = eccsize >> 8;
 
 	/*
 	 * b0 to b2 indicate which bit is faulty (if any)
@@ -495,6 +495,23 @@ int nand_correct_data(struct mtd_info *mtd, unsigned char *buf,
 	printk(KERN_ERR "uncorrectable error : ");
 	return -1;
 }
+EXPORT_SYMBOL(__nand_correct_data);
+
+/**
+ * nand_correct_data - [NAND Interface] Detect and correct bit error(s)
+ * @mtd:	MTD block structure
+ * @buf:	raw data read from the chip
+ * @read_ecc:	ECC from the chip
+ * @calc_ecc:	the ECC calculated from raw data
+ *
+ * Detect and correct a 1 bit error for 256/512 byte block
+ */
+int nand_correct_data(struct mtd_info *mtd, unsigned char *buf,
+		      unsigned char *read_ecc, unsigned char *calc_ecc)
+{
+	return __nand_correct_data(buf, read_ecc, calc_ecc,
+				   ((struct nand_chip *)mtd->priv)->ecc.size);
+}
 EXPORT_SYMBOL(nand_correct_data);
 
 MODULE_LICENSE("GPL");
diff --git a/include/linux/mtd/nand_ecc.h b/include/linux/mtd/nand_ecc.h
index 090da505425d..052ea8ca2434 100644
--- a/include/linux/mtd/nand_ecc.h
+++ b/include/linux/mtd/nand_ecc.h
@@ -20,6 +20,12 @@ struct mtd_info;
  */
 int nand_calculate_ecc(struct mtd_info *mtd, const u_char *dat, u_char *ecc_code);
 
+/*
+ * Detect and correct a 1 bit error for eccsize byte block
+ */
+int __nand_correct_data(u_char *dat, u_char *read_ecc, u_char *calc_ecc,
+			unsigned int eccsize);
+
 /*
  * Detect and correct a 1 bit error for 256 byte block
  */
-- 
cgit v1.2.3-71-gd317


From 43cc960980760f58d1f49ecd39e8cbfb063d63e1 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sun, 20 Sep 2009 16:09:20 +0530
Subject: includecheck fix: include/linux, aio.h

fix the following 'make includecheck' warning:

  include/linux/aio.h: linux/aio_abi.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: bcrl@kvack.org
Cc: Sam Ravnborg <sam@ravnborg.org>
LKML-Reference: <1247068254.4382.101.camel@ht.satnam>
---
 include/linux/aio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 47f7d932a01d..aea219d7d8d1 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -225,8 +225,6 @@ static inline void exit_aio(struct mm_struct *mm) { }
 
 #define io_wait_to_kiocb(wait) container_of(wait, struct kiocb, ki_wait)
 
-#include <linux/aio_abi.h>
-
 static inline struct kiocb *list_kiocb(struct list_head *h)
 {
 	return list_entry(h, struct kiocb, ki_list);
-- 
cgit v1.2.3-71-gd317


From 97572751d78133cf9a5f7165b252bf975f9dd17d Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sun, 20 Sep 2009 16:20:44 +0530
Subject: includecheck fix: include/linux, page_cgroup.h

fix the following 'make includecheck' warning:

  include/linux/page_cgroup.h: linux/swap.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
---
 include/linux/page_cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 13f126c89ae8..ada779f24178 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -105,14 +105,14 @@ static inline void __init page_cgroup_init_flatmem(void)
 
 #endif
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 #include <linux/swap.h>
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
 extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
 extern int swap_cgroup_swapon(int type, unsigned long max_pages);
 extern void swap_cgroup_swapoff(int type);
 #else
-#include <linux/swap.h>
 
 static inline
 unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
-- 
cgit v1.2.3-71-gd317


From 83ba7c34d2b82dc608647f629616df393ab883f9 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sun, 20 Sep 2009 16:24:00 +0530
Subject: includecheck fix: include/linux, ftrace.h

fix the following 'make includecheck' warning:

  include/linux/ftrace.h: linux/sched.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Sam Ravnborg <sam@ravnborg.org>
LKML-Reference: <1247068321.4382.102.camel@ht.satnam>
---
 include/linux/ftrace.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index dc3b1328aaeb..3c0924a18daf 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -446,7 +446,6 @@ static inline void unpause_graph_tracing(void) { }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 #ifdef CONFIG_TRACING
-#include <linux/sched.h>
 
 /* flags for current->trace */
 enum {
-- 
cgit v1.2.3-71-gd317


From 89f19f04dc72363d912fd007a399cb10310eff6e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sat, 19 Sep 2009 11:55:44 -0700
Subject: sched: Fix raciness in runqueue_is_locked()

runqueue_is_locked() is unavoidably racy due to a poor interface design.
It does

	cpu = get_cpu()
	ret = some_perpcu_thing(cpu);
	put_cpu(cpu);
	return ret;

Its return value is unreliable.

Fix.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <200909191855.n8JItiko022148@imap1.linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 +-
 kernel/sched.c        | 10 ++--------
 kernel/trace/trace.c  |  8 +++++++-
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8af3d249170e..cc37a3fa5065 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -257,7 +257,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
-extern int runqueue_is_locked(void);
+extern int runqueue_is_locked(int cpu);
 extern void task_rq_unlock_wait(struct task_struct *p);
 
 extern cpumask_var_t nohz_cpu_mask;
diff --git a/kernel/sched.c b/kernel/sched.c
index faf4d463bbff..575fb0139038 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -681,15 +681,9 @@ inline void update_rq_clock(struct rq *rq)
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
-int runqueue_is_locked(void)
+int runqueue_is_locked(int cpu)
 {
-	int cpu = get_cpu();
-	struct rq *rq = cpu_rq(cpu);
-	int ret;
-
-	ret = spin_is_locked(&rq->lock);
-	put_cpu();
-	return ret;
+	return spin_is_locked(&cpu_rq(cpu)->lock);
 }
 
 /*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fd52a19dd172..420232a1fbba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -275,12 +275,18 @@ static DEFINE_SPINLOCK(tracing_start_lock);
  */
 void trace_wake_up(void)
 {
+	int cpu;
+
+	if (trace_flags & TRACE_ITER_BLOCK)
+		return;
 	/*
 	 * The runqueue_is_locked() can fail, but this is the best we
 	 * have for now:
 	 */
-	if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+	cpu = get_cpu();
+	if (!runqueue_is_locked(cpu))
 		wake_up(&trace_wait);
+	put_cpu();
 }
 
 static int __init set_buf_size(char *str)
-- 
cgit v1.2.3-71-gd317


From 42f29a25207dc7b3051d299cc028d4b395d1328d Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@ksplice.com>
Date: Sun, 20 Sep 2009 18:14:12 -0400
Subject: kbuild: Don't define ALIGN and ENTRY when preprocessing linker
 scripts.

Adding a reference to <linux/linkage.h> to x86's <asm/cache.h> causes
the x86 linker script to have syntax errors, because the ALIGN and
ENTRY keywords get redefined to the assembly implementations of those.
One could fix this by adjusting the include structure, but I think any
solution based on that approach would be fragile.

Currently, it is impossible when writing a header to do something
different for assembly files and linker scripts, even though there are
clearly cases where one wants them to define macros differently for
the two (ENTRY being an excellent example).
So I think the right solution here is to introduce a new preprocessor
definition, called LINKER_SCRIPT that is set along with __ASSEMBLY__
for linker scripts, and to use that to not define ALIGN and ENTRY in
linker scripts.
I suspect we'll find other uses for this mechanism in
the future.

Signed-off-by: Tim Abbott <tabbott@ksplice.com>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 include/linux/linkage.h | 2 ++
 scripts/Makefile.build  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 691f59171c6c..5126cceb6ae9 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -57,6 +57,7 @@
 
 #ifdef __ASSEMBLY__
 
+#ifndef LINKER_SCRIPT
 #define ALIGN __ALIGN
 #define ALIGN_STR __ALIGN_STR
 
@@ -66,6 +67,7 @@
   ALIGN; \
   name:
 #endif
+#endif /* LINKER_SCRIPT */
 
 #ifndef WEAK
 #define WEAK(name)	   \
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index d5425660a3df..341b58902ffc 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -271,7 +271,7 @@ targets += $(extra-y) $(MAKECMDGOALS) $(always)
 # ---------------------------------------------------------------------------
 quiet_cmd_cpp_lds_S = LDS     $@
       cmd_cpp_lds_S = $(CPP) $(cpp_flags) -P -C -U$(ARCH) \
-	                     -D__ASSEMBLY__ -o $@ $<
+	                     -D__ASSEMBLY__ -DLINKER_SCRIPT -o $@ $<
 
 $(obj)/%.lds: $(src)/%.lds.S FORCE
 	$(call if_changed_dep,cpp_lds_S)
-- 
cgit v1.2.3-71-gd317


From cd74c86bdf705f824d494a2bbda393d1d562b40a Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 21 Sep 2009 16:44:32 +1000
Subject: perf_counter, powerpc, sparc: Fix compilation after
 perf_counter_overflow() change

Commit 5622f295 ("x86, perf_counter, bts: Optimize BTS overflow
handling") removed the regs field from struct perf_sample_data and
added a regs parameter to perf_counter_overflow().  This breaks the
build on powerpc (and Sparc) as reported by Sachin Sant:

  arch/powerpc/kernel/perf_counter.c: In function 'record_and_restart':
  arch/powerpc/kernel/perf_counter.c:1165: error: unknown field 'regs' specified in initializer

This adjusts arch/powerpc/kernel/perf_counter.c to correspond with the
new struct perf_sample_data and perf_counter_overflow().

[ v2: also fix Sparc, Markus Metzger <markus.t.metzger@intel.com> ]

Reported-by: Sachin Sant <sachinp@in.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Markus Metzger <markus.t.metzger@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: benh@kernel.crashing.org
Cc: linuxppc-dev@ozlabs.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <19127.8400.376239.586120@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  3 +--
 arch/sparc/kernel/perf_counter.c   |  3 +--
 include/linux/perf_counter.h       | 17 -----------------
 3 files changed, 2 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 7ceefaf3a7f5..5ccf9bca96c0 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1162,7 +1162,6 @@ static void record_and_restart(struct perf_counter *counter, unsigned long val,
 	 */
 	if (record) {
 		struct perf_sample_data data = {
-			.regs	= regs,
 			.addr	= 0,
 			.period	= counter->hw.last_period,
 		};
@@ -1170,7 +1169,7 @@ static void record_and_restart(struct perf_counter *counter, unsigned long val,
 		if (counter->attr.sample_type & PERF_SAMPLE_ADDR)
 			perf_get_data_addr(regs, &data.addr);
 
-		if (perf_counter_overflow(counter, nmi, &data)) {
+		if (perf_counter_overflow(counter, nmi, &data, regs)) {
 			/*
 			 * Interrupts are coming too fast - throttle them
 			 * by setting the counter to 0, so it will be
diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c
index 09de4035eaa9..b1265ce8a053 100644
--- a/arch/sparc/kernel/perf_counter.c
+++ b/arch/sparc/kernel/perf_counter.c
@@ -493,7 +493,6 @@ static int __kprobes perf_counter_nmi_handler(struct notifier_block *self,
 
 	regs = args->regs;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -513,7 +512,7 @@ static int __kprobes perf_counter_nmi_handler(struct notifier_block *self,
 		if (!sparc_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			sparc_pmu_disable_counter(hwc, idx);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index bd341007c4fc..740caad09a44 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -849,23 +849,6 @@ static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
 
-static inline int
-perf_output_begin(struct perf_output_handle *handle, struct perf_counter *c,
-		  unsigned int size, int nmi, int sample)		{ }
-static inline void perf_output_end(struct perf_output_handle *handle)	{ }
-static inline void
-perf_output_copy(struct perf_output_handle *handle,
-		 const void *buf, unsigned int len)			{ }
-static inline void
-perf_output_sample(struct perf_output_handle *handle,
-		   struct perf_event_header *header,
-		   struct perf_sample_data *data,
-		   struct perf_counter *counter)			{ }
-static inline void
-perf_prepare_sample(struct perf_event_header *header,
-		    struct perf_sample_data *data,
-		    struct perf_counter *counter,
-		    struct pt_regs *regs)				{ }
 #endif
 
 #define perf_output_put(handle, x) \
-- 
cgit v1.2.3-71-gd317


From 0d721ceadbeaa24d7f9dd41b3e5e29912327a7e1 Mon Sep 17 00:00:00 2001
From: Peter Williams <pwil3058@bigpond.net.au>
Date: Mon, 21 Sep 2009 01:31:53 +0000
Subject: sched: Simplify sys_sched_rr_get_interval() system call

By removing the need for it to know details of scheduling classes.

This allows PlugSched to define orthogonal scheduling classes.

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <06d1b89ee15a0eef82d7.1253496713@mudlark.pw.nest>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  2 ++
 kernel/sched.c          | 17 +----------------
 kernel/sched_fair.c     | 21 +++++++++++++++++++++
 kernel/sched_idletask.c |  7 +++++++
 kernel/sched_rt.c       | 13 +++++++++++++
 5 files changed, 44 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cc37a3fa5065..239c8e0dba9f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1075,6 +1075,8 @@ struct sched_class {
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
 			     int oldprio, int running);
 
+	unsigned int (*get_rr_interval) (struct task_struct *task);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*moved_group) (struct task_struct *p);
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b900fb1c6e1..830967e18285 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6819,23 +6819,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 	if (retval)
 		goto out_unlock;
 
-	/*
-	 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
-	 * tasks that are on an otherwise idle runqueue:
-	 */
-	time_slice = 0;
-	if (p->policy == SCHED_RR) {
-		time_slice = DEF_TIMESLICE;
-	} else if (p->policy != SCHED_FIFO) {
-		struct sched_entity *se = &p->se;
-		unsigned long flags;
-		struct rq *rq;
+	time_slice = p->sched_class->get_rr_interval(p);
 
-		rq = task_rq_lock(p, &flags);
-		if (rq->cfs.load.weight)
-			time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-		task_rq_unlock(rq, &flags);
-	}
 	read_unlock(&tasklist_lock);
 	jiffies_to_timespec(time_slice, &t);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 566e3bb78ed9..cd73738f0d5f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1938,6 +1938,25 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
 
+unsigned int get_rr_interval_fair(struct task_struct *task)
+{
+	struct sched_entity *se = &task->se;
+	unsigned long flags;
+	struct rq *rq;
+	unsigned int rr_interval = 0;
+
+	/*
+	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
+	 * idle runqueue:
+	 */
+	rq = task_rq_lock(task, &flags);
+	if (rq->cfs.load.weight)
+		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+	task_rq_unlock(rq, &flags);
+
+	return rr_interval;
+}
+
 /*
  * All the scheduling class methods:
  */
@@ -1966,6 +1985,8 @@ static const struct sched_class fair_sched_class = {
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,
 
+	.get_rr_interval	= get_rr_interval_fair,
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.moved_group		= moved_group_fair,
 #endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8b448af004b..b133a28fcde3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 		check_preempt_curr(rq, p, 0);
 }
 
+unsigned int get_rr_interval_idle(struct task_struct *task)
+{
+	return 0;
+}
+
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
 	.set_curr_task          = set_curr_task_idle,
 	.task_tick		= task_tick_idle,
 
+	.get_rr_interval	= get_rr_interval_idle,
+
 	.prio_changed		= prio_changed_idle,
 	.switched_to		= switched_to_idle,
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 13de7126a6ab..a4d790cddb19 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1734,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
 	dequeue_pushable_task(rq, p);
 }
 
+unsigned int get_rr_interval_rt(struct task_struct *task)
+{
+	/*
+	 * Time slice is 0 for SCHED_FIFO tasks
+	 */
+	if (task->policy == SCHED_RR)
+		return DEF_TIMESLICE;
+	else
+		return 0;
+}
+
 static const struct sched_class rt_sched_class = {
 	.next			= &fair_sched_class,
 	.enqueue_task		= enqueue_task_rt,
@@ -1762,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
 	.set_curr_task          = set_curr_task_rt,
 	.task_tick		= task_tick_rt,
 
+	.get_rr_interval	= get_rr_interval_rt,
+
 	.prio_changed		= prio_changed_rt,
 	.switched_to		= switched_to_rt,
 };
-- 
cgit v1.2.3-71-gd317


From 65abc8653c282ded3dbdb9ec1227784140ba28cd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 21 Sep 2009 10:18:27 +0200
Subject: perf_counter: Rename list_entry -> group_entry, counter_list ->
 group_list

This is in preparation of the big rename, but also makes sense
in a standalone way: 'list_entry' is a bad name as we already
have a list_entry() in list.h.

Also, the 'counter list' is too vague, it doesnt tell us the
purpose of that list.

Clarify these names to show that it's all about the group
hiearchy.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 +--
 kernel/perf_counter.c        | 71 ++++++++++++++++++++++----------------------
 2 files changed, 37 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 740caad09a44..f64862732673 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -543,7 +543,7 @@ struct perf_pending_entry {
  */
 struct perf_counter {
 #ifdef CONFIG_PERF_COUNTERS
-	struct list_head		list_entry;
+	struct list_head		group_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
 	int				nr_siblings;
@@ -649,7 +649,7 @@ struct perf_counter_context {
 	 */
 	struct mutex			mutex;
 
-	struct list_head		counter_list;
+	struct list_head		group_list;
 	struct list_head		event_list;
 	int				nr_counters;
 	int				nr_active;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index cc768ab81ac8..13ad73aed4ca 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -258,9 +258,9 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	 * leader's sibling list:
 	 */
 	if (group_leader == counter)
-		list_add_tail(&counter->list_entry, &ctx->counter_list);
+		list_add_tail(&counter->group_entry, &ctx->group_list);
 	else {
-		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+		list_add_tail(&counter->group_entry, &group_leader->sibling_list);
 		group_leader->nr_siblings++;
 	}
 
@@ -279,13 +279,13 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
 	struct perf_counter *sibling, *tmp;
 
-	if (list_empty(&counter->list_entry))
+	if (list_empty(&counter->group_entry))
 		return;
 	ctx->nr_counters--;
 	if (counter->attr.inherit_stat)
 		ctx->nr_stat--;
 
-	list_del_init(&counter->list_entry);
+	list_del_init(&counter->group_entry);
 	list_del_rcu(&counter->event_entry);
 
 	if (counter->group_leader != counter)
@@ -296,10 +296,9 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	 * upgrade the siblings to singleton counters by adding them
 	 * to the context list directly:
 	 */
-	list_for_each_entry_safe(sibling, tmp,
-				 &counter->sibling_list, list_entry) {
+	list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, group_entry) {
 
-		list_move_tail(&sibling->list_entry, &ctx->counter_list);
+		list_move_tail(&sibling->group_entry, &ctx->group_list);
 		sibling->group_leader = sibling;
 	}
 }
@@ -343,7 +342,7 @@ group_sched_out(struct perf_counter *group_counter,
 	/*
 	 * Schedule out siblings (if any):
 	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+	list_for_each_entry(counter, &group_counter->sibling_list, group_entry)
 		counter_sched_out(counter, cpuctx, ctx);
 
 	if (group_counter->attr.exclusive)
@@ -435,7 +434,7 @@ retry:
 	/*
 	 * If the context is active we need to retry the smp call.
 	 */
-	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
+	if (ctx->nr_active && !list_empty(&counter->group_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
@@ -445,7 +444,7 @@ retry:
 	 * can remove the counter safely, if the call above did not
 	 * succeed.
 	 */
-	if (!list_empty(&counter->list_entry)) {
+	if (!list_empty(&counter->group_entry)) {
 		list_del_counter(counter, ctx);
 	}
 	spin_unlock_irq(&ctx->lock);
@@ -497,7 +496,7 @@ static void update_group_times(struct perf_counter *leader)
 	struct perf_counter *counter;
 
 	update_counter_times(leader);
-	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+	list_for_each_entry(counter, &leader->sibling_list, group_entry)
 		update_counter_times(counter);
 }
 
@@ -643,7 +642,7 @@ group_sched_in(struct perf_counter *group_counter,
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+	list_for_each_entry(counter, &group_counter->sibling_list, group_entry) {
 		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 			partial_group = counter;
 			goto group_error;
@@ -657,7 +656,7 @@ group_error:
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
 	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+	list_for_each_entry(counter, &group_counter->sibling_list, group_entry) {
 		if (counter == partial_group)
 			break;
 		counter_sched_out(counter, cpuctx, ctx);
@@ -678,7 +677,7 @@ static int is_software_only_group(struct perf_counter *leader)
 	if (!is_software_counter(leader))
 		return 0;
 
-	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+	list_for_each_entry(counter, &leader->sibling_list, group_entry)
 		if (!is_software_counter(counter))
 			return 0;
 
@@ -842,7 +841,7 @@ retry:
 	/*
 	 * we need to retry the smp call.
 	 */
-	if (ctx->is_active && list_empty(&counter->list_entry)) {
+	if (ctx->is_active && list_empty(&counter->group_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
@@ -852,7 +851,7 @@ retry:
 	 * can add the counter safely, if it the call above did not
 	 * succeed.
 	 */
-	if (list_empty(&counter->list_entry))
+	if (list_empty(&counter->group_entry))
 		add_counter_to_ctx(counter, ctx);
 	spin_unlock_irq(&ctx->lock);
 }
@@ -872,7 +871,7 @@ static void __perf_counter_mark_enabled(struct perf_counter *counter,
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
-	list_for_each_entry(sub, &counter->sibling_list, list_entry)
+	list_for_each_entry(sub, &counter->sibling_list, group_entry)
 		if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
 			sub->tstamp_enabled =
 				ctx->time - sub->total_time_enabled;
@@ -1032,7 +1031,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 
 	perf_disable();
 	if (ctx->nr_active) {
-		list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		list_for_each_entry(counter, &ctx->group_list, group_entry) {
 			if (counter != counter->group_leader)
 				counter_sched_out(counter, cpuctx, ctx);
 			else
@@ -1252,7 +1251,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+	list_for_each_entry(counter, &ctx->group_list, group_entry) {
 		if (counter->state <= PERF_COUNTER_STATE_OFF ||
 		    !counter->attr.pinned)
 			continue;
@@ -1276,7 +1275,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		}
 	}
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+	list_for_each_entry(counter, &ctx->group_list, group_entry) {
 		/*
 		 * Ignore counters in OFF or ERROR state, and
 		 * ignore pinned counters since we did them already.
@@ -1369,7 +1368,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 	u64 interrupts, freq;
 
 	spin_lock(&ctx->lock);
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+	list_for_each_entry(counter, &ctx->group_list, group_entry) {
 		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 			continue;
 
@@ -1441,8 +1440,8 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
 	perf_disable();
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		list_move_tail(&counter->list_entry, &ctx->counter_list);
+	list_for_each_entry(counter, &ctx->group_list, group_entry) {
+		list_move_tail(&counter->group_entry, &ctx->group_list);
 		break;
 	}
 	perf_enable();
@@ -1498,7 +1497,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
 
 	spin_lock(&ctx->lock);
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+	list_for_each_entry(counter, &ctx->group_list, group_entry) {
 		if (!counter->attr.enable_on_exec)
 			continue;
 		counter->attr.enable_on_exec = 0;
@@ -1575,7 +1574,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	memset(ctx, 0, sizeof(*ctx));
 	spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->counter_list);
+	INIT_LIST_HEAD(&ctx->group_list);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 	ctx->task = task;
@@ -1818,7 +1817,7 @@ static int perf_counter_read_group(struct perf_counter *counter,
 
 	size += err;
 
-	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		err = perf_counter_read_entry(sub, read_format,
 				buf + size);
 		if (err < 0)
@@ -1948,7 +1947,7 @@ static void perf_counter_for_each(struct perf_counter *counter,
 
 	perf_counter_for_each_child(counter, func);
 	func(counter);
-	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+	list_for_each_entry(sibling, &counter->sibling_list, group_entry)
 		perf_counter_for_each_child(counter, func);
 	mutex_unlock(&ctx->mutex);
 }
@@ -2832,7 +2831,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 
 	perf_output_copy(handle, values, n * sizeof(u64));
 
-	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		n = 0;
 
 		if (sub != counter)
@@ -4118,7 +4117,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	mutex_init(&counter->child_mutex);
 	INIT_LIST_HEAD(&counter->child_list);
 
-	INIT_LIST_HEAD(&counter->list_entry);
+	INIT_LIST_HEAD(&counter->group_entry);
 	INIT_LIST_HEAD(&counter->event_entry);
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
@@ -4544,7 +4543,7 @@ static int inherit_group(struct perf_counter *parent_counter,
 				 child, NULL, child_ctx);
 	if (IS_ERR(leader))
 		return PTR_ERR(leader);
-	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+	list_for_each_entry(sub, &parent_counter->sibling_list, group_entry) {
 		child_ctr = inherit_counter(sub, parent, parent_ctx,
 					    child, leader, child_ctx);
 		if (IS_ERR(child_ctr))
@@ -4670,8 +4669,8 @@ void perf_counter_exit_task(struct task_struct *child)
 	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 
 again:
-	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
-				 list_entry)
+	list_for_each_entry_safe(child_counter, tmp, &child_ctx->group_list,
+				 group_entry)
 		__perf_counter_exit_task(child_counter, child_ctx, child);
 
 	/*
@@ -4679,7 +4678,7 @@ again:
 	 * its siblings to the list, but we obtained 'tmp' before that which
 	 * will still point to the list head terminating the iteration.
 	 */
-	if (!list_empty(&child_ctx->counter_list))
+	if (!list_empty(&child_ctx->group_list))
 		goto again;
 
 	mutex_unlock(&child_ctx->mutex);
@@ -4701,7 +4700,7 @@ void perf_counter_free_task(struct task_struct *task)
 
 	mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+	list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) {
 		struct perf_counter *parent = counter->parent;
 
 		if (WARN_ON_ONCE(!parent))
@@ -4717,7 +4716,7 @@ again:
 		free_counter(counter);
 	}
 
-	if (!list_empty(&ctx->counter_list))
+	if (!list_empty(&ctx->group_list))
 		goto again;
 
 	mutex_unlock(&ctx->mutex);
@@ -4847,7 +4846,7 @@ static void __perf_counter_exit_cpu(void *info)
 	struct perf_counter_context *ctx = &cpuctx->ctx;
 	struct perf_counter *counter, *tmp;
 
-	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+	list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry)
 		__perf_counter_remove_from_context(counter);
 }
 static void perf_counter_exit_cpu(int cpu)
-- 
cgit v1.2.3-71-gd317


From cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 21 Sep 2009 12:02:48 +0200
Subject: perf: Do the big rename: Performance Counters -> Performance Events

Bye-bye Performance Counters, welcome Performance Events!

In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.

Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.

All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)

The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.

Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.

User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)

This patch has been generated via the following script:

  FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')

  sed -i \
    -e 's/PERF_EVENT_/PERF_RECORD_/g' \
    -e 's/PERF_COUNTER/PERF_EVENT/g' \
    -e 's/perf_counter/perf_event/g' \
    -e 's/nb_counters/nb_events/g' \
    -e 's/swcounter/swevent/g' \
    -e 's/tpcounter_event/tp_event/g' \
    $FILES

  for N in $(find . -name perf_counter.[ch]); do
    M=$(echo $N | sed 's/perf_counter/perf_event/g')
    mv $N $M
  done

  FILES=$(find . -name perf_event.*)

  sed -i \
    -e 's/COUNTER_MASK/REG_MASK/g' \
    -e 's/COUNTER/EVENT/g' \
    -e 's/\<event\>/event_id/g' \
    -e 's/counter/event/g' \
    -e 's/Counter/Event/g' \
    $FILES

... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.

Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.

( NOTE: 'counters' are still the proper terminology when we deal
  with hardware registers - and these sed scripts are a bit
  over-eager in renaming them. I've undone some of that, but
  in case there's something left where 'counter' would be
  better than 'event' we can undo that on an individual basis
  instead of touching an otherwise nicely automated patch. )

Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/include/asm/unistd.h           |    2 +-
 arch/arm/kernel/calls.S                 |    2 +-
 arch/blackfin/include/asm/unistd.h      |    2 +-
 arch/blackfin/mach-common/entry.S       |    2 +-
 arch/frv/Kconfig                        |    2 +-
 arch/frv/include/asm/perf_counter.h     |   17 -
 arch/frv/include/asm/perf_event.h       |   17 +
 arch/frv/include/asm/unistd.h           |    2 +-
 arch/frv/kernel/entry.S                 |    2 +-
 arch/frv/lib/Makefile                   |    2 +-
 arch/frv/lib/perf_counter.c             |   19 -
 arch/frv/lib/perf_event.c               |   19 +
 arch/m68k/include/asm/unistd.h          |    2 +-
 arch/m68k/kernel/entry.S                |    2 +-
 arch/m68knommu/kernel/syscalltable.S    |    2 +-
 arch/microblaze/include/asm/unistd.h    |    2 +-
 arch/microblaze/kernel/syscall_table.S  |    2 +-
 arch/mips/include/asm/unistd.h          |    6 +-
 arch/mips/kernel/scall32-o32.S          |    2 +-
 arch/mips/kernel/scall64-64.S           |    2 +-
 arch/mips/kernel/scall64-n32.S          |    2 +-
 arch/mips/kernel/scall64-o32.S          |    2 +-
 arch/mn10300/include/asm/unistd.h       |    2 +-
 arch/mn10300/kernel/entry.S             |    2 +-
 arch/parisc/Kconfig                     |    2 +-
 arch/parisc/include/asm/perf_counter.h  |    7 -
 arch/parisc/include/asm/perf_event.h    |    7 +
 arch/parisc/include/asm/unistd.h        |    4 +-
 arch/parisc/kernel/syscall_table.S      |    2 +-
 arch/powerpc/Kconfig                    |    2 +-
 arch/powerpc/include/asm/hw_irq.h       |   22 +-
 arch/powerpc/include/asm/paca.h         |    2 +-
 arch/powerpc/include/asm/perf_counter.h |  110 -
 arch/powerpc/include/asm/perf_event.h   |  110 +
 arch/powerpc/include/asm/systbl.h       |    2 +-
 arch/powerpc/include/asm/unistd.h       |    2 +-
 arch/powerpc/kernel/Makefile            |    2 +-
 arch/powerpc/kernel/asm-offsets.c       |    2 +-
 arch/powerpc/kernel/entry_64.S          |    8 +-
 arch/powerpc/kernel/irq.c               |    8 +-
 arch/powerpc/kernel/mpc7450-pmu.c       |    2 +-
 arch/powerpc/kernel/perf_callchain.c    |    2 +-
 arch/powerpc/kernel/perf_counter.c      | 1315 --------
 arch/powerpc/kernel/perf_event.c        | 1315 ++++++++
 arch/powerpc/kernel/power4-pmu.c        |    2 +-
 arch/powerpc/kernel/power5+-pmu.c       |    2 +-
 arch/powerpc/kernel/power5-pmu.c        |    2 +-
 arch/powerpc/kernel/power6-pmu.c        |    2 +-
 arch/powerpc/kernel/power7-pmu.c        |    2 +-
 arch/powerpc/kernel/ppc970-pmu.c        |    2 +-
 arch/powerpc/kernel/time.c              |   30 +-
 arch/powerpc/mm/fault.c                 |    8 +-
 arch/powerpc/platforms/Kconfig.cputype  |    4 +-
 arch/s390/Kconfig                       |    2 +-
 arch/s390/include/asm/perf_counter.h    |   10 -
 arch/s390/include/asm/perf_event.h      |   10 +
 arch/s390/include/asm/unistd.h          |    2 +-
 arch/s390/kernel/compat_wrapper.S       |    8 +-
 arch/s390/kernel/syscalls.S             |    2 +-
 arch/s390/mm/fault.c                    |    8 +-
 arch/sh/Kconfig                         |    2 +-
 arch/sh/include/asm/perf_counter.h      |    9 -
 arch/sh/include/asm/perf_event.h        |    9 +
 arch/sh/include/asm/unistd_32.h         |    2 +-
 arch/sh/include/asm/unistd_64.h         |    2 +-
 arch/sh/kernel/syscalls_32.S            |    2 +-
 arch/sh/kernel/syscalls_64.S            |    2 +-
 arch/sh/mm/fault_32.c                   |    8 +-
 arch/sh/mm/tlbflush_64.c                |    8 +-
 arch/sparc/Kconfig                      |    4 +-
 arch/sparc/include/asm/perf_counter.h   |   14 -
 arch/sparc/include/asm/perf_event.h     |   14 +
 arch/sparc/include/asm/unistd.h         |    2 +-
 arch/sparc/kernel/Makefile              |    2 +-
 arch/sparc/kernel/nmi.c                 |    4 +-
 arch/sparc/kernel/pcr.c                 |   10 +-
 arch/sparc/kernel/perf_counter.c        |  556 ----
 arch/sparc/kernel/perf_event.c          |  556 ++++
 arch/sparc/kernel/systbls_32.S          |    2 +-
 arch/sparc/kernel/systbls_64.S          |    4 +-
 arch/x86/Kconfig                        |    2 +-
 arch/x86/ia32/ia32entry.S               |    2 +-
 arch/x86/include/asm/entry_arch.h       |    2 +-
 arch/x86/include/asm/perf_counter.h     |  108 -
 arch/x86/include/asm/perf_event.h       |  108 +
 arch/x86/include/asm/unistd_32.h        |    2 +-
 arch/x86/include/asm/unistd_64.h        |    4 +-
 arch/x86/kernel/apic/apic.c             |    6 +-
 arch/x86/kernel/cpu/Makefile            |    2 +-
 arch/x86/kernel/cpu/common.c            |    4 +-
 arch/x86/kernel/cpu/perf_counter.c      | 2298 --------------
 arch/x86/kernel/cpu/perf_event.c        | 2298 ++++++++++++++
 arch/x86/kernel/cpu/perfctr-watchdog.c  |    2 +-
 arch/x86/kernel/entry_64.S              |    2 +-
 arch/x86/kernel/irqinit.c               |    2 +-
 arch/x86/kernel/syscall_table_32.S      |    2 +-
 arch/x86/mm/fault.c                     |    8 +-
 arch/x86/oprofile/op_model_ppro.c       |    4 +-
 arch/x86/oprofile/op_x86_model.h        |    2 +-
 drivers/char/sysrq.c                    |    4 +-
 fs/exec.c                               |    6 +-
 include/asm-generic/unistd.h            |    4 +-
 include/linux/init_task.h               |   14 +-
 include/linux/perf_counter.h            |  858 ------
 include/linux/perf_event.h              |  858 ++++++
 include/linux/prctl.h                   |    4 +-
 include/linux/sched.h                   |   12 +-
 include/linux/syscalls.h                |    6 +-
 include/trace/ftrace.h                  |   10 +-
 init/Kconfig                            |    8 +-
 kernel/Makefile                         |    2 +-
 kernel/exit.c                           |    8 +-
 kernel/fork.c                           |    8 +-
 kernel/perf_counter.c                   | 5000 -------------------------------
 kernel/perf_event.c                     | 5000 +++++++++++++++++++++++++++++++
 kernel/sched.c                          |   14 +-
 kernel/sys.c                            |   10 +-
 kernel/sys_ni.c                         |    2 +-
 kernel/sysctl.c                         |   22 +-
 kernel/timer.c                          |    4 +-
 kernel/trace/trace_syscalls.c           |    6 +-
 mm/mmap.c                               |    6 +-
 mm/mprotect.c                           |    4 +-
 tools/perf/Makefile                     |    2 +-
 tools/perf/builtin-annotate.c           |   28 +-
 tools/perf/builtin-record.c             |   22 +-
 tools/perf/builtin-report.c             |   48 +-
 tools/perf/builtin-sched.c              |   20 +-
 tools/perf/builtin-stat.c               |   10 +-
 tools/perf/builtin-timechart.c          |   14 +-
 tools/perf/builtin-top.c                |   12 +-
 tools/perf/builtin-trace.c              |   22 +-
 tools/perf/design.txt                   |   58 +-
 tools/perf/perf.h                       |   12 +-
 tools/perf/util/event.h                 |    4 +-
 tools/perf/util/header.c                |    6 +-
 tools/perf/util/header.h                |    8 +-
 tools/perf/util/parse-events.c          |   32 +-
 tools/perf/util/parse-events.h          |    2 +-
 tools/perf/util/trace-event-info.c      |    8 +-
 tools/perf/util/trace-event.h           |    2 +-
 141 files changed, 10694 insertions(+), 10694 deletions(-)
 delete mode 100644 arch/frv/include/asm/perf_counter.h
 create mode 100644 arch/frv/include/asm/perf_event.h
 delete mode 100644 arch/frv/lib/perf_counter.c
 create mode 100644 arch/frv/lib/perf_event.c
 delete mode 100644 arch/parisc/include/asm/perf_counter.h
 create mode 100644 arch/parisc/include/asm/perf_event.h
 delete mode 100644 arch/powerpc/include/asm/perf_counter.h
 create mode 100644 arch/powerpc/include/asm/perf_event.h
 delete mode 100644 arch/powerpc/kernel/perf_counter.c
 create mode 100644 arch/powerpc/kernel/perf_event.c
 delete mode 100644 arch/s390/include/asm/perf_counter.h
 create mode 100644 arch/s390/include/asm/perf_event.h
 delete mode 100644 arch/sh/include/asm/perf_counter.h
 create mode 100644 arch/sh/include/asm/perf_event.h
 delete mode 100644 arch/sparc/include/asm/perf_counter.h
 create mode 100644 arch/sparc/include/asm/perf_event.h
 delete mode 100644 arch/sparc/kernel/perf_counter.c
 create mode 100644 arch/sparc/kernel/perf_event.c
 delete mode 100644 arch/x86/include/asm/perf_counter.h
 create mode 100644 arch/x86/include/asm/perf_event.h
 delete mode 100644 arch/x86/kernel/cpu/perf_counter.c
 create mode 100644 arch/x86/kernel/cpu/perf_event.c
 delete mode 100644 include/linux/perf_counter.h
 create mode 100644 include/linux/perf_event.h
 delete mode 100644 kernel/perf_counter.c
 create mode 100644 kernel/perf_event.c

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 9122c9ee18fb..89f7eade20af 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -390,7 +390,7 @@
 #define __NR_preadv			(__NR_SYSCALL_BASE+361)
 #define __NR_pwritev			(__NR_SYSCALL_BASE+362)
 #define __NR_rt_tgsigqueueinfo		(__NR_SYSCALL_BASE+363)
-#define __NR_perf_counter_open		(__NR_SYSCALL_BASE+364)
+#define __NR_perf_event_open		(__NR_SYSCALL_BASE+364)
 
 /*
  * The following SWIs are ARM private.
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index ecfa98954d1d..fafce1b5c69f 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -373,7 +373,7 @@
 		CALL(sys_preadv)
 		CALL(sys_pwritev)
 		CALL(sys_rt_tgsigqueueinfo)
-		CALL(sys_perf_counter_open)
+		CALL(sys_perf_event_open)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h
index c8e7ee4768cd..02b1529dad57 100644
--- a/arch/blackfin/include/asm/unistd.h
+++ b/arch/blackfin/include/asm/unistd.h
@@ -381,7 +381,7 @@
 #define __NR_preadv		366
 #define __NR_pwritev		367
 #define __NR_rt_tgsigqueueinfo	368
-#define __NR_perf_counter_open	369
+#define __NR_perf_event_open	369
 
 #define __NR_syscall		370
 #define NR_syscalls		__NR_syscall
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 01af24cde362..1e7cac23e25f 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1620,7 +1620,7 @@ ENTRY(_sys_call_table)
 	.long _sys_preadv
 	.long _sys_pwritev
 	.long _sys_rt_tgsigqueueinfo
-	.long _sys_perf_counter_open
+	.long _sys_perf_event_open
 
 	.rept NR_syscalls-(.-_sys_call_table)/4
 	.long _sys_ni_syscall
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index b86e19c9b5b0..4b5830bcbe2e 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -7,7 +7,7 @@ config FRV
 	default y
 	select HAVE_IDE
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_EVENTS
 
 config ZONE_DMA
 	bool
diff --git a/arch/frv/include/asm/perf_counter.h b/arch/frv/include/asm/perf_counter.h
deleted file mode 100644
index ccf726e61b2e..000000000000
--- a/arch/frv/include/asm/perf_counter.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* FRV performance counter support
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _ASM_PERF_COUNTER_H
-#define _ASM_PERF_COUNTER_H
-
-#define PERF_COUNTER_INDEX_OFFSET	0
-
-#endif /* _ASM_PERF_COUNTER_H */
diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h
new file mode 100644
index 000000000000..a69e0155d146
--- /dev/null
+++ b/arch/frv/include/asm/perf_event.h
@@ -0,0 +1,17 @@
+/* FRV performance event support
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_PERF_EVENT_H
+#define _ASM_PERF_EVENT_H
+
+#define PERF_EVENT_INDEX_OFFSET	0
+
+#endif /* _ASM_PERF_EVENT_H */
diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h
index 4a8fb427ce0a..be6ef0f5cd42 100644
--- a/arch/frv/include/asm/unistd.h
+++ b/arch/frv/include/asm/unistd.h
@@ -342,7 +342,7 @@
 #define __NR_preadv		333
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
-#define __NR_perf_counter_open	336
+#define __NR_perf_event_open	336
 
 #ifdef __KERNEL__
 
diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index fde1e446b440..189397ec012a 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1525,6 +1525,6 @@ sys_call_table:
 	.long sys_preadv
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
 
 syscall_table_size = (. - sys_call_table)
diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile
index 0a377210c89b..f4709756d0d9 100644
--- a/arch/frv/lib/Makefile
+++ b/arch/frv/lib/Makefile
@@ -5,4 +5,4 @@
 lib-y := \
 	__ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \
 	checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \
-	outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_counter.o
+	outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o
diff --git a/arch/frv/lib/perf_counter.c b/arch/frv/lib/perf_counter.c
deleted file mode 100644
index 2000feecd571..000000000000
--- a/arch/frv/lib/perf_counter.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance counter handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/perf_counter.h>
-
-/*
- * mark the performance counter as pending
- */
-void set_perf_counter_pending(void)
-{
-}
diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c
new file mode 100644
index 000000000000..9ac5acfd2e91
--- /dev/null
+++ b/arch/frv/lib/perf_event.c
@@ -0,0 +1,19 @@
+/* Performance event handling
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/perf_event.h>
+
+/*
+ * mark the performance event as pending
+ */
+void set_perf_event_pending(void)
+{
+}
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 946d8691f2b0..48b87f5ced50 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -335,7 +335,7 @@
 #define __NR_preadv		329
 #define __NR_pwritev		330
 #define __NR_rt_tgsigqueueinfo	331
-#define __NR_perf_counter_open	332
+#define __NR_perf_event_open	332
 
 #ifdef __KERNEL__
 
diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index 922f52e7ed1a..c5b33634c980 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -756,5 +756,5 @@ sys_call_table:
 	.long sys_preadv
 	.long sys_pwritev		/* 330 */
 	.long sys_rt_tgsigqueueinfo
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
 
diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S
index 0ae123e08985..23535cc415ae 100644
--- a/arch/m68knommu/kernel/syscalltable.S
+++ b/arch/m68knommu/kernel/syscalltable.S
@@ -350,7 +350,7 @@ ENTRY(sys_call_table)
 	.long sys_preadv
 	.long sys_pwritev		/* 330 */
 	.long sys_rt_tgsigqueueinfo
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long sys_ni_syscall
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h
index 0b852327c0e7..cb05a07e55e9 100644
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -381,7 +381,7 @@
 #define __NR_preadv		363 /* new */
 #define __NR_pwritev		364 /* new */
 #define __NR_rt_tgsigqueueinfo	365 /* new */
-#define __NR_perf_counter_open	366 /* new */
+#define __NR_perf_event_open	366 /* new */
 
 #define __NR_syscalls		367
 
diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S
index 457216097dfd..ecec19155135 100644
--- a/arch/microblaze/kernel/syscall_table.S
+++ b/arch/microblaze/kernel/syscall_table.S
@@ -370,4 +370,4 @@ ENTRY(sys_call_table)
 	.long sys_ni_syscall
 	.long sys_ni_syscall
 	.long sys_rt_tgsigqueueinfo	/* 365 */
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h
index e753a777949b..8c9dfa9e9018 100644
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -353,7 +353,7 @@
 #define __NR_preadv			(__NR_Linux + 330)
 #define __NR_pwritev			(__NR_Linux + 331)
 #define __NR_rt_tgsigqueueinfo		(__NR_Linux + 332)
-#define __NR_perf_counter_open		(__NR_Linux + 333)
+#define __NR_perf_event_open		(__NR_Linux + 333)
 #define __NR_accept4			(__NR_Linux + 334)
 
 /*
@@ -664,7 +664,7 @@
 #define __NR_preadv			(__NR_Linux + 289)
 #define __NR_pwritev			(__NR_Linux + 290)
 #define __NR_rt_tgsigqueueinfo		(__NR_Linux + 291)
-#define __NR_perf_counter_open		(__NR_Linux + 292)
+#define __NR_perf_event_open		(__NR_Linux + 292)
 #define __NR_accept4			(__NR_Linux + 293)
 
 /*
@@ -979,7 +979,7 @@
 #define __NR_preadv			(__NR_Linux + 293)
 #define __NR_pwritev			(__NR_Linux + 294)
 #define __NR_rt_tgsigqueueinfo		(__NR_Linux + 295)
-#define __NR_perf_counter_open		(__NR_Linux + 296)
+#define __NR_perf_event_open		(__NR_Linux + 296)
 #define __NR_accept4			(__NR_Linux + 297)
 
 /*
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 7c2de4f091c4..fd2a9bb620d6 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -581,7 +581,7 @@ einval:	li	v0, -ENOSYS
 	sys	sys_preadv		6	/* 4330 */
 	sys	sys_pwritev		6
 	sys	sys_rt_tgsigqueueinfo	4
-	sys	sys_perf_counter_open	5
+	sys	sys_perf_event_open	5
 	sys	sys_accept4		4
 	.endm
 
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index b97b993846d6..18bf7f32c5e4 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -418,6 +418,6 @@ sys_call_table:
 	PTR	sys_preadv
 	PTR	sys_pwritev			/* 5390 */
 	PTR	sys_rt_tgsigqueueinfo
-	PTR	sys_perf_counter_open
+	PTR	sys_perf_event_open
 	PTR	sys_accept4
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 1a6ae124635b..6ebc07976694 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -416,6 +416,6 @@ EXPORT(sysn32_call_table)
 	PTR	sys_preadv
 	PTR	sys_pwritev
 	PTR	compat_sys_rt_tgsigqueueinfo	/* 5295 */
-	PTR	sys_perf_counter_open
+	PTR	sys_perf_event_open
 	PTR	sys_accept4
 	.size	sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index cd31087a651f..9bbf9775e0bd 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -536,6 +536,6 @@ sys_call_table:
 	PTR	compat_sys_preadv		/* 4330 */
 	PTR	compat_sys_pwritev
 	PTR	compat_sys_rt_tgsigqueueinfo
-	PTR	sys_perf_counter_open
+	PTR	sys_perf_event_open
 	PTR	sys_accept4
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h
index fad68616af32..2a983931c11f 100644
--- a/arch/mn10300/include/asm/unistd.h
+++ b/arch/mn10300/include/asm/unistd.h
@@ -347,7 +347,7 @@
 #define __NR_preadv		334
 #define __NR_pwritev		335
 #define __NR_rt_tgsigqueueinfo	336
-#define __NR_perf_counter_open	337
+#define __NR_perf_event_open	337
 
 #ifdef __KERNEL__
 
diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S
index e0d2563af4f2..a94e7ea3faa6 100644
--- a/arch/mn10300/kernel/entry.S
+++ b/arch/mn10300/kernel/entry.S
@@ -723,7 +723,7 @@ ENTRY(sys_call_table)
 	.long sys_preadv
 	.long sys_pwritev		/* 335 */
 	.long sys_rt_tgsigqueueinfo
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
 
 
 nr_syscalls=(.-sys_call_table)/4
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 06f8d5b5b0f9..f388dc68f605 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -16,7 +16,7 @@ config PARISC
 	select RTC_DRV_GENERIC
 	select INIT_ALL_POSSIBLE
 	select BUG
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_EVENTS
 	select GENERIC_ATOMIC64 if !64BIT
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/parisc/include/asm/perf_counter.h b/arch/parisc/include/asm/perf_counter.h
deleted file mode 100644
index dc9e829f7013..000000000000
--- a/arch/parisc/include/asm/perf_counter.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __ASM_PARISC_PERF_COUNTER_H
-#define __ASM_PARISC_PERF_COUNTER_H
-
-/* parisc only supports software counters through this interface. */
-static inline void set_perf_counter_pending(void) { }
-
-#endif /* __ASM_PARISC_PERF_COUNTER_H */
diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h
new file mode 100644
index 000000000000..cc146427d8f9
--- /dev/null
+++ b/arch/parisc/include/asm/perf_event.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_PARISC_PERF_EVENT_H
+#define __ASM_PARISC_PERF_EVENT_H
+
+/* parisc only supports software events through this interface. */
+static inline void set_perf_event_pending(void) { }
+
+#endif /* __ASM_PARISC_PERF_EVENT_H */
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index f3d3b8b012c4..cda158318c62 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -810,9 +810,9 @@
 #define __NR_preadv		(__NR_Linux + 315)
 #define __NR_pwritev		(__NR_Linux + 316)
 #define __NR_rt_tgsigqueueinfo	(__NR_Linux + 317)
-#define __NR_perf_counter_open	(__NR_Linux + 318)
+#define __NR_perf_event_open	(__NR_Linux + 318)
 
-#define __NR_Linux_syscalls	(__NR_perf_counter_open + 1)
+#define __NR_Linux_syscalls	(__NR_perf_event_open + 1)
 
 
 #define __IGNORE_select		/* newselect */
diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S
index cf145eb026b3..843f423dec67 100644
--- a/arch/parisc/kernel/syscall_table.S
+++ b/arch/parisc/kernel/syscall_table.S
@@ -416,7 +416,7 @@
 	ENTRY_COMP(preadv)		/* 315 */
 	ENTRY_COMP(pwritev)
 	ENTRY_COMP(rt_tgsigqueueinfo)
-	ENTRY_SAME(perf_counter_open)
+	ENTRY_SAME(perf_event_open)
 
 	/* Nothing yet */
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8250902265c6..4fd479059d65 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,7 +129,7 @@ config PPC
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS if PPC64
 	select GENERIC_ATOMIC64 if PPC32
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_EVENTS
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index e73d554538dd..abbc2aaaced5 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -135,43 +135,43 @@ static inline int irqs_disabled_flags(unsigned long flags)
  */
 struct irq_chip;
 
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
 
 #ifdef CONFIG_PPC64
-static inline unsigned long test_perf_counter_pending(void)
+static inline unsigned long test_perf_event_pending(void)
 {
 	unsigned long x;
 
 	asm volatile("lbz %0,%1(13)"
 		: "=r" (x)
-		: "i" (offsetof(struct paca_struct, perf_counter_pending)));
+		: "i" (offsetof(struct paca_struct, perf_event_pending)));
 	return x;
 }
 
-static inline void set_perf_counter_pending(void)
+static inline void set_perf_event_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (1),
-		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+		"i" (offsetof(struct paca_struct, perf_event_pending)));
 }
 
-static inline void clear_perf_counter_pending(void)
+static inline void clear_perf_event_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (0),
-		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+		"i" (offsetof(struct paca_struct, perf_event_pending)));
 }
 #endif /* CONFIG_PPC64 */
 
-#else  /* CONFIG_PERF_COUNTERS */
+#else  /* CONFIG_PERF_EVENTS */
 
-static inline unsigned long test_perf_counter_pending(void)
+static inline unsigned long test_perf_event_pending(void)
 {
 	return 0;
 }
 
-static inline void clear_perf_counter_pending(void) {}
-#endif /* CONFIG_PERF_COUNTERS */
+static inline void clear_perf_event_pending(void) {}
+#endif /* CONFIG_PERF_EVENTS */
 
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index b634456ea893..154f405b642f 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -122,7 +122,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
-	u8 perf_counter_pending;	/* PM interrupt while soft-disabled */
+	u8 perf_event_pending;	/* PM interrupt while soft-disabled */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
deleted file mode 100644
index 0ea0639fcf75..000000000000
--- a/arch/powerpc/include/asm/perf_counter.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Performance counter support - PowerPC-specific definitions.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/types.h>
-
-#include <asm/hw_irq.h>
-
-#define MAX_HWCOUNTERS		8
-#define MAX_EVENT_ALTERNATIVES	8
-#define MAX_LIMITED_HWCOUNTERS	2
-
-/*
- * This struct provides the constants and functions needed to
- * describe the PMU on a particular POWER-family CPU.
- */
-struct power_pmu {
-	const char	*name;
-	int		n_counter;
-	int		max_alternatives;
-	unsigned long	add_fields;
-	unsigned long	test_adder;
-	int		(*compute_mmcr)(u64 events[], int n_ev,
-				unsigned int hwc[], unsigned long mmcr[]);
-	int		(*get_constraint)(u64 event, unsigned long *mskp,
-				unsigned long *valp);
-	int		(*get_alternatives)(u64 event, unsigned int flags,
-				u64 alt[]);
-	void		(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
-	int		(*limited_pmc_event)(u64 event);
-	u32		flags;
-	int		n_generic;
-	int		*generic_events;
-	int		(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
-			       [PERF_COUNT_HW_CACHE_OP_MAX]
-			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
-};
-
-/*
- * Values for power_pmu.flags
- */
-#define PPMU_LIMITED_PMC5_6	1	/* PMC5/6 have limited function */
-#define PPMU_ALT_SIPR		2	/* uses alternate posn for SIPR/HV */
-
-/*
- * Values for flags to get_alternatives()
- */
-#define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */
-#define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
-#define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
-
-extern int register_power_pmu(struct power_pmu *);
-
-struct pt_regs;
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-
-#define PERF_COUNTER_INDEX_OFFSET	1
-
-/*
- * Only override the default definitions in include/linux/perf_counter.h
- * if we have hardware PMU support.
- */
-#ifdef CONFIG_PPC_PERF_CTRS
-#define perf_misc_flags(regs)	perf_misc_flags(regs)
-#endif
-
-/*
- * The power_pmu.get_constraint function returns a 32/64-bit value and
- * a 32/64-bit mask that express the constraints between this event and
- * other events.
- *
- * The value and mask are divided up into (non-overlapping) bitfields
- * of three different types:
- *
- * Select field: this expresses the constraint that some set of bits
- * in MMCR* needs to be set to a specific value for this event.  For a
- * select field, the mask contains 1s in every bit of the field, and
- * the value contains a unique value for each possible setting of the
- * MMCR* bits.  The constraint checking code will ensure that two events
- * that set the same field in their masks have the same value in their
- * value dwords.
- *
- * Add field: this expresses the constraint that there can be at most
- * N events in a particular class.  A field of k bits can be used for
- * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
- * set (and the other bits 0), and the value has only the least significant
- * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
- * in the struct power_pmu for this processor come into play.  The
- * add_fields value contains 1 in the LSB of the field, and the
- * test_adder contains 2^(k-1) - 1 - N in the field.
- *
- * NAND field: this expresses the constraint that you may not have events
- * in all of a set of classes.  (For example, on PPC970, you can't select
- * events from the FPU, ISU and IDU simultaneously, although any two are
- * possible.)  For N classes, the field is N+1 bits wide, and each class
- * is assigned one bit from the least-significant N bits.  The mask has
- * only the most-significant bit set, and the value has only the bit
- * for the event's class set.  The test_adder has the least significant
- * bit set in the field.
- *
- * If an event is not subject to the constraint expressed by a particular
- * field, then it will have 0 in both the mask and value for that field.
- */
diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
new file mode 100644
index 000000000000..2499aaadaeb9
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -0,0 +1,110 @@
+/*
+ * Performance event support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+
+#include <asm/hw_irq.h>
+
+#define MAX_HWEVENTS		8
+#define MAX_EVENT_ALTERNATIVES	8
+#define MAX_LIMITED_HWEVENTS	2
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+	const char	*name;
+	int		n_event;
+	int		max_alternatives;
+	unsigned long	add_fields;
+	unsigned long	test_adder;
+	int		(*compute_mmcr)(u64 events[], int n_ev,
+				unsigned int hwc[], unsigned long mmcr[]);
+	int		(*get_constraint)(u64 event_id, unsigned long *mskp,
+				unsigned long *valp);
+	int		(*get_alternatives)(u64 event_id, unsigned int flags,
+				u64 alt[]);
+	void		(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
+	int		(*limited_pmc_event)(u64 event_id);
+	u32		flags;
+	int		n_generic;
+	int		*generic_events;
+	int		(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+			       [PERF_COUNT_HW_CACHE_OP_MAX]
+			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
+};
+
+/*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6	1	/* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR		2	/* uses alternate posn for SIPR/HV */
+
+/*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
+
+extern int register_power_pmu(struct power_pmu *);
+
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
+#define PERF_EVENT_INDEX_OFFSET	1
+
+/*
+ * Only override the default definitions in include/linux/perf_event.h
+ * if we have hardware PMU support.
+ */
+#ifdef CONFIG_PPC_PERF_CTRS
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
+#endif
+
+/*
+ * The power_pmu.get_constraint function returns a 32/64-bit value and
+ * a 32/64-bit mask that express the constraints between this event_id and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event_id.  For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits.  The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class.  A field of k bits can be used for
+ * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play.  The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes.  (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.)  For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits.  The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event_id's class set.  The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event_id is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index ed24bd92fe49..c7d671a7d9a1 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,7 +322,7 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
-SYSCALL_SPU(perf_counter_open)
+SYSCALL_SPU(perf_event_open)
 COMPAT_SYS_SPU(preadv)
 COMPAT_SYS_SPU(pwritev)
 COMPAT_SYS(rt_tgsigqueueinfo)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index cef080bfc607..f6ca76176766 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,7 +341,7 @@
 #define __NR_dup3		316
 #define __NR_pipe2		317
 #define __NR_inotify_init1	318
-#define __NR_perf_counter_open	319
+#define __NR_perf_event_open	319
 #define __NR_preadv		320
 #define __NR_pwritev		321
 #define __NR_rt_tgsigqueueinfo	322
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 569f79ccd310..b23664a0b86c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
-obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_counter.o perf_callchain.o
+obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_event.o perf_callchain.o
 obj64-$(CONFIG_PPC_PERF_CTRS)	+= power4-pmu.o ppc970-pmu.o power5-pmu.o \
 				   power5+-pmu.o power6-pmu.o power7-pmu.o
 obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index f0df285f0f87..0812b0f414bb 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -133,7 +133,7 @@ int main(void)
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
-	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
+	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
 #ifdef CONFIG_PPC_MM_SLICES
 	DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 66bcda34a6bb..900e0eea0099 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -556,14 +556,14 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
 	TRACE_AND_RESTORE_IRQ(r5);
 
-#ifdef CONFIG_PERF_COUNTERS
-	/* check paca->perf_counter_pending if we're enabling ints */
+#ifdef CONFIG_PERF_EVENTS
+	/* check paca->perf_event_pending if we're enabling ints */
 	lbz	r3,PACAPERFPEND(r13)
 	and.	r3,r3,r5
 	beq	27f
-	bl	.perf_counter_do_pending
+	bl	.perf_event_do_pending
 27:
-#endif /* CONFIG_PERF_COUNTERS */
+#endif /* CONFIG_PERF_EVENTS */
 
 	/* extract EE bit and use it to restore paca->hard_enabled */
 	ld	r3,_MSR(r1)
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index f7f376ea7b17..e5d121177984 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -53,7 +53,7 @@
 #include <linux/bootmem.h>
 #include <linux/pci.h>
 #include <linux/debugfs.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -138,9 +138,9 @@ notrace void raw_local_irq_restore(unsigned long en)
 	}
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
-	if (test_perf_counter_pending()) {
-		clear_perf_counter_pending();
-		perf_counter_do_pending();
+	if (test_perf_event_pending()) {
+		clear_perf_event_pending();
+		perf_event_do_pending();
 	}
 
 	/*
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c
index cc466d039af6..09d72028f317 100644
--- a/arch/powerpc/kernel/mpc7450-pmu.c
+++ b/arch/powerpc/kernel/mpc7450-pmu.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/string.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <asm/reg.h>
 #include <asm/cputable.h>
 
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
index f74b62c67511..0a03cf70d247 100644
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -10,7 +10,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/percpu.h>
 #include <linux/uaccess.h>
 #include <linux/mm.h>
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
deleted file mode 100644
index 5ccf9bca96c0..000000000000
--- a/arch/powerpc/kernel/perf_counter.c
+++ /dev/null
@@ -1,1315 +0,0 @@
-/*
- * Performance counter support - powerpc architecture code
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/perf_counter.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/reg.h>
-#include <asm/pmc.h>
-#include <asm/machdep.h>
-#include <asm/firmware.h>
-#include <asm/ptrace.h>
-
-struct cpu_hw_counters {
-	int n_counters;
-	int n_percpu;
-	int disabled;
-	int n_added;
-	int n_limited;
-	u8  pmcs_enabled;
-	struct perf_counter *counter[MAX_HWCOUNTERS];
-	u64 events[MAX_HWCOUNTERS];
-	unsigned int flags[MAX_HWCOUNTERS];
-	unsigned long mmcr[3];
-	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
-	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
-	u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-	unsigned long amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-	unsigned long avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-};
-DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
-
-struct power_pmu *ppmu;
-
-/*
- * Normally, to ignore kernel events we set the FCS (freeze counters
- * in supervisor mode) bit in MMCR0, but if the kernel runs with the
- * hypervisor bit set in the MSR, or if we are running on a processor
- * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
- * then we need to use the FCHV bit to ignore kernel events.
- */
-static unsigned int freeze_counters_kernel = MMCR0_FCS;
-
-/*
- * 32-bit doesn't have MMCRA but does have an MMCR2,
- * and a few other names are different.
- */
-#ifdef CONFIG_PPC32
-
-#define MMCR0_FCHV		0
-#define MMCR0_PMCjCE		MMCR0_PMCnCE
-
-#define SPRN_MMCRA		SPRN_MMCR2
-#define MMCRA_SAMPLE_ENABLE	0
-
-static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
-{
-	return 0;
-}
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
-static inline u32 perf_get_misc_flags(struct pt_regs *regs)
-{
-	return 0;
-}
-static inline void perf_read_regs(struct pt_regs *regs) { }
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-	return 0;
-}
-
-#endif /* CONFIG_PPC32 */
-
-/*
- * Things that are specific to 64-bit implementations.
- */
-#ifdef CONFIG_PPC64
-
-static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
-{
-	unsigned long mmcra = regs->dsisr;
-
-	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
-		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
-		if (slot > 1)
-			return 4 * (slot - 1);
-	}
-	return 0;
-}
-
-/*
- * The user wants a data address recorded.
- * If we're not doing instruction sampling, give them the SDAR
- * (sampled data address).  If we are doing instruction sampling, then
- * only give them the SDAR if it corresponds to the instruction
- * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
- * bit in MMCRA.
- */
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
-{
-	unsigned long mmcra = regs->dsisr;
-	unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
-		POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
-
-	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
-		*addrp = mfspr(SPRN_SDAR);
-}
-
-static inline u32 perf_get_misc_flags(struct pt_regs *regs)
-{
-	unsigned long mmcra = regs->dsisr;
-
-	if (TRAP(regs) != 0xf00)
-		return 0;	/* not a PMU interrupt */
-
-	if (ppmu->flags & PPMU_ALT_SIPR) {
-		if (mmcra & POWER6_MMCRA_SIHV)
-			return PERF_EVENT_MISC_HYPERVISOR;
-		return (mmcra & POWER6_MMCRA_SIPR) ?
-			PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
-	}
-	if (mmcra & MMCRA_SIHV)
-		return PERF_EVENT_MISC_HYPERVISOR;
-	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
-		PERF_EVENT_MISC_KERNEL;
-}
-
-/*
- * Overload regs->dsisr to store MMCRA so we only need to read it once
- * on each interrupt.
- */
-static inline void perf_read_regs(struct pt_regs *regs)
-{
-	regs->dsisr = mfspr(SPRN_MMCRA);
-}
-
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-	return !regs->softe;
-}
-
-#endif /* CONFIG_PPC64 */
-
-static void perf_counter_interrupt(struct pt_regs *regs);
-
-void perf_counter_print_debug(void)
-{
-}
-
-/*
- * Read one performance monitor counter (PMC).
- */
-static unsigned long read_pmc(int idx)
-{
-	unsigned long val;
-
-	switch (idx) {
-	case 1:
-		val = mfspr(SPRN_PMC1);
-		break;
-	case 2:
-		val = mfspr(SPRN_PMC2);
-		break;
-	case 3:
-		val = mfspr(SPRN_PMC3);
-		break;
-	case 4:
-		val = mfspr(SPRN_PMC4);
-		break;
-	case 5:
-		val = mfspr(SPRN_PMC5);
-		break;
-	case 6:
-		val = mfspr(SPRN_PMC6);
-		break;
-#ifdef CONFIG_PPC64
-	case 7:
-		val = mfspr(SPRN_PMC7);
-		break;
-	case 8:
-		val = mfspr(SPRN_PMC8);
-		break;
-#endif /* CONFIG_PPC64 */
-	default:
-		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
-		val = 0;
-	}
-	return val;
-}
-
-/*
- * Write one PMC.
- */
-static void write_pmc(int idx, unsigned long val)
-{
-	switch (idx) {
-	case 1:
-		mtspr(SPRN_PMC1, val);
-		break;
-	case 2:
-		mtspr(SPRN_PMC2, val);
-		break;
-	case 3:
-		mtspr(SPRN_PMC3, val);
-		break;
-	case 4:
-		mtspr(SPRN_PMC4, val);
-		break;
-	case 5:
-		mtspr(SPRN_PMC5, val);
-		break;
-	case 6:
-		mtspr(SPRN_PMC6, val);
-		break;
-#ifdef CONFIG_PPC64
-	case 7:
-		mtspr(SPRN_PMC7, val);
-		break;
-	case 8:
-		mtspr(SPRN_PMC8, val);
-		break;
-#endif /* CONFIG_PPC64 */
-	default:
-		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
-	}
-}
-
-/*
- * Check if a set of events can all go on the PMU at once.
- * If they can't, this will look at alternative codes for the events
- * and see if any combination of alternative codes is feasible.
- * The feasible set is returned in event[].
- */
-static int power_check_constraints(struct cpu_hw_counters *cpuhw,
-				   u64 event[], unsigned int cflags[],
-				   int n_ev)
-{
-	unsigned long mask, value, nv;
-	unsigned long smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
-	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
-	int i, j;
-	unsigned long addf = ppmu->add_fields;
-	unsigned long tadd = ppmu->test_adder;
-
-	if (n_ev > ppmu->n_counter)
-		return -1;
-
-	/* First see if the events will go on as-is */
-	for (i = 0; i < n_ev; ++i) {
-		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
-		    && !ppmu->limited_pmc_event(event[i])) {
-			ppmu->get_alternatives(event[i], cflags[i],
-					       cpuhw->alternatives[i]);
-			event[i] = cpuhw->alternatives[i][0];
-		}
-		if (ppmu->get_constraint(event[i], &cpuhw->amasks[i][0],
-					 &cpuhw->avalues[i][0]))
-			return -1;
-	}
-	value = mask = 0;
-	for (i = 0; i < n_ev; ++i) {
-		nv = (value | cpuhw->avalues[i][0]) +
-			(value & cpuhw->avalues[i][0] & addf);
-		if ((((nv + tadd) ^ value) & mask) != 0 ||
-		    (((nv + tadd) ^ cpuhw->avalues[i][0]) &
-		     cpuhw->amasks[i][0]) != 0)
-			break;
-		value = nv;
-		mask |= cpuhw->amasks[i][0];
-	}
-	if (i == n_ev)
-		return 0;	/* all OK */
-
-	/* doesn't work, gather alternatives... */
-	if (!ppmu->get_alternatives)
-		return -1;
-	for (i = 0; i < n_ev; ++i) {
-		choice[i] = 0;
-		n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
-						  cpuhw->alternatives[i]);
-		for (j = 1; j < n_alt[i]; ++j)
-			ppmu->get_constraint(cpuhw->alternatives[i][j],
-					     &cpuhw->amasks[i][j],
-					     &cpuhw->avalues[i][j]);
-	}
-
-	/* enumerate all possibilities and see if any will work */
-	i = 0;
-	j = -1;
-	value = mask = nv = 0;
-	while (i < n_ev) {
-		if (j >= 0) {
-			/* we're backtracking, restore context */
-			value = svalues[i];
-			mask = smasks[i];
-			j = choice[i];
-		}
-		/*
-		 * See if any alternative k for event i,
-		 * where k > j, will satisfy the constraints.
-		 */
-		while (++j < n_alt[i]) {
-			nv = (value | cpuhw->avalues[i][j]) +
-				(value & cpuhw->avalues[i][j] & addf);
-			if ((((nv + tadd) ^ value) & mask) == 0 &&
-			    (((nv + tadd) ^ cpuhw->avalues[i][j])
-			     & cpuhw->amasks[i][j]) == 0)
-				break;
-		}
-		if (j >= n_alt[i]) {
-			/*
-			 * No feasible alternative, backtrack
-			 * to event i-1 and continue enumerating its
-			 * alternatives from where we got up to.
-			 */
-			if (--i < 0)
-				return -1;
-		} else {
-			/*
-			 * Found a feasible alternative for event i,
-			 * remember where we got up to with this event,
-			 * go on to the next event, and start with
-			 * the first alternative for it.
-			 */
-			choice[i] = j;
-			svalues[i] = value;
-			smasks[i] = mask;
-			value = nv;
-			mask |= cpuhw->amasks[i][j];
-			++i;
-			j = -1;
-		}
-	}
-
-	/* OK, we have a feasible combination, tell the caller the solution */
-	for (i = 0; i < n_ev; ++i)
-		event[i] = cpuhw->alternatives[i][choice[i]];
-	return 0;
-}
-
-/*
- * Check if newly-added counters have consistent settings for
- * exclude_{user,kernel,hv} with each other and any previously
- * added counters.
- */
-static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
-			  int n_prev, int n_new)
-{
-	int eu = 0, ek = 0, eh = 0;
-	int i, n, first;
-	struct perf_counter *counter;
-
-	n = n_prev + n_new;
-	if (n <= 1)
-		return 0;
-
-	first = 1;
-	for (i = 0; i < n; ++i) {
-		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
-			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
-			continue;
-		}
-		counter = ctrs[i];
-		if (first) {
-			eu = counter->attr.exclude_user;
-			ek = counter->attr.exclude_kernel;
-			eh = counter->attr.exclude_hv;
-			first = 0;
-		} else if (counter->attr.exclude_user != eu ||
-			   counter->attr.exclude_kernel != ek ||
-			   counter->attr.exclude_hv != eh) {
-			return -EAGAIN;
-		}
-	}
-
-	if (eu || ek || eh)
-		for (i = 0; i < n; ++i)
-			if (cflags[i] & PPMU_LIMITED_PMC_OK)
-				cflags[i] |= PPMU_LIMITED_PMC_REQD;
-
-	return 0;
-}
-
-static void power_pmu_read(struct perf_counter *counter)
-{
-	s64 val, delta, prev;
-
-	if (!counter->hw.idx)
-		return;
-	/*
-	 * Performance monitor interrupts come even when interrupts
-	 * are soft-disabled, as long as interrupts are hard-enabled.
-	 * Therefore we treat them like NMIs.
-	 */
-	do {
-		prev = atomic64_read(&counter->hw.prev_count);
-		barrier();
-		val = read_pmc(counter->hw.idx);
-	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
-
-	/* The counters are only 32 bits wide */
-	delta = (val - prev) & 0xfffffffful;
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &counter->hw.period_left);
-}
-
-/*
- * On some machines, PMC5 and PMC6 can't be written, don't respect
- * the freeze conditions, and don't generate interrupts.  This tells
- * us if `counter' is using such a PMC.
- */
-static int is_limited_pmc(int pmcnum)
-{
-	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
-		&& (pmcnum == 5 || pmcnum == 6);
-}
-
-static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
-				    unsigned long pmc5, unsigned long pmc6)
-{
-	struct perf_counter *counter;
-	u64 val, prev, delta;
-	int i;
-
-	for (i = 0; i < cpuhw->n_limited; ++i) {
-		counter = cpuhw->limited_counter[i];
-		if (!counter->hw.idx)
-			continue;
-		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
-		prev = atomic64_read(&counter->hw.prev_count);
-		counter->hw.idx = 0;
-		delta = (val - prev) & 0xfffffffful;
-		atomic64_add(delta, &counter->count);
-	}
-}
-
-static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
-				  unsigned long pmc5, unsigned long pmc6)
-{
-	struct perf_counter *counter;
-	u64 val;
-	int i;
-
-	for (i = 0; i < cpuhw->n_limited; ++i) {
-		counter = cpuhw->limited_counter[i];
-		counter->hw.idx = cpuhw->limited_hwidx[i];
-		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
-		atomic64_set(&counter->hw.prev_count, val);
-		perf_counter_update_userpage(counter);
-	}
-}
-
-/*
- * Since limited counters don't respect the freeze conditions, we
- * have to read them immediately after freezing or unfreezing the
- * other counters.  We try to keep the values from the limited
- * counters as consistent as possible by keeping the delay (in
- * cycles and instructions) between freezing/unfreezing and reading
- * the limited counters as small and consistent as possible.
- * Therefore, if any limited counters are in use, we read them
- * both, and always in the same order, to minimize variability,
- * and do it inside the same asm that writes MMCR0.
- */
-static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
-{
-	unsigned long pmc5, pmc6;
-
-	if (!cpuhw->n_limited) {
-		mtspr(SPRN_MMCR0, mmcr0);
-		return;
-	}
-
-	/*
-	 * Write MMCR0, then read PMC5 and PMC6 immediately.
-	 * To ensure we don't get a performance monitor interrupt
-	 * between writing MMCR0 and freezing/thawing the limited
-	 * counters, we first write MMCR0 with the counter overflow
-	 * interrupt enable bits turned off.
-	 */
-	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
-		     : "=&r" (pmc5), "=&r" (pmc6)
-		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
-		       "i" (SPRN_MMCR0),
-		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
-
-	if (mmcr0 & MMCR0_FC)
-		freeze_limited_counters(cpuhw, pmc5, pmc6);
-	else
-		thaw_limited_counters(cpuhw, pmc5, pmc6);
-
-	/*
-	 * Write the full MMCR0 including the counter overflow interrupt
-	 * enable bits, if necessary.
-	 */
-	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
-		mtspr(SPRN_MMCR0, mmcr0);
-}
-
-/*
- * Disable all counters to prevent PMU interrupts and to allow
- * counters to be added or removed.
- */
-void hw_perf_disable(void)
-{
-	struct cpu_hw_counters *cpuhw;
-	unsigned long flags;
-
-	if (!ppmu)
-		return;
-	local_irq_save(flags);
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
-
-	if (!cpuhw->disabled) {
-		cpuhw->disabled = 1;
-		cpuhw->n_added = 0;
-
-		/*
-		 * Check if we ever enabled the PMU on this cpu.
-		 */
-		if (!cpuhw->pmcs_enabled) {
-			ppc_enable_pmcs();
-			cpuhw->pmcs_enabled = 1;
-		}
-
-		/*
-		 * Disable instruction sampling if it was enabled
-		 */
-		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
-			mtspr(SPRN_MMCRA,
-			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-			mb();
-		}
-
-		/*
-		 * Set the 'freeze counters' bit.
-		 * The barrier is to make sure the mtspr has been
-		 * executed and the PMU has frozen the counters
-		 * before we return.
-		 */
-		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
-		mb();
-	}
-	local_irq_restore(flags);
-}
-
-/*
- * Re-enable all counters if disable == 0.
- * If we were previously disabled and counters were added, then
- * put the new config on the PMU.
- */
-void hw_perf_enable(void)
-{
-	struct perf_counter *counter;
-	struct cpu_hw_counters *cpuhw;
-	unsigned long flags;
-	long i;
-	unsigned long val;
-	s64 left;
-	unsigned int hwc_index[MAX_HWCOUNTERS];
-	int n_lim;
-	int idx;
-
-	if (!ppmu)
-		return;
-	local_irq_save(flags);
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
-	if (!cpuhw->disabled) {
-		local_irq_restore(flags);
-		return;
-	}
-	cpuhw->disabled = 0;
-
-	/*
-	 * If we didn't change anything, or only removed counters,
-	 * no need to recalculate MMCR* settings and reset the PMCs.
-	 * Just reenable the PMU with the current MMCR* settings
-	 * (possibly updated for removal of counters).
-	 */
-	if (!cpuhw->n_added) {
-		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-		if (cpuhw->n_counters == 0)
-			ppc_set_pmu_inuse(0);
-		goto out_enable;
-	}
-
-	/*
-	 * Compute MMCR* values for the new set of counters
-	 */
-	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
-			       cpuhw->mmcr)) {
-		/* shouldn't ever get here */
-		printk(KERN_ERR "oops compute_mmcr failed\n");
-		goto out;
-	}
-
-	/*
-	 * Add in MMCR0 freeze bits corresponding to the
-	 * attr.exclude_* bits for the first counter.
-	 * We have already checked that all counters have the
-	 * same values for these bits as the first counter.
-	 */
-	counter = cpuhw->counter[0];
-	if (counter->attr.exclude_user)
-		cpuhw->mmcr[0] |= MMCR0_FCP;
-	if (counter->attr.exclude_kernel)
-		cpuhw->mmcr[0] |= freeze_counters_kernel;
-	if (counter->attr.exclude_hv)
-		cpuhw->mmcr[0] |= MMCR0_FCHV;
-
-	/*
-	 * Write the new configuration to MMCR* with the freeze
-	 * bit set and set the hardware counters to their initial values.
-	 * Then unfreeze the counters.
-	 */
-	ppc_set_pmu_inuse(1);
-	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
-				| MMCR0_FC);
-
-	/*
-	 * Read off any pre-existing counters that need to move
-	 * to another PMC.
-	 */
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		counter = cpuhw->counter[i];
-		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
-			power_pmu_read(counter);
-			write_pmc(counter->hw.idx, 0);
-			counter->hw.idx = 0;
-		}
-	}
-
-	/*
-	 * Initialize the PMCs for all the new and moved counters.
-	 */
-	cpuhw->n_limited = n_lim = 0;
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		counter = cpuhw->counter[i];
-		if (counter->hw.idx)
-			continue;
-		idx = hwc_index[i] + 1;
-		if (is_limited_pmc(idx)) {
-			cpuhw->limited_counter[n_lim] = counter;
-			cpuhw->limited_hwidx[n_lim] = idx;
-			++n_lim;
-			continue;
-		}
-		val = 0;
-		if (counter->hw.sample_period) {
-			left = atomic64_read(&counter->hw.period_left);
-			if (left < 0x80000000L)
-				val = 0x80000000L - left;
-		}
-		atomic64_set(&counter->hw.prev_count, val);
-		counter->hw.idx = idx;
-		write_pmc(idx, val);
-		perf_counter_update_userpage(counter);
-	}
-	cpuhw->n_limited = n_lim;
-	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
-
- out_enable:
-	mb();
-	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
-
-	/*
-	 * Enable instruction sampling if necessary
-	 */
-	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
-		mb();
-		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
-	}
-
- out:
-	local_irq_restore(flags);
-}
-
-static int collect_events(struct perf_counter *group, int max_count,
-			  struct perf_counter *ctrs[], u64 *events,
-			  unsigned int *flags)
-{
-	int n = 0;
-	struct perf_counter *counter;
-
-	if (!is_software_counter(group)) {
-		if (n >= max_count)
-			return -1;
-		ctrs[n] = group;
-		flags[n] = group->hw.counter_base;
-		events[n++] = group->hw.config;
-	}
-	list_for_each_entry(counter, &group->sibling_list, list_entry) {
-		if (!is_software_counter(counter) &&
-		    counter->state != PERF_COUNTER_STATE_OFF) {
-			if (n >= max_count)
-				return -1;
-			ctrs[n] = counter;
-			flags[n] = counter->hw.counter_base;
-			events[n++] = counter->hw.config;
-		}
-	}
-	return n;
-}
-
-static void counter_sched_in(struct perf_counter *counter, int cpu)
-{
-	counter->state = PERF_COUNTER_STATE_ACTIVE;
-	counter->oncpu = cpu;
-	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
-	if (is_software_counter(counter))
-		counter->pmu->enable(counter);
-}
-
-/*
- * Called to enable a whole group of counters.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- */
-int hw_perf_group_sched_in(struct perf_counter *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_counter_context *ctx, int cpu)
-{
-	struct cpu_hw_counters *cpuhw;
-	long i, n, n0;
-	struct perf_counter *sub;
-
-	if (!ppmu)
-		return 0;
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
-	n0 = cpuhw->n_counters;
-	n = collect_events(group_leader, ppmu->n_counter - n0,
-			   &cpuhw->counter[n0], &cpuhw->events[n0],
-			   &cpuhw->flags[n0]);
-	if (n < 0)
-		return -EAGAIN;
-	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
-		return -EAGAIN;
-	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0);
-	if (i < 0)
-		return -EAGAIN;
-	cpuhw->n_counters = n0 + n;
-	cpuhw->n_added += n;
-
-	/*
-	 * OK, this group can go on; update counter states etc.,
-	 * and enable any software counters
-	 */
-	for (i = n0; i < n0 + n; ++i)
-		cpuhw->counter[i]->hw.config = cpuhw->events[i];
-	cpuctx->active_oncpu += n;
-	n = 1;
-	counter_sched_in(group_leader, cpu);
-	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
-		if (sub->state != PERF_COUNTER_STATE_OFF) {
-			counter_sched_in(sub, cpu);
-			++n;
-		}
-	}
-	ctx->nr_active += n;
-
-	return 1;
-}
-
-/*
- * Add a counter to the PMU.
- * If all counters are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_enable to do the
- * actual work of reconfiguring the PMU.
- */
-static int power_pmu_enable(struct perf_counter *counter)
-{
-	struct cpu_hw_counters *cpuhw;
-	unsigned long flags;
-	int n0;
-	int ret = -EAGAIN;
-
-	local_irq_save(flags);
-	perf_disable();
-
-	/*
-	 * Add the counter to the list (if there is room)
-	 * and check whether the total set is still feasible.
-	 */
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
-	n0 = cpuhw->n_counters;
-	if (n0 >= ppmu->n_counter)
-		goto out;
-	cpuhw->counter[n0] = counter;
-	cpuhw->events[n0] = counter->hw.config;
-	cpuhw->flags[n0] = counter->hw.counter_base;
-	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
-		goto out;
-	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
-		goto out;
-
-	counter->hw.config = cpuhw->events[n0];
-	++cpuhw->n_counters;
-	++cpuhw->n_added;
-
-	ret = 0;
- out:
-	perf_enable();
-	local_irq_restore(flags);
-	return ret;
-}
-
-/*
- * Remove a counter from the PMU.
- */
-static void power_pmu_disable(struct perf_counter *counter)
-{
-	struct cpu_hw_counters *cpuhw;
-	long i;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	perf_disable();
-
-	power_pmu_read(counter);
-
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		if (counter == cpuhw->counter[i]) {
-			while (++i < cpuhw->n_counters)
-				cpuhw->counter[i-1] = cpuhw->counter[i];
-			--cpuhw->n_counters;
-			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
-			if (counter->hw.idx) {
-				write_pmc(counter->hw.idx, 0);
-				counter->hw.idx = 0;
-			}
-			perf_counter_update_userpage(counter);
-			break;
-		}
-	}
-	for (i = 0; i < cpuhw->n_limited; ++i)
-		if (counter == cpuhw->limited_counter[i])
-			break;
-	if (i < cpuhw->n_limited) {
-		while (++i < cpuhw->n_limited) {
-			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
-			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
-		}
-		--cpuhw->n_limited;
-	}
-	if (cpuhw->n_counters == 0) {
-		/* disable exceptions if no counters are running */
-		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
-	}
-
-	perf_enable();
-	local_irq_restore(flags);
-}
-
-/*
- * Re-enable interrupts on a counter after they were throttled
- * because they were coming too fast.
- */
-static void power_pmu_unthrottle(struct perf_counter *counter)
-{
-	s64 val, left;
-	unsigned long flags;
-
-	if (!counter->hw.idx || !counter->hw.sample_period)
-		return;
-	local_irq_save(flags);
-	perf_disable();
-	power_pmu_read(counter);
-	left = counter->hw.sample_period;
-	counter->hw.last_period = left;
-	val = 0;
-	if (left < 0x80000000L)
-		val = 0x80000000L - left;
-	write_pmc(counter->hw.idx, val);
-	atomic64_set(&counter->hw.prev_count, val);
-	atomic64_set(&counter->hw.period_left, left);
-	perf_counter_update_userpage(counter);
-	perf_enable();
-	local_irq_restore(flags);
-}
-
-struct pmu power_pmu = {
-	.enable		= power_pmu_enable,
-	.disable	= power_pmu_disable,
-	.read		= power_pmu_read,
-	.unthrottle	= power_pmu_unthrottle,
-};
-
-/*
- * Return 1 if we might be able to put counter on a limited PMC,
- * or 0 if not.
- * A counter can only go on a limited PMC if it counts something
- * that a limited PMC can count, doesn't require interrupts, and
- * doesn't exclude any processor mode.
- */
-static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
-				 unsigned int flags)
-{
-	int n;
-	u64 alt[MAX_EVENT_ALTERNATIVES];
-
-	if (counter->attr.exclude_user
-	    || counter->attr.exclude_kernel
-	    || counter->attr.exclude_hv
-	    || counter->attr.sample_period)
-		return 0;
-
-	if (ppmu->limited_pmc_event(ev))
-		return 1;
-
-	/*
-	 * The requested event isn't on a limited PMC already;
-	 * see if any alternative code goes on a limited PMC.
-	 */
-	if (!ppmu->get_alternatives)
-		return 0;
-
-	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
-	n = ppmu->get_alternatives(ev, flags, alt);
-
-	return n > 0;
-}
-
-/*
- * Find an alternative event that goes on a normal PMC, if possible,
- * and return the event code, or 0 if there is no such alternative.
- * (Note: event code 0 is "don't count" on all machines.)
- */
-static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
-{
-	u64 alt[MAX_EVENT_ALTERNATIVES];
-	int n;
-
-	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
-	n = ppmu->get_alternatives(ev, flags, alt);
-	if (!n)
-		return 0;
-	return alt[0];
-}
-
-/* Number of perf_counters counting hardware events */
-static atomic_t num_counters;
-/* Used to avoid races in calling reserve/release_pmc_hardware */
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-/*
- * Release the PMU if this is the last perf_counter.
- */
-static void hw_perf_counter_destroy(struct perf_counter *counter)
-{
-	if (!atomic_add_unless(&num_counters, -1, 1)) {
-		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_dec_return(&num_counters) == 0)
-			release_pmc_hardware();
-		mutex_unlock(&pmc_reserve_mutex);
-	}
-}
-
-/*
- * Translate a generic cache event config to a raw event code.
- */
-static int hw_perf_cache_event(u64 config, u64 *eventp)
-{
-	unsigned long type, op, result;
-	int ev;
-
-	if (!ppmu->cache_events)
-		return -EINVAL;
-
-	/* unpack config */
-	type = config & 0xff;
-	op = (config >> 8) & 0xff;
-	result = (config >> 16) & 0xff;
-
-	if (type >= PERF_COUNT_HW_CACHE_MAX ||
-	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
-	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-		return -EINVAL;
-
-	ev = (*ppmu->cache_events)[type][op][result];
-	if (ev == 0)
-		return -EOPNOTSUPP;
-	if (ev == -1)
-		return -EINVAL;
-	*eventp = ev;
-	return 0;
-}
-
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-	u64 ev;
-	unsigned long flags;
-	struct perf_counter *ctrs[MAX_HWCOUNTERS];
-	u64 events[MAX_HWCOUNTERS];
-	unsigned int cflags[MAX_HWCOUNTERS];
-	int n;
-	int err;
-	struct cpu_hw_counters *cpuhw;
-
-	if (!ppmu)
-		return ERR_PTR(-ENXIO);
-	switch (counter->attr.type) {
-	case PERF_TYPE_HARDWARE:
-		ev = counter->attr.config;
-		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return ERR_PTR(-EOPNOTSUPP);
-		ev = ppmu->generic_events[ev];
-		break;
-	case PERF_TYPE_HW_CACHE:
-		err = hw_perf_cache_event(counter->attr.config, &ev);
-		if (err)
-			return ERR_PTR(err);
-		break;
-	case PERF_TYPE_RAW:
-		ev = counter->attr.config;
-		break;
-	default:
-		return ERR_PTR(-EINVAL);
-	}
-	counter->hw.config_base = ev;
-	counter->hw.idx = 0;
-
-	/*
-	 * If we are not running on a hypervisor, force the
-	 * exclude_hv bit to 0 so that we don't care what
-	 * the user set it to.
-	 */
-	if (!firmware_has_feature(FW_FEATURE_LPAR))
-		counter->attr.exclude_hv = 0;
-
-	/*
-	 * If this is a per-task counter, then we can use
-	 * PM_RUN_* events interchangeably with their non RUN_*
-	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
-	 * XXX we should check if the task is an idle task.
-	 */
-	flags = 0;
-	if (counter->ctx->task)
-		flags |= PPMU_ONLY_COUNT_RUN;
-
-	/*
-	 * If this machine has limited counters, check whether this
-	 * event could go on a limited counter.
-	 */
-	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
-		if (can_go_on_limited_pmc(counter, ev, flags)) {
-			flags |= PPMU_LIMITED_PMC_OK;
-		} else if (ppmu->limited_pmc_event(ev)) {
-			/*
-			 * The requested event is on a limited PMC,
-			 * but we can't use a limited PMC; see if any
-			 * alternative goes on a normal PMC.
-			 */
-			ev = normal_pmc_alternative(ev, flags);
-			if (!ev)
-				return ERR_PTR(-EINVAL);
-		}
-	}
-
-	/*
-	 * If this is in a group, check if it can go on with all the
-	 * other hardware counters in the group.  We assume the counter
-	 * hasn't been linked into its leader's sibling list at this point.
-	 */
-	n = 0;
-	if (counter->group_leader != counter) {
-		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
-				   ctrs, events, cflags);
-		if (n < 0)
-			return ERR_PTR(-EINVAL);
-	}
-	events[n] = ev;
-	ctrs[n] = counter;
-	cflags[n] = flags;
-	if (check_excludes(ctrs, cflags, n, 1))
-		return ERR_PTR(-EINVAL);
-
-	cpuhw = &get_cpu_var(cpu_hw_counters);
-	err = power_check_constraints(cpuhw, events, cflags, n + 1);
-	put_cpu_var(cpu_hw_counters);
-	if (err)
-		return ERR_PTR(-EINVAL);
-
-	counter->hw.config = events[n];
-	counter->hw.counter_base = cflags[n];
-	counter->hw.last_period = counter->hw.sample_period;
-	atomic64_set(&counter->hw.period_left, counter->hw.last_period);
-
-	/*
-	 * See if we need to reserve the PMU.
-	 * If no counters are currently in use, then we have to take a
-	 * mutex to ensure that we don't race with another task doing
-	 * reserve_pmc_hardware or release_pmc_hardware.
-	 */
-	err = 0;
-	if (!atomic_inc_not_zero(&num_counters)) {
-		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&num_counters) == 0 &&
-		    reserve_pmc_hardware(perf_counter_interrupt))
-			err = -EBUSY;
-		else
-			atomic_inc(&num_counters);
-		mutex_unlock(&pmc_reserve_mutex);
-	}
-	counter->destroy = hw_perf_counter_destroy;
-
-	if (err)
-		return ERR_PTR(err);
-	return &power_pmu;
-}
-
-/*
- * A counter has overflowed; update its count and record
- * things if requested.  Note that interrupts are hard-disabled
- * here so there is no possibility of being interrupted.
- */
-static void record_and_restart(struct perf_counter *counter, unsigned long val,
-			       struct pt_regs *regs, int nmi)
-{
-	u64 period = counter->hw.sample_period;
-	s64 prev, delta, left;
-	int record = 0;
-
-	/* we don't have to worry about interrupts here */
-	prev = atomic64_read(&counter->hw.prev_count);
-	delta = (val - prev) & 0xfffffffful;
-	atomic64_add(delta, &counter->count);
-
-	/*
-	 * See if the total period for this counter has expired,
-	 * and update for the next period.
-	 */
-	val = 0;
-	left = atomic64_read(&counter->hw.period_left) - delta;
-	if (period) {
-		if (left <= 0) {
-			left += period;
-			if (left <= 0)
-				left = period;
-			record = 1;
-		}
-		if (left < 0x80000000LL)
-			val = 0x80000000LL - left;
-	}
-
-	/*
-	 * Finally record data if requested.
-	 */
-	if (record) {
-		struct perf_sample_data data = {
-			.addr	= 0,
-			.period	= counter->hw.last_period,
-		};
-
-		if (counter->attr.sample_type & PERF_SAMPLE_ADDR)
-			perf_get_data_addr(regs, &data.addr);
-
-		if (perf_counter_overflow(counter, nmi, &data, regs)) {
-			/*
-			 * Interrupts are coming too fast - throttle them
-			 * by setting the counter to 0, so it will be
-			 * at least 2^30 cycles until the next interrupt
-			 * (assuming each counter counts at most 2 counts
-			 * per cycle).
-			 */
-			val = 0;
-			left = ~0ULL >> 1;
-		}
-	}
-
-	write_pmc(counter->hw.idx, val);
-	atomic64_set(&counter->hw.prev_count, val);
-	atomic64_set(&counter->hw.period_left, left);
-	perf_counter_update_userpage(counter);
-}
-
-/*
- * Called from generic code to get the misc flags (i.e. processor mode)
- * for an event.
- */
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
-	u32 flags = perf_get_misc_flags(regs);
-
-	if (flags)
-		return flags;
-	return user_mode(regs) ? PERF_EVENT_MISC_USER :
-		PERF_EVENT_MISC_KERNEL;
-}
-
-/*
- * Called from generic code to get the instruction pointer
- * for an event.
- */
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
-	unsigned long ip;
-
-	if (TRAP(regs) != 0xf00)
-		return regs->nip;	/* not a PMU interrupt */
-
-	ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
-	return ip;
-}
-
-/*
- * Performance monitor interrupt stuff
- */
-static void perf_counter_interrupt(struct pt_regs *regs)
-{
-	int i;
-	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-	struct perf_counter *counter;
-	unsigned long val;
-	int found = 0;
-	int nmi;
-
-	if (cpuhw->n_limited)
-		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
-					mfspr(SPRN_PMC6));
-
-	perf_read_regs(regs);
-
-	nmi = perf_intr_is_nmi(regs);
-	if (nmi)
-		nmi_enter();
-	else
-		irq_enter();
-
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		counter = cpuhw->counter[i];
-		if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
-			continue;
-		val = read_pmc(counter->hw.idx);
-		if ((int)val < 0) {
-			/* counter has overflowed */
-			found = 1;
-			record_and_restart(counter, val, regs, nmi);
-		}
-	}
-
-	/*
-	 * In case we didn't find and reset the counter that caused
-	 * the interrupt, scan all counters and reset any that are
-	 * negative, to avoid getting continual interrupts.
-	 * Any that we processed in the previous loop will not be negative.
-	 */
-	if (!found) {
-		for (i = 0; i < ppmu->n_counter; ++i) {
-			if (is_limited_pmc(i + 1))
-				continue;
-			val = read_pmc(i + 1);
-			if ((int)val < 0)
-				write_pmc(i + 1, 0);
-		}
-	}
-
-	/*
-	 * Reset MMCR0 to its normal value.  This will set PMXE and
-	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
-	 * and thus allow interrupts to occur again.
-	 * XXX might want to use MSR.PM to keep the counters frozen until
-	 * we get back out of this interrupt.
-	 */
-	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
-
-	if (nmi)
-		nmi_exit();
-	else
-		irq_exit();
-}
-
-void hw_perf_counter_setup(int cpu)
-{
-	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
-
-	if (!ppmu)
-		return;
-	memset(cpuhw, 0, sizeof(*cpuhw));
-	cpuhw->mmcr[0] = MMCR0_FC;
-}
-
-int register_power_pmu(struct power_pmu *pmu)
-{
-	if (ppmu)
-		return -EBUSY;		/* something's already registered */
-
-	ppmu = pmu;
-	pr_info("%s performance monitor hardware support registered\n",
-		pmu->name);
-
-#ifdef MSR_HV
-	/*
-	 * Use FCHV to ignore kernel events if MSR.HV is set.
-	 */
-	if (mfmsr() & MSR_HV)
-		freeze_counters_kernel = MMCR0_FCHV;
-#endif /* CONFIG_PPC64 */
-
-	return 0;
-}
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
new file mode 100644
index 000000000000..c98321fcb459
--- /dev/null
+++ b/arch/powerpc/kernel/perf_event.c
@@ -0,0 +1,1315 @@
+/*
+ * Performance event support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/ptrace.h>
+
+struct cpu_hw_events {
+	int n_events;
+	int n_percpu;
+	int disabled;
+	int n_added;
+	int n_limited;
+	u8  pmcs_enabled;
+	struct perf_event *event[MAX_HWEVENTS];
+	u64 events[MAX_HWEVENTS];
+	unsigned int flags[MAX_HWEVENTS];
+	unsigned long mmcr[3];
+	struct perf_event *limited_event[MAX_LIMITED_HWEVENTS];
+	u8  limited_hwidx[MAX_LIMITED_HWEVENTS];
+	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
+	unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
+	unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
+};
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+struct power_pmu *ppmu;
+
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze events
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_events_kernel = MMCR0_FCS;
+
+/*
+ * 32-bit doesn't have MMCRA but does have an MMCR2,
+ * and a few other names are different.
+ */
+#ifdef CONFIG_PPC32
+
+#define MMCR0_FCHV		0
+#define MMCR0_PMCjCE		MMCR0_PMCnCE
+
+#define SPRN_MMCRA		SPRN_MMCR2
+#define MMCRA_SAMPLE_ENABLE	0
+
+static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
+static inline u32 perf_get_misc_flags(struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void perf_read_regs(struct pt_regs *regs) { }
+static inline int perf_intr_is_nmi(struct pt_regs *regs)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PPC32 */
+
+/*
+ * Things that are specific to 64-bit implementations.
+ */
+#ifdef CONFIG_PPC64
+
+static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
+{
+	unsigned long mmcra = regs->dsisr;
+
+	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+		if (slot > 1)
+			return 4 * (slot - 1);
+	}
+	return 0;
+}
+
+/*
+ * The user wants a data address recorded.
+ * If we're not doing instruction sampling, give them the SDAR
+ * (sampled data address).  If we are doing instruction sampling, then
+ * only give them the SDAR if it corresponds to the instruction
+ * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
+ * bit in MMCRA.
+ */
+static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
+{
+	unsigned long mmcra = regs->dsisr;
+	unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+		POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+
+	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+		*addrp = mfspr(SPRN_SDAR);
+}
+
+static inline u32 perf_get_misc_flags(struct pt_regs *regs)
+{
+	unsigned long mmcra = regs->dsisr;
+
+	if (TRAP(regs) != 0xf00)
+		return 0;	/* not a PMU interrupt */
+
+	if (ppmu->flags & PPMU_ALT_SIPR) {
+		if (mmcra & POWER6_MMCRA_SIHV)
+			return PERF_RECORD_MISC_HYPERVISOR;
+		return (mmcra & POWER6_MMCRA_SIPR) ?
+			PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL;
+	}
+	if (mmcra & MMCRA_SIHV)
+		return PERF_RECORD_MISC_HYPERVISOR;
+	return (mmcra & MMCRA_SIPR) ? PERF_RECORD_MISC_USER :
+		PERF_RECORD_MISC_KERNEL;
+}
+
+/*
+ * Overload regs->dsisr to store MMCRA so we only need to read it once
+ * on each interrupt.
+ */
+static inline void perf_read_regs(struct pt_regs *regs)
+{
+	regs->dsisr = mfspr(SPRN_MMCRA);
+}
+
+/*
+ * If interrupts were soft-disabled when a PMU interrupt occurs, treat
+ * it as an NMI.
+ */
+static inline int perf_intr_is_nmi(struct pt_regs *regs)
+{
+	return !regs->softe;
+}
+
+#endif /* CONFIG_PPC64 */
+
+static void perf_event_interrupt(struct pt_regs *regs);
+
+void perf_event_print_debug(void)
+{
+}
+
+/*
+ * Read one performance monitor event (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+	unsigned long val;
+
+	switch (idx) {
+	case 1:
+		val = mfspr(SPRN_PMC1);
+		break;
+	case 2:
+		val = mfspr(SPRN_PMC2);
+		break;
+	case 3:
+		val = mfspr(SPRN_PMC3);
+		break;
+	case 4:
+		val = mfspr(SPRN_PMC4);
+		break;
+	case 5:
+		val = mfspr(SPRN_PMC5);
+		break;
+	case 6:
+		val = mfspr(SPRN_PMC6);
+		break;
+#ifdef CONFIG_PPC64
+	case 7:
+		val = mfspr(SPRN_PMC7);
+		break;
+	case 8:
+		val = mfspr(SPRN_PMC8);
+		break;
+#endif /* CONFIG_PPC64 */
+	default:
+		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+		val = 0;
+	}
+	return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+	switch (idx) {
+	case 1:
+		mtspr(SPRN_PMC1, val);
+		break;
+	case 2:
+		mtspr(SPRN_PMC2, val);
+		break;
+	case 3:
+		mtspr(SPRN_PMC3, val);
+		break;
+	case 4:
+		mtspr(SPRN_PMC4, val);
+		break;
+	case 5:
+		mtspr(SPRN_PMC5, val);
+		break;
+	case 6:
+		mtspr(SPRN_PMC6, val);
+		break;
+#ifdef CONFIG_PPC64
+	case 7:
+		mtspr(SPRN_PMC7, val);
+		break;
+	case 8:
+		mtspr(SPRN_PMC8, val);
+		break;
+#endif /* CONFIG_PPC64 */
+	default:
+		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+	}
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event_id[].
+ */
+static int power_check_constraints(struct cpu_hw_events *cpuhw,
+				   u64 event_id[], unsigned int cflags[],
+				   int n_ev)
+{
+	unsigned long mask, value, nv;
+	unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
+	int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
+	int i, j;
+	unsigned long addf = ppmu->add_fields;
+	unsigned long tadd = ppmu->test_adder;
+
+	if (n_ev > ppmu->n_event)
+		return -1;
+
+	/* First see if the events will go on as-is */
+	for (i = 0; i < n_ev; ++i) {
+		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+		    && !ppmu->limited_pmc_event(event_id[i])) {
+			ppmu->get_alternatives(event_id[i], cflags[i],
+					       cpuhw->alternatives[i]);
+			event_id[i] = cpuhw->alternatives[i][0];
+		}
+		if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
+					 &cpuhw->avalues[i][0]))
+			return -1;
+	}
+	value = mask = 0;
+	for (i = 0; i < n_ev; ++i) {
+		nv = (value | cpuhw->avalues[i][0]) +
+			(value & cpuhw->avalues[i][0] & addf);
+		if ((((nv + tadd) ^ value) & mask) != 0 ||
+		    (((nv + tadd) ^ cpuhw->avalues[i][0]) &
+		     cpuhw->amasks[i][0]) != 0)
+			break;
+		value = nv;
+		mask |= cpuhw->amasks[i][0];
+	}
+	if (i == n_ev)
+		return 0;	/* all OK */
+
+	/* doesn't work, gather alternatives... */
+	if (!ppmu->get_alternatives)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		choice[i] = 0;
+		n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
+						  cpuhw->alternatives[i]);
+		for (j = 1; j < n_alt[i]; ++j)
+			ppmu->get_constraint(cpuhw->alternatives[i][j],
+					     &cpuhw->amasks[i][j],
+					     &cpuhw->avalues[i][j]);
+	}
+
+	/* enumerate all possibilities and see if any will work */
+	i = 0;
+	j = -1;
+	value = mask = nv = 0;
+	while (i < n_ev) {
+		if (j >= 0) {
+			/* we're backtracking, restore context */
+			value = svalues[i];
+			mask = smasks[i];
+			j = choice[i];
+		}
+		/*
+		 * See if any alternative k for event_id i,
+		 * where k > j, will satisfy the constraints.
+		 */
+		while (++j < n_alt[i]) {
+			nv = (value | cpuhw->avalues[i][j]) +
+				(value & cpuhw->avalues[i][j] & addf);
+			if ((((nv + tadd) ^ value) & mask) == 0 &&
+			    (((nv + tadd) ^ cpuhw->avalues[i][j])
+			     & cpuhw->amasks[i][j]) == 0)
+				break;
+		}
+		if (j >= n_alt[i]) {
+			/*
+			 * No feasible alternative, backtrack
+			 * to event_id i-1 and continue enumerating its
+			 * alternatives from where we got up to.
+			 */
+			if (--i < 0)
+				return -1;
+		} else {
+			/*
+			 * Found a feasible alternative for event_id i,
+			 * remember where we got up to with this event_id,
+			 * go on to the next event_id, and start with
+			 * the first alternative for it.
+			 */
+			choice[i] = j;
+			svalues[i] = value;
+			smasks[i] = mask;
+			value = nv;
+			mask |= cpuhw->amasks[i][j];
+			++i;
+			j = -1;
+		}
+	}
+
+	/* OK, we have a feasible combination, tell the caller the solution */
+	for (i = 0; i < n_ev; ++i)
+		event_id[i] = cpuhw->alternatives[i][choice[i]];
+	return 0;
+}
+
+/*
+ * Check if newly-added events have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added events.
+ */
+static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
+			  int n_prev, int n_new)
+{
+	int eu = 0, ek = 0, eh = 0;
+	int i, n, first;
+	struct perf_event *event;
+
+	n = n_prev + n_new;
+	if (n <= 1)
+		return 0;
+
+	first = 1;
+	for (i = 0; i < n; ++i) {
+		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+			continue;
+		}
+		event = ctrs[i];
+		if (first) {
+			eu = event->attr.exclude_user;
+			ek = event->attr.exclude_kernel;
+			eh = event->attr.exclude_hv;
+			first = 0;
+		} else if (event->attr.exclude_user != eu ||
+			   event->attr.exclude_kernel != ek ||
+			   event->attr.exclude_hv != eh) {
+			return -EAGAIN;
+		}
+	}
+
+	if (eu || ek || eh)
+		for (i = 0; i < n; ++i)
+			if (cflags[i] & PPMU_LIMITED_PMC_OK)
+				cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
+	return 0;
+}
+
+static void power_pmu_read(struct perf_event *event)
+{
+	s64 val, delta, prev;
+
+	if (!event->hw.idx)
+		return;
+	/*
+	 * Performance monitor interrupts come even when interrupts
+	 * are soft-disabled, as long as interrupts are hard-enabled.
+	 * Therefore we treat them like NMIs.
+	 */
+	do {
+		prev = atomic64_read(&event->hw.prev_count);
+		barrier();
+		val = read_pmc(event->hw.idx);
+	} while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
+
+	/* The events are only 32 bits wide */
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &event->count);
+	atomic64_sub(delta, &event->hw.period_left);
+}
+
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us if `event' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+		&& (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_events(struct cpu_hw_events *cpuhw,
+				    unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_event *event;
+	u64 val, prev, delta;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		event = cpuhw->limited_event[i];
+		if (!event->hw.idx)
+			continue;
+		val = (event->hw.idx == 5) ? pmc5 : pmc6;
+		prev = atomic64_read(&event->hw.prev_count);
+		event->hw.idx = 0;
+		delta = (val - prev) & 0xfffffffful;
+		atomic64_add(delta, &event->count);
+	}
+}
+
+static void thaw_limited_events(struct cpu_hw_events *cpuhw,
+				  unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_event *event;
+	u64 val;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		event = cpuhw->limited_event[i];
+		event->hw.idx = cpuhw->limited_hwidx[i];
+		val = (event->hw.idx == 5) ? pmc5 : pmc6;
+		atomic64_set(&event->hw.prev_count, val);
+		perf_event_update_userpage(event);
+	}
+}
+
+/*
+ * Since limited events don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other events.  We try to keep the values from the limited
+ * events as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited events as small and consistent as possible.
+ * Therefore, if any limited events are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
+{
+	unsigned long pmc5, pmc6;
+
+	if (!cpuhw->n_limited) {
+		mtspr(SPRN_MMCR0, mmcr0);
+		return;
+	}
+
+	/*
+	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 * To ensure we don't get a performance monitor interrupt
+	 * between writing MMCR0 and freezing/thawing the limited
+	 * events, we first write MMCR0 with the event overflow
+	 * interrupt enable bits turned off.
+	 */
+	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+		     : "=&r" (pmc5), "=&r" (pmc6)
+		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+		       "i" (SPRN_MMCR0),
+		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+	if (mmcr0 & MMCR0_FC)
+		freeze_limited_events(cpuhw, pmc5, pmc6);
+	else
+		thaw_limited_events(cpuhw, pmc5, pmc6);
+
+	/*
+	 * Write the full MMCR0 including the event overflow interrupt
+	 * enable bits, if necessary.
+	 */
+	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+		mtspr(SPRN_MMCR0, mmcr0);
+}
+
+/*
+ * Disable all events to prevent PMU interrupts and to allow
+ * events to be added or removed.
+ */
+void hw_perf_disable(void)
+{
+	struct cpu_hw_events *cpuhw;
+	unsigned long flags;
+
+	if (!ppmu)
+		return;
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_events);
+
+	if (!cpuhw->disabled) {
+		cpuhw->disabled = 1;
+		cpuhw->n_added = 0;
+
+		/*
+		 * Check if we ever enabled the PMU on this cpu.
+		 */
+		if (!cpuhw->pmcs_enabled) {
+			ppc_enable_pmcs();
+			cpuhw->pmcs_enabled = 1;
+		}
+
+		/*
+		 * Disable instruction sampling if it was enabled
+		 */
+		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+			mtspr(SPRN_MMCRA,
+			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+			mb();
+		}
+
+		/*
+		 * Set the 'freeze events' bit.
+		 * The barrier is to make sure the mtspr has been
+		 * executed and the PMU has frozen the events
+		 * before we return.
+		 */
+		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		mb();
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Re-enable all events if disable == 0.
+ * If we were previously disabled and events were added, then
+ * put the new config on the PMU.
+ */
+void hw_perf_enable(void)
+{
+	struct perf_event *event;
+	struct cpu_hw_events *cpuhw;
+	unsigned long flags;
+	long i;
+	unsigned long val;
+	s64 left;
+	unsigned int hwc_index[MAX_HWEVENTS];
+	int n_lim;
+	int idx;
+
+	if (!ppmu)
+		return;
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_events);
+	if (!cpuhw->disabled) {
+		local_irq_restore(flags);
+		return;
+	}
+	cpuhw->disabled = 0;
+
+	/*
+	 * If we didn't change anything, or only removed events,
+	 * no need to recalculate MMCR* settings and reset the PMCs.
+	 * Just reenable the PMU with the current MMCR* settings
+	 * (possibly updated for removal of events).
+	 */
+	if (!cpuhw->n_added) {
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+		if (cpuhw->n_events == 0)
+			ppc_set_pmu_inuse(0);
+		goto out_enable;
+	}
+
+	/*
+	 * Compute MMCR* values for the new set of events
+	 */
+	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
+			       cpuhw->mmcr)) {
+		/* shouldn't ever get here */
+		printk(KERN_ERR "oops compute_mmcr failed\n");
+		goto out;
+	}
+
+	/*
+	 * Add in MMCR0 freeze bits corresponding to the
+	 * attr.exclude_* bits for the first event.
+	 * We have already checked that all events have the
+	 * same values for these bits as the first event.
+	 */
+	event = cpuhw->event[0];
+	if (event->attr.exclude_user)
+		cpuhw->mmcr[0] |= MMCR0_FCP;
+	if (event->attr.exclude_kernel)
+		cpuhw->mmcr[0] |= freeze_events_kernel;
+	if (event->attr.exclude_hv)
+		cpuhw->mmcr[0] |= MMCR0_FCHV;
+
+	/*
+	 * Write the new configuration to MMCR* with the freeze
+	 * bit set and set the hardware events to their initial values.
+	 * Then unfreeze the events.
+	 */
+	ppc_set_pmu_inuse(1);
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+				| MMCR0_FC);
+
+	/*
+	 * Read off any pre-existing events that need to move
+	 * to another PMC.
+	 */
+	for (i = 0; i < cpuhw->n_events; ++i) {
+		event = cpuhw->event[i];
+		if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
+			power_pmu_read(event);
+			write_pmc(event->hw.idx, 0);
+			event->hw.idx = 0;
+		}
+	}
+
+	/*
+	 * Initialize the PMCs for all the new and moved events.
+	 */
+	cpuhw->n_limited = n_lim = 0;
+	for (i = 0; i < cpuhw->n_events; ++i) {
+		event = cpuhw->event[i];
+		if (event->hw.idx)
+			continue;
+		idx = hwc_index[i] + 1;
+		if (is_limited_pmc(idx)) {
+			cpuhw->limited_event[n_lim] = event;
+			cpuhw->limited_hwidx[n_lim] = idx;
+			++n_lim;
+			continue;
+		}
+		val = 0;
+		if (event->hw.sample_period) {
+			left = atomic64_read(&event->hw.period_left);
+			if (left < 0x80000000L)
+				val = 0x80000000L - left;
+		}
+		atomic64_set(&event->hw.prev_count, val);
+		event->hw.idx = idx;
+		write_pmc(idx, val);
+		perf_event_update_userpage(event);
+	}
+	cpuhw->n_limited = n_lim;
+	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+	mb();
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+	/*
+	 * Enable instruction sampling if necessary
+	 */
+	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+		mb();
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	}
+
+ out:
+	local_irq_restore(flags);
+}
+
+static int collect_events(struct perf_event *group, int max_count,
+			  struct perf_event *ctrs[], u64 *events,
+			  unsigned int *flags)
+{
+	int n = 0;
+	struct perf_event *event;
+
+	if (!is_software_event(group)) {
+		if (n >= max_count)
+			return -1;
+		ctrs[n] = group;
+		flags[n] = group->hw.event_base;
+		events[n++] = group->hw.config;
+	}
+	list_for_each_entry(event, &group->sibling_list, list_entry) {
+		if (!is_software_event(event) &&
+		    event->state != PERF_EVENT_STATE_OFF) {
+			if (n >= max_count)
+				return -1;
+			ctrs[n] = event;
+			flags[n] = event->hw.event_base;
+			events[n++] = event->hw.config;
+		}
+	}
+	return n;
+}
+
+static void event_sched_in(struct perf_event *event, int cpu)
+{
+	event->state = PERF_EVENT_STATE_ACTIVE;
+	event->oncpu = cpu;
+	event->tstamp_running += event->ctx->time - event->tstamp_stopped;
+	if (is_software_event(event))
+		event->pmu->enable(event);
+}
+
+/*
+ * Called to enable a whole group of events.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_event *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx, int cpu)
+{
+	struct cpu_hw_events *cpuhw;
+	long i, n, n0;
+	struct perf_event *sub;
+
+	if (!ppmu)
+		return 0;
+	cpuhw = &__get_cpu_var(cpu_hw_events);
+	n0 = cpuhw->n_events;
+	n = collect_events(group_leader, ppmu->n_event - n0,
+			   &cpuhw->event[n0], &cpuhw->events[n0],
+			   &cpuhw->flags[n0]);
+	if (n < 0)
+		return -EAGAIN;
+	if (check_excludes(cpuhw->event, cpuhw->flags, n0, n))
+		return -EAGAIN;
+	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0);
+	if (i < 0)
+		return -EAGAIN;
+	cpuhw->n_events = n0 + n;
+	cpuhw->n_added += n;
+
+	/*
+	 * OK, this group can go on; update event states etc.,
+	 * and enable any software events
+	 */
+	for (i = n0; i < n0 + n; ++i)
+		cpuhw->event[i]->hw.config = cpuhw->events[i];
+	cpuctx->active_oncpu += n;
+	n = 1;
+	event_sched_in(group_leader, cpu);
+	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
+		if (sub->state != PERF_EVENT_STATE_OFF) {
+			event_sched_in(sub, cpu);
+			++n;
+		}
+	}
+	ctx->nr_active += n;
+
+	return 1;
+}
+
+/*
+ * Add a event to the PMU.
+ * If all events are not already frozen, then we disable and
+ * re-enable the PMU in order to get hw_perf_enable to do the
+ * actual work of reconfiguring the PMU.
+ */
+static int power_pmu_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuhw;
+	unsigned long flags;
+	int n0;
+	int ret = -EAGAIN;
+
+	local_irq_save(flags);
+	perf_disable();
+
+	/*
+	 * Add the event to the list (if there is room)
+	 * and check whether the total set is still feasible.
+	 */
+	cpuhw = &__get_cpu_var(cpu_hw_events);
+	n0 = cpuhw->n_events;
+	if (n0 >= ppmu->n_event)
+		goto out;
+	cpuhw->event[n0] = event;
+	cpuhw->events[n0] = event->hw.config;
+	cpuhw->flags[n0] = event->hw.event_base;
+	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
+		goto out;
+	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
+		goto out;
+
+	event->hw.config = cpuhw->events[n0];
+	++cpuhw->n_events;
+	++cpuhw->n_added;
+
+	ret = 0;
+ out:
+	perf_enable();
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Remove a event from the PMU.
+ */
+static void power_pmu_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuhw;
+	long i;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	perf_disable();
+
+	power_pmu_read(event);
+
+	cpuhw = &__get_cpu_var(cpu_hw_events);
+	for (i = 0; i < cpuhw->n_events; ++i) {
+		if (event == cpuhw->event[i]) {
+			while (++i < cpuhw->n_events)
+				cpuhw->event[i-1] = cpuhw->event[i];
+			--cpuhw->n_events;
+			ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
+			if (event->hw.idx) {
+				write_pmc(event->hw.idx, 0);
+				event->hw.idx = 0;
+			}
+			perf_event_update_userpage(event);
+			break;
+		}
+	}
+	for (i = 0; i < cpuhw->n_limited; ++i)
+		if (event == cpuhw->limited_event[i])
+			break;
+	if (i < cpuhw->n_limited) {
+		while (++i < cpuhw->n_limited) {
+			cpuhw->limited_event[i-1] = cpuhw->limited_event[i];
+			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+		}
+		--cpuhw->n_limited;
+	}
+	if (cpuhw->n_events == 0) {
+		/* disable exceptions if no events are running */
+		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+	}
+
+	perf_enable();
+	local_irq_restore(flags);
+}
+
+/*
+ * Re-enable interrupts on a event after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_event *event)
+{
+	s64 val, left;
+	unsigned long flags;
+
+	if (!event->hw.idx || !event->hw.sample_period)
+		return;
+	local_irq_save(flags);
+	perf_disable();
+	power_pmu_read(event);
+	left = event->hw.sample_period;
+	event->hw.last_period = left;
+	val = 0;
+	if (left < 0x80000000L)
+		val = 0x80000000L - left;
+	write_pmc(event->hw.idx, val);
+	atomic64_set(&event->hw.prev_count, val);
+	atomic64_set(&event->hw.period_left, left);
+	perf_event_update_userpage(event);
+	perf_enable();
+	local_irq_restore(flags);
+}
+
+struct pmu power_pmu = {
+	.enable		= power_pmu_enable,
+	.disable	= power_pmu_disable,
+	.read		= power_pmu_read,
+	.unthrottle	= power_pmu_unthrottle,
+};
+
+/*
+ * Return 1 if we might be able to put event on a limited PMC,
+ * or 0 if not.
+ * A event can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
+				 unsigned int flags)
+{
+	int n;
+	u64 alt[MAX_EVENT_ALTERNATIVES];
+
+	if (event->attr.exclude_user
+	    || event->attr.exclude_kernel
+	    || event->attr.exclude_hv
+	    || event->attr.sample_period)
+		return 0;
+
+	if (ppmu->limited_pmc_event(ev))
+		return 1;
+
+	/*
+	 * The requested event_id isn't on a limited PMC already;
+	 * see if any alternative code goes on a limited PMC.
+	 */
+	if (!ppmu->get_alternatives)
+		return 0;
+
+	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+	n = ppmu->get_alternatives(ev, flags, alt);
+
+	return n > 0;
+}
+
+/*
+ * Find an alternative event_id that goes on a normal PMC, if possible,
+ * and return the event_id code, or 0 if there is no such alternative.
+ * (Note: event_id code 0 is "don't count" on all machines.)
+ */
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
+{
+	u64 alt[MAX_EVENT_ALTERNATIVES];
+	int n;
+
+	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (!n)
+		return 0;
+	return alt[0];
+}
+
+/* Number of perf_events counting hardware events */
+static atomic_t num_events;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_event.
+ */
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+	if (!atomic_add_unless(&num_events, -1, 1)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_dec_return(&num_events) == 0)
+			release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
+/*
+ * Translate a generic cache event_id config to a raw event_id code.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+	unsigned long type, op, result;
+	int ev;
+
+	if (!ppmu->cache_events)
+		return -EINVAL;
+
+	/* unpack config */
+	type = config & 0xff;
+	op = (config >> 8) & 0xff;
+	result = (config >> 16) & 0xff;
+
+	if (type >= PERF_COUNT_HW_CACHE_MAX ||
+	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ev = (*ppmu->cache_events)[type][op][result];
+	if (ev == 0)
+		return -EOPNOTSUPP;
+	if (ev == -1)
+		return -EINVAL;
+	*eventp = ev;
+	return 0;
+}
+
+const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+	u64 ev;
+	unsigned long flags;
+	struct perf_event *ctrs[MAX_HWEVENTS];
+	u64 events[MAX_HWEVENTS];
+	unsigned int cflags[MAX_HWEVENTS];
+	int n;
+	int err;
+	struct cpu_hw_events *cpuhw;
+
+	if (!ppmu)
+		return ERR_PTR(-ENXIO);
+	switch (event->attr.type) {
+	case PERF_TYPE_HARDWARE:
+		ev = event->attr.config;
+		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
+			return ERR_PTR(-EOPNOTSUPP);
+		ev = ppmu->generic_events[ev];
+		break;
+	case PERF_TYPE_HW_CACHE:
+		err = hw_perf_cache_event(event->attr.config, &ev);
+		if (err)
+			return ERR_PTR(err);
+		break;
+	case PERF_TYPE_RAW:
+		ev = event->attr.config;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	event->hw.config_base = ev;
+	event->hw.idx = 0;
+
+	/*
+	 * If we are not running on a hypervisor, force the
+	 * exclude_hv bit to 0 so that we don't care what
+	 * the user set it to.
+	 */
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		event->attr.exclude_hv = 0;
+
+	/*
+	 * If this is a per-task event, then we can use
+	 * PM_RUN_* events interchangeably with their non RUN_*
+	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+	 * XXX we should check if the task is an idle task.
+	 */
+	flags = 0;
+	if (event->ctx->task)
+		flags |= PPMU_ONLY_COUNT_RUN;
+
+	/*
+	 * If this machine has limited events, check whether this
+	 * event_id could go on a limited event.
+	 */
+	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
+		if (can_go_on_limited_pmc(event, ev, flags)) {
+			flags |= PPMU_LIMITED_PMC_OK;
+		} else if (ppmu->limited_pmc_event(ev)) {
+			/*
+			 * The requested event_id is on a limited PMC,
+			 * but we can't use a limited PMC; see if any
+			 * alternative goes on a normal PMC.
+			 */
+			ev = normal_pmc_alternative(ev, flags);
+			if (!ev)
+				return ERR_PTR(-EINVAL);
+		}
+	}
+
+	/*
+	 * If this is in a group, check if it can go on with all the
+	 * other hardware events in the group.  We assume the event
+	 * hasn't been linked into its leader's sibling list at this point.
+	 */
+	n = 0;
+	if (event->group_leader != event) {
+		n = collect_events(event->group_leader, ppmu->n_event - 1,
+				   ctrs, events, cflags);
+		if (n < 0)
+			return ERR_PTR(-EINVAL);
+	}
+	events[n] = ev;
+	ctrs[n] = event;
+	cflags[n] = flags;
+	if (check_excludes(ctrs, cflags, n, 1))
+		return ERR_PTR(-EINVAL);
+
+	cpuhw = &get_cpu_var(cpu_hw_events);
+	err = power_check_constraints(cpuhw, events, cflags, n + 1);
+	put_cpu_var(cpu_hw_events);
+	if (err)
+		return ERR_PTR(-EINVAL);
+
+	event->hw.config = events[n];
+	event->hw.event_base = cflags[n];
+	event->hw.last_period = event->hw.sample_period;
+	atomic64_set(&event->hw.period_left, event->hw.last_period);
+
+	/*
+	 * See if we need to reserve the PMU.
+	 * If no events are currently in use, then we have to take a
+	 * mutex to ensure that we don't race with another task doing
+	 * reserve_pmc_hardware or release_pmc_hardware.
+	 */
+	err = 0;
+	if (!atomic_inc_not_zero(&num_events)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_events) == 0 &&
+		    reserve_pmc_hardware(perf_event_interrupt))
+			err = -EBUSY;
+		else
+			atomic_inc(&num_events);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	event->destroy = hw_perf_event_destroy;
+
+	if (err)
+		return ERR_PTR(err);
+	return &power_pmu;
+}
+
+/*
+ * A event has overflowed; update its count and record
+ * things if requested.  Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ */
+static void record_and_restart(struct perf_event *event, unsigned long val,
+			       struct pt_regs *regs, int nmi)
+{
+	u64 period = event->hw.sample_period;
+	s64 prev, delta, left;
+	int record = 0;
+
+	/* we don't have to worry about interrupts here */
+	prev = atomic64_read(&event->hw.prev_count);
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &event->count);
+
+	/*
+	 * See if the total period for this event has expired,
+	 * and update for the next period.
+	 */
+	val = 0;
+	left = atomic64_read(&event->hw.period_left) - delta;
+	if (period) {
+		if (left <= 0) {
+			left += period;
+			if (left <= 0)
+				left = period;
+			record = 1;
+		}
+		if (left < 0x80000000LL)
+			val = 0x80000000LL - left;
+	}
+
+	/*
+	 * Finally record data if requested.
+	 */
+	if (record) {
+		struct perf_sample_data data = {
+			.addr	= 0,
+			.period	= event->hw.last_period,
+		};
+
+		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+			perf_get_data_addr(regs, &data.addr);
+
+		if (perf_event_overflow(event, nmi, &data, regs)) {
+			/*
+			 * Interrupts are coming too fast - throttle them
+			 * by setting the event to 0, so it will be
+			 * at least 2^30 cycles until the next interrupt
+			 * (assuming each event counts at most 2 counts
+			 * per cycle).
+			 */
+			val = 0;
+			left = ~0ULL >> 1;
+		}
+	}
+
+	write_pmc(event->hw.idx, val);
+	atomic64_set(&event->hw.prev_count, val);
+	atomic64_set(&event->hw.period_left, left);
+	perf_event_update_userpage(event);
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event_id.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	u32 flags = perf_get_misc_flags(regs);
+
+	if (flags)
+		return flags;
+	return user_mode(regs) ? PERF_RECORD_MISC_USER :
+		PERF_RECORD_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event_id.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long ip;
+
+	if (TRAP(regs) != 0xf00)
+		return regs->nip;	/* not a PMU interrupt */
+
+	ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
+	return ip;
+}
+
+/*
+ * Performance monitor interrupt stuff
+ */
+static void perf_event_interrupt(struct pt_regs *regs)
+{
+	int i;
+	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+	struct perf_event *event;
+	unsigned long val;
+	int found = 0;
+	int nmi;
+
+	if (cpuhw->n_limited)
+		freeze_limited_events(cpuhw, mfspr(SPRN_PMC5),
+					mfspr(SPRN_PMC6));
+
+	perf_read_regs(regs);
+
+	nmi = perf_intr_is_nmi(regs);
+	if (nmi)
+		nmi_enter();
+	else
+		irq_enter();
+
+	for (i = 0; i < cpuhw->n_events; ++i) {
+		event = cpuhw->event[i];
+		if (!event->hw.idx || is_limited_pmc(event->hw.idx))
+			continue;
+		val = read_pmc(event->hw.idx);
+		if ((int)val < 0) {
+			/* event has overflowed */
+			found = 1;
+			record_and_restart(event, val, regs, nmi);
+		}
+	}
+
+	/*
+	 * In case we didn't find and reset the event that caused
+	 * the interrupt, scan all events and reset any that are
+	 * negative, to avoid getting continual interrupts.
+	 * Any that we processed in the previous loop will not be negative.
+	 */
+	if (!found) {
+		for (i = 0; i < ppmu->n_event; ++i) {
+			if (is_limited_pmc(i + 1))
+				continue;
+			val = read_pmc(i + 1);
+			if ((int)val < 0)
+				write_pmc(i + 1, 0);
+		}
+	}
+
+	/*
+	 * Reset MMCR0 to its normal value.  This will set PMXE and
+	 * clear FC (freeze events) and PMAO (perf mon alert occurred)
+	 * and thus allow interrupts to occur again.
+	 * XXX might want to use MSR.PM to keep the events frozen until
+	 * we get back out of this interrupt.
+	 */
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+	if (nmi)
+		nmi_exit();
+	else
+		irq_exit();
+}
+
+void hw_perf_event_setup(int cpu)
+{
+	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+	if (!ppmu)
+		return;
+	memset(cpuhw, 0, sizeof(*cpuhw));
+	cpuhw->mmcr[0] = MMCR0_FC;
+}
+
+int register_power_pmu(struct power_pmu *pmu)
+{
+	if (ppmu)
+		return -EBUSY;		/* something's already registered */
+
+	ppmu = pmu;
+	pr_info("%s performance monitor hardware support registered\n",
+		pmu->name);
+
+#ifdef MSR_HV
+	/*
+	 * Use FCHV to ignore kernel events if MSR.HV is set.
+	 */
+	if (mfmsr() & MSR_HV)
+		freeze_events_kernel = MMCR0_FCHV;
+#endif /* CONFIG_PPC64 */
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 3c90a3d9173e..2a361cdda635 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/string.h>
 #include <asm/reg.h>
 #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 31918af3e355..0f4c1c73a6ad 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/string.h>
 #include <asm/reg.h>
 #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 867f6f663963..c351b3a57fbb 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/string.h>
 #include <asm/reg.h>
 #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index fa21890531da..ca399ba5034c 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/string.h>
 #include <asm/reg.h>
 #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 018d094d92f9..28a4daacdc02 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/string.h>
 #include <asm/reg.h>
 #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 75dccb71a043..479574413a93 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/string.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <asm/reg.h>
 #include <asm/cputable.h>
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 465e498bcb33..df45a7449a66 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
 #include <linux/delay.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -527,25 +527,25 @@ void __init iSeries_time_init_early(void)
 }
 #endif /* CONFIG_PPC_ISERIES */
 
-#if defined(CONFIG_PERF_COUNTERS) && defined(CONFIG_PPC32)
-DEFINE_PER_CPU(u8, perf_counter_pending);
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32)
+DEFINE_PER_CPU(u8, perf_event_pending);
 
-void set_perf_counter_pending(void)
+void set_perf_event_pending(void)
 {
-	get_cpu_var(perf_counter_pending) = 1;
+	get_cpu_var(perf_event_pending) = 1;
 	set_dec(1);
-	put_cpu_var(perf_counter_pending);
+	put_cpu_var(perf_event_pending);
 }
 
-#define test_perf_counter_pending()	__get_cpu_var(perf_counter_pending)
-#define clear_perf_counter_pending()	__get_cpu_var(perf_counter_pending) = 0
+#define test_perf_event_pending()	__get_cpu_var(perf_event_pending)
+#define clear_perf_event_pending()	__get_cpu_var(perf_event_pending) = 0
 
-#else  /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */
+#else  /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
 
-#define test_perf_counter_pending()	0
-#define clear_perf_counter_pending()
+#define test_perf_event_pending()	0
+#define clear_perf_event_pending()
 
-#endif /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */
+#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
 
 /*
  * For iSeries shared processors, we have to let the hypervisor
@@ -573,9 +573,9 @@ void timer_interrupt(struct pt_regs * regs)
 	set_dec(DECREMENTER_MAX);
 
 #ifdef CONFIG_PPC32
-	if (test_perf_counter_pending()) {
-		clear_perf_counter_pending();
-		perf_counter_do_pending();
+	if (test_perf_event_pending()) {
+		clear_perf_event_pending();
+		perf_event_do_pending();
 	}
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
 		do_IRQ(regs);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 830bef0a1131..e7dae82c1285 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,7 +29,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,7 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
@@ -323,7 +323,7 @@ good_area:
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9efc8bda01b4..e382cae678b8 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -280,9 +280,9 @@ config PPC_HAVE_PMU_SUPPORT
 
 config PPC_PERF_CTRS
        def_bool y
-       depends on PERF_COUNTERS && PPC_HAVE_PMU_SUPPORT
+       depends on PERF_EVENTS && PPC_HAVE_PMU_SUPPORT
        help
-         This enables the powerpc-specific perf_counter back-end.
+         This enables the powerpc-specific perf_event back-end.
 
 config SMP
 	depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 1c866efd217d..43c0acad7160 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -94,7 +94,7 @@ config S390
 	select HAVE_KVM if 64BIT
 	select HAVE_ARCH_TRACEHOOK
 	select INIT_ALL_POSSIBLE
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_EVENTS
 
 config SCHED_OMIT_FRAME_POINTER
 	bool
diff --git a/arch/s390/include/asm/perf_counter.h b/arch/s390/include/asm/perf_counter.h
deleted file mode 100644
index 7015188c2cc2..000000000000
--- a/arch/s390/include/asm/perf_counter.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * Performance counter support - s390 specific definitions.
- *
- * Copyright 2009 Martin Schwidefsky, IBM Corporation.
- */
-
-static inline void set_perf_counter_pending(void) {}
-static inline void clear_perf_counter_pending(void) {}
-
-#define PERF_COUNTER_INDEX_OFFSET 0
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
new file mode 100644
index 000000000000..3840cbe77637
--- /dev/null
+++ b/arch/s390/include/asm/perf_event.h
@@ -0,0 +1,10 @@
+/*
+ * Performance event support - s390 specific definitions.
+ *
+ * Copyright 2009 Martin Schwidefsky, IBM Corporation.
+ */
+
+static inline void set_perf_event_pending(void) {}
+static inline void clear_perf_event_pending(void) {}
+
+#define PERF_EVENT_INDEX_OFFSET 0
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index c80602d7c880..cb5232df151e 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -268,7 +268,7 @@
 #define	__NR_preadv		328
 #define	__NR_pwritev		329
 #define __NR_rt_tgsigqueueinfo	330
-#define __NR_perf_counter_open	331
+#define __NR_perf_event_open	331
 #define NR_syscalls 332
 
 /* 
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index 88a83366819f..624790042d41 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -1832,11 +1832,11 @@ compat_sys_rt_tgsigqueueinfo_wrapper:
 	llgtr	%r5,%r5			# struct compat_siginfo *
 	jg	compat_sys_rt_tgsigqueueinfo_wrapper # branch to system call
 
-	.globl	sys_perf_counter_open_wrapper
-sys_perf_counter_open_wrapper:
-	llgtr	%r2,%r2			# const struct perf_counter_attr *
+	.globl	sys_perf_event_open_wrapper
+sys_perf_event_open_wrapper:
+	llgtr	%r2,%r2			# const struct perf_event_attr *
 	lgfr	%r3,%r3			# pid_t
 	lgfr	%r4,%r4			# int
 	lgfr	%r5,%r5			# int
 	llgfr	%r6,%r6			# unsigned long
-	jg	sys_perf_counter_open	# branch to system call
+	jg	sys_perf_event_open	# branch to system call
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index ad1acd200385..0b5083681e77 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -339,4 +339,4 @@ SYSCALL(sys_epoll_create1,sys_epoll_create1,sys_epoll_create1_wrapper)
 SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper)
 SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper)
 SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */
-SYSCALL(sys_perf_counter_open,sys_perf_counter_open,sys_perf_counter_open_wrapper)
+SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 1abbadd497e1..6d507462967a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -10,7 +10,7 @@
  *    Copyright (C) 1995  Linus Torvalds
  */
 
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
@@ -306,7 +306,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write)
 	 * interrupts again and then search the VMAs
 	 */
 	local_irq_enable();
-	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 	down_read(&mm->mmap_sem);
 
 	si_code = SEGV_MAPERR;
@@ -366,11 +366,11 @@ good_area:
 	}
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
         up_read(&mm->mmap_sem);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4df3570fe511..b940424f8ccc 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -16,7 +16,7 @@ config SUPERH
 	select HAVE_IOREMAP_PROT if MMU
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_EVENTS
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
diff --git a/arch/sh/include/asm/perf_counter.h b/arch/sh/include/asm/perf_counter.h
deleted file mode 100644
index d8e6bb9c0ccc..000000000000
--- a/arch/sh/include/asm/perf_counter.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_SH_PERF_COUNTER_H
-#define __ASM_SH_PERF_COUNTER_H
-
-/* SH only supports software counters through this interface. */
-static inline void set_perf_counter_pending(void) {}
-
-#define PERF_COUNTER_INDEX_OFFSET	0
-
-#endif /* __ASM_SH_PERF_COUNTER_H */
diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h
new file mode 100644
index 000000000000..11a302297ab7
--- /dev/null
+++ b/arch/sh/include/asm/perf_event.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_SH_PERF_EVENT_H
+#define __ASM_SH_PERF_EVENT_H
+
+/* SH only supports software events through this interface. */
+static inline void set_perf_event_pending(void) {}
+
+#define PERF_EVENT_INDEX_OFFSET	0
+
+#endif /* __ASM_SH_PERF_EVENT_H */
diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h
index 925dd40d9d55..f3fd1b9eb6b1 100644
--- a/arch/sh/include/asm/unistd_32.h
+++ b/arch/sh/include/asm/unistd_32.h
@@ -344,7 +344,7 @@
 #define __NR_preadv		333
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
-#define __NR_perf_counter_open	336
+#define __NR_perf_event_open	336
 
 #define NR_syscalls 337
 
diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h
index 2b84bc916bc5..343ce8f073ea 100644
--- a/arch/sh/include/asm/unistd_64.h
+++ b/arch/sh/include/asm/unistd_64.h
@@ -384,7 +384,7 @@
 #define __NR_preadv		361
 #define __NR_pwritev		362
 #define __NR_rt_tgsigqueueinfo	363
-#define __NR_perf_counter_open	364
+#define __NR_perf_event_open	364
 
 #ifdef __KERNEL__
 
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 16ba225ede89..19fd11dd9871 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -352,4 +352,4 @@ ENTRY(sys_call_table)
 	.long sys_preadv
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S
index af6fb7410c21..5bfde6c77498 100644
--- a/arch/sh/kernel/syscalls_64.S
+++ b/arch/sh/kernel/syscalls_64.S
@@ -390,4 +390,4 @@ sys_call_table:
 	.long sys_preadv
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c
index 781b413ff82d..47530104e0ad 100644
--- a/arch/sh/mm/fault_32.c
+++ b/arch/sh/mm/fault_32.c
@@ -15,7 +15,7 @@
 #include <linux/mm.h>
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <asm/io_trapped.h>
 #include <asm/system.h>
 #include <asm/mmu_context.h>
@@ -157,7 +157,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	if ((regs->sr & SR_IMASK) != SR_IMASK)
 		local_irq_enable();
 
-	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -208,11 +208,11 @@ survive:
 	}
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c
index 2dcc48528f7a..de0b0e881823 100644
--- a/arch/sh/mm/tlbflush_64.c
+++ b/arch/sh/mm/tlbflush_64.c
@@ -20,7 +20,7 @@
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/interrupt.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	/* Not an IO address, so reenable interrupts */
 	local_irq_enable();
 
-	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt or have no user
@@ -201,11 +201,11 @@ survive:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 86b82348b97c..97fca4695e0b 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,7 +25,7 @@ config SPARC
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select RTC_CLASS
 	select RTC_DRV_M48T59
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_EVENTS
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_API_DEBUG
 
@@ -47,7 +47,7 @@ config SPARC64
 	select RTC_DRV_BQ4802
 	select RTC_DRV_SUN4V
 	select RTC_DRV_STARFIRE
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_EVENTS
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h
deleted file mode 100644
index 5d7a8ca0e491..000000000000
--- a/arch/sparc/include/asm/perf_counter.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __ASM_SPARC_PERF_COUNTER_H
-#define __ASM_SPARC_PERF_COUNTER_H
-
-extern void set_perf_counter_pending(void);
-
-#define	PERF_COUNTER_INDEX_OFFSET	0
-
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-#else
-static inline void init_hw_perf_counters(void)	{ }
-#endif
-
-#endif
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h
new file mode 100644
index 000000000000..7e2669894ce8
--- /dev/null
+++ b/arch/sparc/include/asm/perf_event.h
@@ -0,0 +1,14 @@
+#ifndef __ASM_SPARC_PERF_EVENT_H
+#define __ASM_SPARC_PERF_EVENT_H
+
+extern void set_perf_event_pending(void);
+
+#define	PERF_EVENT_INDEX_OFFSET	0
+
+#ifdef CONFIG_PERF_EVENTS
+extern void init_hw_perf_events(void);
+#else
+static inline void init_hw_perf_events(void)	{ }
+#endif
+
+#endif
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index 706df669f3b8..42f2316c3eaa 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -395,7 +395,7 @@
 #define __NR_preadv		324
 #define __NR_pwritev		325
 #define __NR_rt_tgsigqueueinfo	326
-#define __NR_perf_counter_open	327
+#define __NR_perf_event_open	327
 
 #define NR_SYSCALLS		328
 
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 247cc620cee5..3a048fad7ee2 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -104,5 +104,5 @@ obj-$(CONFIG_AUDIT)     += audit.o
 audit--$(CONFIG_AUDIT)  := compat_audit.o
 obj-$(CONFIG_COMPAT)    += $(audit--y)
 
-pc--$(CONFIG_PERF_COUNTERS) := perf_counter.o
+pc--$(CONFIG_PERF_EVENTS) := perf_event.o
 obj-$(CONFIG_SPARC64)	+= $(pc--y)
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index 378eb53e0776..b129611590a4 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -19,7 +19,7 @@
 #include <linux/delay.h>
 #include <linux/smp.h>
 
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
 #include <asm/ptrace.h>
 #include <asm/local.h>
 #include <asm/pcr.h>
@@ -265,7 +265,7 @@ int __init nmi_init(void)
 		}
 	}
 	if (!err)
-		init_hw_perf_counters();
+		init_hw_perf_events();
 
 	return err;
 }
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index 68ff00107073..2d94e7a03af5 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
 #include <linux/init.h>
 #include <linux/irq.h>
 
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 
 #include <asm/pil.h>
 #include <asm/pcr.h>
@@ -15,7 +15,7 @@
 
 /* This code is shared between various users of the performance
  * counters.  Users will be oprofile, pseudo-NMI watchdog, and the
- * perf_counter support layer.
+ * perf_event support layer.
  */
 
 #define PCR_SUN4U_ENABLE	(PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE)
@@ -42,14 +42,14 @@ void deferred_pcr_work_irq(int irq, struct pt_regs *regs)
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
-#ifdef CONFIG_PERF_COUNTERS
-	perf_counter_do_pending();
+#ifdef CONFIG_PERF_EVENTS
+	perf_event_do_pending();
 #endif
 	irq_exit();
 	set_irq_regs(old_regs);
 }
 
-void set_perf_counter_pending(void)
+void set_perf_event_pending(void)
 {
 	set_softint(1 << PIL_DEFERRED_PCR_WORK);
 }
diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c
deleted file mode 100644
index b1265ce8a053..000000000000
--- a/arch/sparc/kernel/perf_counter.c
+++ /dev/null
@@ -1,556 +0,0 @@
-/* Performance counter support for sparc64.
- *
- * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
- *
- * This code is based almost entirely upon the x86 perf counter
- * code, which is:
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- */
-
-#include <linux/perf_counter.h>
-#include <linux/kprobes.h>
-#include <linux/kernel.h>
-#include <linux/kdebug.h>
-#include <linux/mutex.h>
-
-#include <asm/cpudata.h>
-#include <asm/atomic.h>
-#include <asm/nmi.h>
-#include <asm/pcr.h>
-
-/* Sparc64 chips have two performance counters, 32-bits each, with
- * overflow interrupts generated on transition from 0xffffffff to 0.
- * The counters are accessed in one go using a 64-bit register.
- *
- * Both counters are controlled using a single control register.  The
- * only way to stop all sampling is to clear all of the context (user,
- * supervisor, hypervisor) sampling enable bits.  But these bits apply
- * to both counters, thus the two counters can't be enabled/disabled
- * individually.
- *
- * The control register has two event fields, one for each of the two
- * counters.  It's thus nearly impossible to have one counter going
- * while keeping the other one stopped.  Therefore it is possible to
- * get overflow interrupts for counters not currently "in use" and
- * that condition must be checked in the overflow interrupt handler.
- *
- * So we use a hack, in that we program inactive counters with the
- * "sw_count0" and "sw_count1" events.  These count how many times
- * the instruction "sethi %hi(0xfc000), %g0" is executed.  It's an
- * unusual way to encode a NOP and therefore will not trigger in
- * normal code.
- */
-
-#define MAX_HWCOUNTERS			2
-#define MAX_PERIOD			((1UL << 32) - 1)
-
-#define PIC_UPPER_INDEX			0
-#define PIC_LOWER_INDEX			1
-
-struct cpu_hw_counters {
-	struct perf_counter	*counters[MAX_HWCOUNTERS];
-	unsigned long		used_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)];
-	unsigned long		active_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)];
-	int enabled;
-};
-DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, };
-
-struct perf_event_map {
-	u16	encoding;
-	u8	pic_mask;
-#define PIC_NONE	0x00
-#define PIC_UPPER	0x01
-#define PIC_LOWER	0x02
-};
-
-struct sparc_pmu {
-	const struct perf_event_map	*(*event_map)(int);
-	int				max_events;
-	int				upper_shift;
-	int				lower_shift;
-	int				event_mask;
-	int				hv_bit;
-	int				irq_bit;
-	int				upper_nop;
-	int				lower_nop;
-};
-
-static const struct perf_event_map ultra3i_perfmon_event_map[] = {
-	[PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER },
-	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER },
-	[PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER },
-	[PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER },
-};
-
-static const struct perf_event_map *ultra3i_event_map(int event)
-{
-	return &ultra3i_perfmon_event_map[event];
-}
-
-static const struct sparc_pmu ultra3i_pmu = {
-	.event_map	= ultra3i_event_map,
-	.max_events	= ARRAY_SIZE(ultra3i_perfmon_event_map),
-	.upper_shift	= 11,
-	.lower_shift	= 4,
-	.event_mask	= 0x3f,
-	.upper_nop	= 0x1c,
-	.lower_nop	= 0x14,
-};
-
-static const struct perf_event_map niagara2_perfmon_event_map[] = {
-	[PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER },
-	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER },
-	[PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER },
-	[PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER },
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER },
-	[PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER },
-};
-
-static const struct perf_event_map *niagara2_event_map(int event)
-{
-	return &niagara2_perfmon_event_map[event];
-}
-
-static const struct sparc_pmu niagara2_pmu = {
-	.event_map	= niagara2_event_map,
-	.max_events	= ARRAY_SIZE(niagara2_perfmon_event_map),
-	.upper_shift	= 19,
-	.lower_shift	= 6,
-	.event_mask	= 0xfff,
-	.hv_bit		= 0x8,
-	.irq_bit	= 0x03,
-	.upper_nop	= 0x220,
-	.lower_nop	= 0x220,
-};
-
-static const struct sparc_pmu *sparc_pmu __read_mostly;
-
-static u64 event_encoding(u64 event, int idx)
-{
-	if (idx == PIC_UPPER_INDEX)
-		event <<= sparc_pmu->upper_shift;
-	else
-		event <<= sparc_pmu->lower_shift;
-	return event;
-}
-
-static u64 mask_for_index(int idx)
-{
-	return event_encoding(sparc_pmu->event_mask, idx);
-}
-
-static u64 nop_for_index(int idx)
-{
-	return event_encoding(idx == PIC_UPPER_INDEX ?
-			      sparc_pmu->upper_nop :
-			      sparc_pmu->lower_nop, idx);
-}
-
-static inline void sparc_pmu_enable_counter(struct hw_perf_counter *hwc,
-					    int idx)
-{
-	u64 val, mask = mask_for_index(idx);
-
-	val = pcr_ops->read();
-	pcr_ops->write((val & ~mask) | hwc->config);
-}
-
-static inline void sparc_pmu_disable_counter(struct hw_perf_counter *hwc,
-					     int idx)
-{
-	u64 mask = mask_for_index(idx);
-	u64 nop = nop_for_index(idx);
-	u64 val = pcr_ops->read();
-
-	pcr_ops->write((val & ~mask) | nop);
-}
-
-void hw_perf_enable(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	u64 val;
-	int i;
-
-	if (cpuc->enabled)
-		return;
-
-	cpuc->enabled = 1;
-	barrier();
-
-	val = pcr_ops->read();
-
-	for (i = 0; i < MAX_HWCOUNTERS; i++) {
-		struct perf_counter *cp = cpuc->counters[i];
-		struct hw_perf_counter *hwc;
-
-		if (!cp)
-			continue;
-		hwc = &cp->hw;
-		val |= hwc->config_base;
-	}
-
-	pcr_ops->write(val);
-}
-
-void hw_perf_disable(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	u64 val;
-
-	if (!cpuc->enabled)
-		return;
-
-	cpuc->enabled = 0;
-
-	val = pcr_ops->read();
-	val &= ~(PCR_UTRACE | PCR_STRACE |
-		 sparc_pmu->hv_bit | sparc_pmu->irq_bit);
-	pcr_ops->write(val);
-}
-
-static u32 read_pmc(int idx)
-{
-	u64 val;
-
-	read_pic(val);
-	if (idx == PIC_UPPER_INDEX)
-		val >>= 32;
-
-	return val & 0xffffffff;
-}
-
-static void write_pmc(int idx, u64 val)
-{
-	u64 shift, mask, pic;
-
-	shift = 0;
-	if (idx == PIC_UPPER_INDEX)
-		shift = 32;
-
-	mask = ((u64) 0xffffffff) << shift;
-	val <<= shift;
-
-	read_pic(pic);
-	pic &= ~mask;
-	pic |= val;
-	write_pic(pic);
-}
-
-static int sparc_perf_counter_set_period(struct perf_counter *counter,
-					 struct hw_perf_counter *hwc, int idx)
-{
-	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
-	int ret = 0;
-
-	if (unlikely(left <= -period)) {
-		left = period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-		ret = 1;
-	}
-
-	if (unlikely(left <= 0)) {
-		left += period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-		ret = 1;
-	}
-	if (left > MAX_PERIOD)
-		left = MAX_PERIOD;
-
-	atomic64_set(&hwc->prev_count, (u64)-left);
-
-	write_pmc(idx, (u64)(-left) & 0xffffffff);
-
-	perf_counter_update_userpage(counter);
-
-	return ret;
-}
-
-static int sparc_pmu_enable(struct perf_counter *counter)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	struct hw_perf_counter *hwc = &counter->hw;
-	int idx = hwc->idx;
-
-	if (test_and_set_bit(idx, cpuc->used_mask))
-		return -EAGAIN;
-
-	sparc_pmu_disable_counter(hwc, idx);
-
-	cpuc->counters[idx] = counter;
-	set_bit(idx, cpuc->active_mask);
-
-	sparc_perf_counter_set_period(counter, hwc, idx);
-	sparc_pmu_enable_counter(hwc, idx);
-	perf_counter_update_userpage(counter);
-	return 0;
-}
-
-static u64 sparc_perf_counter_update(struct perf_counter *counter,
-				     struct hw_perf_counter *hwc, int idx)
-{
-	int shift = 64 - 32;
-	u64 prev_raw_count, new_raw_count;
-	s64 delta;
-
-again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
-	new_raw_count = read_pmc(idx);
-
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
-			     new_raw_count) != prev_raw_count)
-		goto again;
-
-	delta = (new_raw_count << shift) - (prev_raw_count << shift);
-	delta >>= shift;
-
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &hwc->period_left);
-
-	return new_raw_count;
-}
-
-static void sparc_pmu_disable(struct perf_counter *counter)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	struct hw_perf_counter *hwc = &counter->hw;
-	int idx = hwc->idx;
-
-	clear_bit(idx, cpuc->active_mask);
-	sparc_pmu_disable_counter(hwc, idx);
-
-	barrier();
-
-	sparc_perf_counter_update(counter, hwc, idx);
-	cpuc->counters[idx] = NULL;
-	clear_bit(idx, cpuc->used_mask);
-
-	perf_counter_update_userpage(counter);
-}
-
-static void sparc_pmu_read(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	sparc_perf_counter_update(counter, hwc, hwc->idx);
-}
-
-static void sparc_pmu_unthrottle(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	sparc_pmu_enable_counter(hwc, hwc->idx);
-}
-
-static atomic_t active_counters = ATOMIC_INIT(0);
-static DEFINE_MUTEX(pmc_grab_mutex);
-
-void perf_counter_grab_pmc(void)
-{
-	if (atomic_inc_not_zero(&active_counters))
-		return;
-
-	mutex_lock(&pmc_grab_mutex);
-	if (atomic_read(&active_counters) == 0) {
-		if (atomic_read(&nmi_active) > 0) {
-			on_each_cpu(stop_nmi_watchdog, NULL, 1);
-			BUG_ON(atomic_read(&nmi_active) != 0);
-		}
-		atomic_inc(&active_counters);
-	}
-	mutex_unlock(&pmc_grab_mutex);
-}
-
-void perf_counter_release_pmc(void)
-{
-	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_grab_mutex)) {
-		if (atomic_read(&nmi_active) == 0)
-			on_each_cpu(start_nmi_watchdog, NULL, 1);
-		mutex_unlock(&pmc_grab_mutex);
-	}
-}
-
-static void hw_perf_counter_destroy(struct perf_counter *counter)
-{
-	perf_counter_release_pmc();
-}
-
-static int __hw_perf_counter_init(struct perf_counter *counter)
-{
-	struct perf_counter_attr *attr = &counter->attr;
-	struct hw_perf_counter *hwc = &counter->hw;
-	const struct perf_event_map *pmap;
-	u64 enc;
-
-	if (atomic_read(&nmi_active) < 0)
-		return -ENODEV;
-
-	if (attr->type != PERF_TYPE_HARDWARE)
-		return -EOPNOTSUPP;
-
-	if (attr->config >= sparc_pmu->max_events)
-		return -EINVAL;
-
-	perf_counter_grab_pmc();
-	counter->destroy = hw_perf_counter_destroy;
-
-	/* We save the enable bits in the config_base.  So to
-	 * turn off sampling just write 'config', and to enable
-	 * things write 'config | config_base'.
-	 */
-	hwc->config_base = sparc_pmu->irq_bit;
-	if (!attr->exclude_user)
-		hwc->config_base |= PCR_UTRACE;
-	if (!attr->exclude_kernel)
-		hwc->config_base |= PCR_STRACE;
-	if (!attr->exclude_hv)
-		hwc->config_base |= sparc_pmu->hv_bit;
-
-	if (!hwc->sample_period) {
-		hwc->sample_period = MAX_PERIOD;
-		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
-	}
-
-	pmap = sparc_pmu->event_map(attr->config);
-
-	enc = pmap->encoding;
-	if (pmap->pic_mask & PIC_UPPER) {
-		hwc->idx = PIC_UPPER_INDEX;
-		enc <<= sparc_pmu->upper_shift;
-	} else {
-		hwc->idx = PIC_LOWER_INDEX;
-		enc <<= sparc_pmu->lower_shift;
-	}
-
-	hwc->config |= enc;
-	return 0;
-}
-
-static const struct pmu pmu = {
-	.enable		= sparc_pmu_enable,
-	.disable	= sparc_pmu_disable,
-	.read		= sparc_pmu_read,
-	.unthrottle	= sparc_pmu_unthrottle,
-};
-
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-	int err = __hw_perf_counter_init(counter);
-
-	if (err)
-		return ERR_PTR(err);
-	return &pmu;
-}
-
-void perf_counter_print_debug(void)
-{
-	unsigned long flags;
-	u64 pcr, pic;
-	int cpu;
-
-	if (!sparc_pmu)
-		return;
-
-	local_irq_save(flags);
-
-	cpu = smp_processor_id();
-
-	pcr = pcr_ops->read();
-	read_pic(pic);
-
-	pr_info("\n");
-	pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n",
-		cpu, pcr, pic);
-
-	local_irq_restore(flags);
-}
-
-static int __kprobes perf_counter_nmi_handler(struct notifier_block *self,
-					      unsigned long cmd, void *__args)
-{
-	struct die_args *args = __args;
-	struct perf_sample_data data;
-	struct cpu_hw_counters *cpuc;
-	struct pt_regs *regs;
-	int idx;
-
-	if (!atomic_read(&active_counters))
-		return NOTIFY_DONE;
-
-	switch (cmd) {
-	case DIE_NMI:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
-	regs = args->regs;
-
-	data.addr = 0;
-
-	cpuc = &__get_cpu_var(cpu_hw_counters);
-	for (idx = 0; idx < MAX_HWCOUNTERS; idx++) {
-		struct perf_counter *counter = cpuc->counters[idx];
-		struct hw_perf_counter *hwc;
-		u64 val;
-
-		if (!test_bit(idx, cpuc->active_mask))
-			continue;
-		hwc = &counter->hw;
-		val = sparc_perf_counter_update(counter, hwc, idx);
-		if (val & (1ULL << 31))
-			continue;
-
-		data.period = counter->hw.last_period;
-		if (!sparc_perf_counter_set_period(counter, hwc, idx))
-			continue;
-
-		if (perf_counter_overflow(counter, 1, &data, regs))
-			sparc_pmu_disable_counter(hwc, idx);
-	}
-
-	return NOTIFY_STOP;
-}
-
-static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-	.notifier_call		= perf_counter_nmi_handler,
-};
-
-static bool __init supported_pmu(void)
-{
-	if (!strcmp(sparc_pmu_type, "ultra3i")) {
-		sparc_pmu = &ultra3i_pmu;
-		return true;
-	}
-	if (!strcmp(sparc_pmu_type, "niagara2")) {
-		sparc_pmu = &niagara2_pmu;
-		return true;
-	}
-	return false;
-}
-
-void __init init_hw_perf_counters(void)
-{
-	pr_info("Performance counters: ");
-
-	if (!supported_pmu()) {
-		pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
-		return;
-	}
-
-	pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
-
-	/* All sparc64 PMUs currently have 2 counters.  But this simple
-	 * driver only supports one active counter at a time.
-	 */
-	perf_max_counters = 1;
-
-	register_die_notifier(&perf_counter_nmi_notifier);
-}
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
new file mode 100644
index 000000000000..2d6a1b10c81d
--- /dev/null
+++ b/arch/sparc/kernel/perf_event.c
@@ -0,0 +1,556 @@
+/* Performance event support for sparc64.
+ *
+ * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
+ *
+ * This code is based almost entirely upon the x86 perf event
+ * code, which is:
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kprobes.h>
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+
+#include <asm/cpudata.h>
+#include <asm/atomic.h>
+#include <asm/nmi.h>
+#include <asm/pcr.h>
+
+/* Sparc64 chips have two performance counters, 32-bits each, with
+ * overflow interrupts generated on transition from 0xffffffff to 0.
+ * The counters are accessed in one go using a 64-bit register.
+ *
+ * Both counters are controlled using a single control register.  The
+ * only way to stop all sampling is to clear all of the context (user,
+ * supervisor, hypervisor) sampling enable bits.  But these bits apply
+ * to both counters, thus the two counters can't be enabled/disabled
+ * individually.
+ *
+ * The control register has two event fields, one for each of the two
+ * counters.  It's thus nearly impossible to have one counter going
+ * while keeping the other one stopped.  Therefore it is possible to
+ * get overflow interrupts for counters not currently "in use" and
+ * that condition must be checked in the overflow interrupt handler.
+ *
+ * So we use a hack, in that we program inactive counters with the
+ * "sw_count0" and "sw_count1" events.  These count how many times
+ * the instruction "sethi %hi(0xfc000), %g0" is executed.  It's an
+ * unusual way to encode a NOP and therefore will not trigger in
+ * normal code.
+ */
+
+#define MAX_HWEVENTS			2
+#define MAX_PERIOD			((1UL << 32) - 1)
+
+#define PIC_UPPER_INDEX			0
+#define PIC_LOWER_INDEX			1
+
+struct cpu_hw_events {
+	struct perf_event	*events[MAX_HWEVENTS];
+	unsigned long		used_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
+	unsigned long		active_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
+	int enabled;
+};
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, };
+
+struct perf_event_map {
+	u16	encoding;
+	u8	pic_mask;
+#define PIC_NONE	0x00
+#define PIC_UPPER	0x01
+#define PIC_LOWER	0x02
+};
+
+struct sparc_pmu {
+	const struct perf_event_map	*(*event_map)(int);
+	int				max_events;
+	int				upper_shift;
+	int				lower_shift;
+	int				event_mask;
+	int				hv_bit;
+	int				irq_bit;
+	int				upper_nop;
+	int				lower_nop;
+};
+
+static const struct perf_event_map ultra3i_perfmon_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER },
+};
+
+static const struct perf_event_map *ultra3i_event_map(int event_id)
+{
+	return &ultra3i_perfmon_event_map[event_id];
+}
+
+static const struct sparc_pmu ultra3i_pmu = {
+	.event_map	= ultra3i_event_map,
+	.max_events	= ARRAY_SIZE(ultra3i_perfmon_event_map),
+	.upper_shift	= 11,
+	.lower_shift	= 4,
+	.event_mask	= 0x3f,
+	.upper_nop	= 0x1c,
+	.lower_nop	= 0x14,
+};
+
+static const struct perf_event_map niagara2_perfmon_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER },
+};
+
+static const struct perf_event_map *niagara2_event_map(int event_id)
+{
+	return &niagara2_perfmon_event_map[event_id];
+}
+
+static const struct sparc_pmu niagara2_pmu = {
+	.event_map	= niagara2_event_map,
+	.max_events	= ARRAY_SIZE(niagara2_perfmon_event_map),
+	.upper_shift	= 19,
+	.lower_shift	= 6,
+	.event_mask	= 0xfff,
+	.hv_bit		= 0x8,
+	.irq_bit	= 0x03,
+	.upper_nop	= 0x220,
+	.lower_nop	= 0x220,
+};
+
+static const struct sparc_pmu *sparc_pmu __read_mostly;
+
+static u64 event_encoding(u64 event_id, int idx)
+{
+	if (idx == PIC_UPPER_INDEX)
+		event_id <<= sparc_pmu->upper_shift;
+	else
+		event_id <<= sparc_pmu->lower_shift;
+	return event_id;
+}
+
+static u64 mask_for_index(int idx)
+{
+	return event_encoding(sparc_pmu->event_mask, idx);
+}
+
+static u64 nop_for_index(int idx)
+{
+	return event_encoding(idx == PIC_UPPER_INDEX ?
+			      sparc_pmu->upper_nop :
+			      sparc_pmu->lower_nop, idx);
+}
+
+static inline void sparc_pmu_enable_event(struct hw_perf_event *hwc,
+					    int idx)
+{
+	u64 val, mask = mask_for_index(idx);
+
+	val = pcr_ops->read();
+	pcr_ops->write((val & ~mask) | hwc->config);
+}
+
+static inline void sparc_pmu_disable_event(struct hw_perf_event *hwc,
+					     int idx)
+{
+	u64 mask = mask_for_index(idx);
+	u64 nop = nop_for_index(idx);
+	u64 val = pcr_ops->read();
+
+	pcr_ops->write((val & ~mask) | nop);
+}
+
+void hw_perf_enable(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val;
+	int i;
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
+	val = pcr_ops->read();
+
+	for (i = 0; i < MAX_HWEVENTS; i++) {
+		struct perf_event *cp = cpuc->events[i];
+		struct hw_perf_event *hwc;
+
+		if (!cp)
+			continue;
+		hwc = &cp->hw;
+		val |= hwc->config_base;
+	}
+
+	pcr_ops->write(val);
+}
+
+void hw_perf_disable(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val;
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+
+	val = pcr_ops->read();
+	val &= ~(PCR_UTRACE | PCR_STRACE |
+		 sparc_pmu->hv_bit | sparc_pmu->irq_bit);
+	pcr_ops->write(val);
+}
+
+static u32 read_pmc(int idx)
+{
+	u64 val;
+
+	read_pic(val);
+	if (idx == PIC_UPPER_INDEX)
+		val >>= 32;
+
+	return val & 0xffffffff;
+}
+
+static void write_pmc(int idx, u64 val)
+{
+	u64 shift, mask, pic;
+
+	shift = 0;
+	if (idx == PIC_UPPER_INDEX)
+		shift = 32;
+
+	mask = ((u64) 0xffffffff) << shift;
+	val <<= shift;
+
+	read_pic(pic);
+	pic &= ~mask;
+	pic |= val;
+	write_pic(pic);
+}
+
+static int sparc_perf_event_set_period(struct perf_event *event,
+					 struct hw_perf_event *hwc, int idx)
+{
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int ret = 0;
+
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+	if (left > MAX_PERIOD)
+		left = MAX_PERIOD;
+
+	atomic64_set(&hwc->prev_count, (u64)-left);
+
+	write_pmc(idx, (u64)(-left) & 0xffffffff);
+
+	perf_event_update_userpage(event);
+
+	return ret;
+}
+
+static int sparc_pmu_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	if (test_and_set_bit(idx, cpuc->used_mask))
+		return -EAGAIN;
+
+	sparc_pmu_disable_event(hwc, idx);
+
+	cpuc->events[idx] = event;
+	set_bit(idx, cpuc->active_mask);
+
+	sparc_perf_event_set_period(event, hwc, idx);
+	sparc_pmu_enable_event(hwc, idx);
+	perf_event_update_userpage(event);
+	return 0;
+}
+
+static u64 sparc_perf_event_update(struct perf_event *event,
+				     struct hw_perf_event *hwc, int idx)
+{
+	int shift = 64 - 32;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
+
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	new_raw_count = read_pmc(idx);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+			     new_raw_count) != prev_raw_count)
+		goto again;
+
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	atomic64_add(delta, &event->count);
+	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
+}
+
+static void sparc_pmu_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	clear_bit(idx, cpuc->active_mask);
+	sparc_pmu_disable_event(hwc, idx);
+
+	barrier();
+
+	sparc_perf_event_update(event, hwc, idx);
+	cpuc->events[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+
+	perf_event_update_userpage(event);
+}
+
+static void sparc_pmu_read(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	sparc_perf_event_update(event, hwc, hwc->idx);
+}
+
+static void sparc_pmu_unthrottle(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	sparc_pmu_enable_event(hwc, hwc->idx);
+}
+
+static atomic_t active_events = ATOMIC_INIT(0);
+static DEFINE_MUTEX(pmc_grab_mutex);
+
+void perf_event_grab_pmc(void)
+{
+	if (atomic_inc_not_zero(&active_events))
+		return;
+
+	mutex_lock(&pmc_grab_mutex);
+	if (atomic_read(&active_events) == 0) {
+		if (atomic_read(&nmi_active) > 0) {
+			on_each_cpu(stop_nmi_watchdog, NULL, 1);
+			BUG_ON(atomic_read(&nmi_active) != 0);
+		}
+		atomic_inc(&active_events);
+	}
+	mutex_unlock(&pmc_grab_mutex);
+}
+
+void perf_event_release_pmc(void)
+{
+	if (atomic_dec_and_mutex_lock(&active_events, &pmc_grab_mutex)) {
+		if (atomic_read(&nmi_active) == 0)
+			on_each_cpu(start_nmi_watchdog, NULL, 1);
+		mutex_unlock(&pmc_grab_mutex);
+	}
+}
+
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+	perf_event_release_pmc();
+}
+
+static int __hw_perf_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	const struct perf_event_map *pmap;
+	u64 enc;
+
+	if (atomic_read(&nmi_active) < 0)
+		return -ENODEV;
+
+	if (attr->type != PERF_TYPE_HARDWARE)
+		return -EOPNOTSUPP;
+
+	if (attr->config >= sparc_pmu->max_events)
+		return -EINVAL;
+
+	perf_event_grab_pmc();
+	event->destroy = hw_perf_event_destroy;
+
+	/* We save the enable bits in the config_base.  So to
+	 * turn off sampling just write 'config', and to enable
+	 * things write 'config | config_base'.
+	 */
+	hwc->config_base = sparc_pmu->irq_bit;
+	if (!attr->exclude_user)
+		hwc->config_base |= PCR_UTRACE;
+	if (!attr->exclude_kernel)
+		hwc->config_base |= PCR_STRACE;
+	if (!attr->exclude_hv)
+		hwc->config_base |= sparc_pmu->hv_bit;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = MAX_PERIOD;
+		hwc->last_period = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	}
+
+	pmap = sparc_pmu->event_map(attr->config);
+
+	enc = pmap->encoding;
+	if (pmap->pic_mask & PIC_UPPER) {
+		hwc->idx = PIC_UPPER_INDEX;
+		enc <<= sparc_pmu->upper_shift;
+	} else {
+		hwc->idx = PIC_LOWER_INDEX;
+		enc <<= sparc_pmu->lower_shift;
+	}
+
+	hwc->config |= enc;
+	return 0;
+}
+
+static const struct pmu pmu = {
+	.enable		= sparc_pmu_enable,
+	.disable	= sparc_pmu_disable,
+	.read		= sparc_pmu_read,
+	.unthrottle	= sparc_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+	int err = __hw_perf_event_init(event);
+
+	if (err)
+		return ERR_PTR(err);
+	return &pmu;
+}
+
+void perf_event_print_debug(void)
+{
+	unsigned long flags;
+	u64 pcr, pic;
+	int cpu;
+
+	if (!sparc_pmu)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+
+	pcr = pcr_ops->read();
+	read_pic(pic);
+
+	pr_info("\n");
+	pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n",
+		cpu, pcr, pic);
+
+	local_irq_restore(flags);
+}
+
+static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
+					      unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct pt_regs *regs;
+	int idx;
+
+	if (!atomic_read(&active_events))
+		return NOTIFY_DONE;
+
+	switch (cmd) {
+	case DIE_NMI:
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	regs = args->regs;
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+	for (idx = 0; idx < MAX_HWEVENTS; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		struct hw_perf_event *hwc;
+		u64 val;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		hwc = &event->hw;
+		val = sparc_perf_event_update(event, hwc, idx);
+		if (val & (1ULL << 31))
+			continue;
+
+		data.period = event->hw.last_period;
+		if (!sparc_perf_event_set_period(event, hwc, idx))
+			continue;
+
+		if (perf_event_overflow(event, 1, &data, regs))
+			sparc_pmu_disable_event(hwc, idx);
+	}
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_event_nmi_notifier = {
+	.notifier_call		= perf_event_nmi_handler,
+};
+
+static bool __init supported_pmu(void)
+{
+	if (!strcmp(sparc_pmu_type, "ultra3i")) {
+		sparc_pmu = &ultra3i_pmu;
+		return true;
+	}
+	if (!strcmp(sparc_pmu_type, "niagara2")) {
+		sparc_pmu = &niagara2_pmu;
+		return true;
+	}
+	return false;
+}
+
+void __init init_hw_perf_events(void)
+{
+	pr_info("Performance events: ");
+
+	if (!supported_pmu()) {
+		pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
+		return;
+	}
+
+	pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
+
+	/* All sparc64 PMUs currently have 2 events.  But this simple
+	 * driver only supports one active event at a time.
+	 */
+	perf_max_events = 1;
+
+	register_die_notifier(&perf_event_nmi_notifier);
+}
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index 04181577cb65..0f1658d37490 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -82,5 +82,5 @@ sys_call_table:
 /*310*/	.long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 /*315*/	.long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open
+/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
 
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 91b06b7f7acf..009825f6e73c 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -83,7 +83,7 @@ sys_call_table32:
 /*310*/	.word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
 	.word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv
-	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_counter_open
+	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open
 
 #endif /* CONFIG_COMPAT */
 
@@ -158,4 +158,4 @@ sys_call_table:
 /*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 	.word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-	.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open
+	.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 51c59015b280..e4ff5d1280ca 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,7 +24,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
-	select HAVE_PERF_COUNTERS if (!M386 && !M486)
+	select HAVE_PERF_EVENTS if (!M386 && !M486)
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ba331bfd1112..74619c4f9fda 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -831,5 +831,5 @@ ia32_sys_call_table:
 	.quad compat_sys_preadv
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
-	.quad sys_perf_counter_open
+	.quad sys_perf_event_open
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 5e3f2044f0d3..f5693c81a1db 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
 BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
deleted file mode 100644
index e7b7c938ae27..000000000000
--- a/arch/x86/include/asm/perf_counter.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef _ASM_X86_PERF_COUNTER_H
-#define _ASM_X86_PERF_COUNTER_H
-
-/*
- * Performance counter hw details:
- */
-
-#define X86_PMC_MAX_GENERIC					8
-#define X86_PMC_MAX_FIXED					3
-
-#define X86_PMC_IDX_GENERIC				        0
-#define X86_PMC_IDX_FIXED				       32
-#define X86_PMC_IDX_MAX					       64
-
-#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
-
-/*
- * Includes eventsel and unit mask as well:
- */
-#define ARCH_PERFMON_EVENT_MASK				    0xffff
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
-
-/*
- * Intel "Architectural Performance Monitoring" CPUID
- * detection/enumeration details:
- */
-union cpuid10_eax {
-	struct {
-		unsigned int version_id:8;
-		unsigned int num_counters:8;
-		unsigned int bit_width:8;
-		unsigned int mask_length:8;
-	} split;
-	unsigned int full;
-};
-
-union cpuid10_edx {
-	struct {
-		unsigned int num_counters_fixed:4;
-		unsigned int reserved:28;
-	} split;
-	unsigned int full;
-};
-
-
-/*
- * Fixed-purpose performance counters:
- */
-
-/*
- * All 3 fixed-mode PMCs are configured via this single MSR:
- */
-#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL			0x38d
-
-/*
- * The counts are available in three separate MSRs:
- */
-
-/* Instr_Retired.Any: */
-#define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
-#define X86_PMC_IDX_FIXED_INSTRUCTIONS			(X86_PMC_IDX_FIXED + 0)
-
-/* CPU_CLK_Unhalted.Core: */
-#define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
-#define X86_PMC_IDX_FIXED_CPU_CYCLES			(X86_PMC_IDX_FIXED + 1)
-
-/* CPU_CLK_Unhalted.Ref: */
-#define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
-#define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
-
-/*
- * We model BTS tracing as another fixed-mode PMC.
- *
- * We choose a value in the middle of the fixed counter range, since lower
- * values are used by actual fixed counters and higher values are used
- * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
- */
-#define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16)
-
-
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(void);
-
-#define PERF_COUNTER_INDEX_OFFSET			0
-
-#else
-static inline void init_hw_perf_counters(void)		{ }
-static inline void perf_counters_lapic_init(void)	{ }
-#endif
-
-#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
new file mode 100644
index 000000000000..ad7ce3fd5065
--- /dev/null
+++ b/arch/x86/include/asm/perf_event.h
@@ -0,0 +1,108 @@
+#ifndef _ASM_X86_PERF_EVENT_H
+#define _ASM_X86_PERF_EVENT_H
+
+/*
+ * Performance event hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_FIXED					3
+
+#define X86_PMC_IDX_GENERIC				        0
+#define X86_PMC_IDX_FIXED				       32
+#define X86_PMC_IDX_MAX					       64
+
+#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
+
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK				    0xffff
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+	struct {
+		unsigned int version_id:8;
+		unsigned int num_events:8;
+		unsigned int bit_width:8;
+		unsigned int mask_length:8;
+	} split;
+	unsigned int full;
+};
+
+union cpuid10_edx {
+	struct {
+		unsigned int num_events_fixed:4;
+		unsigned int reserved:28;
+	} split;
+	unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance events:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL			0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS			(X86_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES			(X86_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
+
+/*
+ * We model BTS tracing as another fixed-mode PMC.
+ *
+ * We choose a value in the middle of the fixed event range, since lower
+ * values are used by actual fixed events and higher values are used
+ * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
+ */
+#define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16)
+
+
+#ifdef CONFIG_PERF_EVENTS
+extern void init_hw_perf_events(void);
+extern void perf_events_lapic_init(void);
+
+#define PERF_EVENT_INDEX_OFFSET			0
+
+#else
+static inline void init_hw_perf_events(void)		{ }
+static inline void perf_events_lapic_init(void)	{ }
+#endif
+
+#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 8deaada61bc8..6fb3c209a7e3 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -341,7 +341,7 @@
 #define __NR_preadv		333
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
-#define __NR_perf_counter_open	336
+#define __NR_perf_event_open	336
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index b9f3c60de5f7..8d3ad0adbc68 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
 __SYSCALL(__NR_pwritev, sys_pwritev)
 #define __NR_rt_tgsigqueueinfo			297
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_counter_open			298
-__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
+#define __NR_perf_event_open			298
+__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a34601f52987..754174d09deb 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,7 +14,7 @@
  *	Mikael Pettersson	:	PM converted to driver model.
  */
 
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/acpi_pmtmr.h>
@@ -35,7 +35,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
 #include <asm/atomic.h>
@@ -1189,7 +1189,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
-	perf_counters_lapic_init();
+	perf_events_lapic_init();
 
 	preempt_disable();
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 8dd30638fe44..68537e957a9b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o
+obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2fea97eccf77..cc25c2b4a567 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,7 +13,7 @@
 #include <linux/io.h>
 
 #include <asm/stackprotector.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
 #include <asm/mmu_context.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
@@ -869,7 +869,7 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
-	init_hw_perf_counters();
+	init_hw_perf_events();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
deleted file mode 100644
index b1f115696c84..000000000000
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ /dev/null
@@ -1,2298 +0,0 @@
-/*
- * Performance counter x86 architecture code
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_counter.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/highmem.h>
-#include <linux/cpu.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-
-static u64 perf_counter_mask __read_mostly;
-
-/* The maximal number of PEBS counters: */
-#define MAX_PEBS_COUNTERS	4
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE		24
-
-/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)
-
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)
-
-
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR			(1 << 6)
-#define X86_DEBUGCTL_BTS		(1 << 7)
-#define X86_DEBUGCTL_BTINT		(1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_counter_reset[MAX_PEBS_COUNTERS];
-};
-
-struct cpu_hw_counters {
-	struct perf_counter	*counters[X86_PMC_IDX_MAX];
-	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		interrupts;
-	int			enabled;
-	struct debug_store	*ds;
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
-	const char	*name;
-	int		version;
-	int		(*handle_irq)(struct pt_regs *);
-	void		(*disable_all)(void);
-	void		(*enable_all)(void);
-	void		(*enable)(struct hw_perf_counter *, int);
-	void		(*disable)(struct hw_perf_counter *, int);
-	unsigned	eventsel;
-	unsigned	perfctr;
-	u64		(*event_map)(int);
-	u64		(*raw_event)(u64);
-	int		max_events;
-	int		num_counters;
-	int		num_counters_fixed;
-	int		counter_bits;
-	u64		counter_mask;
-	int		apic;
-	u64		max_period;
-	u64		intel_ctrl;
-	void		(*enable_bts)(u64 config);
-	void		(*disable_bts)(void);
-};
-
-static struct x86_pmu x86_pmu __read_mostly;
-
-static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
-	.enabled = 1,
-};
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0079,
-  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0f2e,
-  [PERF_COUNT_HW_CACHE_MISSES]		= 0x012e,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
-  [PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
-	return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Counter setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_COUNTER			0x0000002EULL
-
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define P6_EVNTSEL_INV_MASK		0x00800000ULL
-#define P6_EVNTSEL_COUNTER_MASK		0xFF000000ULL
-
-#define P6_EVNTSEL_MASK			\
-	(P6_EVNTSEL_EVENT_MASK |	\
-	 P6_EVNTSEL_UNIT_MASK  |	\
-	 P6_EVNTSEL_EDGE_MASK  |	\
-	 P6_EVNTSEL_INV_MASK   |	\
-	 P6_EVNTSEL_COUNTER_MASK)
-
-	return hw_event & P6_EVNTSEL_MASK;
-}
-
-
-/*
- * Intel PerfMon v3. Used on Core2 and later.
- */
-static const u64 intel_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
-  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
-  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
-  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
-};
-
-static u64 intel_pmu_event_map(int hw_event)
-{
-	return intel_perfmon_event_map[hw_event];
-}
-
-/*
- * Generalized hw caching related hw_event table, filled
- * in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-static u64 __read_mostly hw_cache_event_ids
-				[PERF_COUNT_HW_CACHE_MAX]
-				[PERF_COUNT_HW_CACHE_OP_MAX]
-				[PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-static const u64 nehalem_hw_cache_event_ids
-				[PERF_COUNT_HW_CACHE_MAX]
-				[PERF_COUNT_HW_CACHE_OP_MAX]
-				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
-		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
-		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-	},
- },
- [ C(L1I ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0,
-		[ C(RESULT_MISS)   ] = 0x0,
-	},
- },
- [ C(LL  ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
-		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
-		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
-		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
-	},
- },
- [ C(DTLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
-		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
-		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0,
-		[ C(RESULT_MISS)   ] = 0x0,
-	},
- },
- [ C(ITLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
- [ C(BPU ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
-};
-
-static const u64 core2_hw_cache_event_ids
-				[PERF_COUNT_HW_CACHE_MAX]
-				[PERF_COUNT_HW_CACHE_OP_MAX]
-				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
-		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
-		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(L1I ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(LL  ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(DTLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
-		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
-		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(ITLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
- [ C(BPU ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
-};
-
-static const u64 atom_hw_cache_event_ids
-				[PERF_COUNT_HW_CACHE_MAX]
-				[PERF_COUNT_HW_CACHE_OP_MAX]
-				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
-		[ C(RESULT_MISS)   ] = 0,
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
-		[ C(RESULT_MISS)   ] = 0,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(L1I ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
-		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(LL  ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(DTLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
-		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
-		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(ITLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
- [ C(BPU ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
-};
-
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
-#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK		\
-	(CORE_EVNTSEL_EVENT_MASK |	\
-	 CORE_EVNTSEL_UNIT_MASK  |	\
-	 CORE_EVNTSEL_EDGE_MASK  |	\
-	 CORE_EVNTSEL_INV_MASK  |	\
-	 CORE_EVNTSEL_COUNTER_MASK)
-
-	return hw_event & CORE_EVNTSEL_MASK;
-}
-
-static const u64 amd_hw_cache_event_ids
-				[PERF_COUNT_HW_CACHE_MAX]
-				[PERF_COUNT_HW_CACHE_OP_MAX]
-				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
-		[ C(RESULT_MISS)   ] = 0,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
-		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
-	},
- },
- [ C(L1I ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
-		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(LL  ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
-		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
-		[ C(RESULT_MISS)   ] = 0,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(DTLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
-	},
- },
- [ C(ITLB) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
-		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
- [ C(BPU ) ] = {
-	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
-		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
-	},
-	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
-	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
-	},
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
-  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
-  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
-	return amd_perfmon_event_map[hw_event];
-}
-
-static u64 amd_pmu_raw_event(u64 hw_event)
-{
-#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
-#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
-#define K7_EVNTSEL_INV_MASK	0x000800000ULL
-#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK			\
-	(K7_EVNTSEL_EVENT_MASK |	\
-	 K7_EVNTSEL_UNIT_MASK  |	\
-	 K7_EVNTSEL_EDGE_MASK  |	\
-	 K7_EVNTSEL_INV_MASK   |	\
-	 K7_EVNTSEL_COUNTER_MASK)
-
-	return hw_event & K7_EVNTSEL_MASK;
-}
-
-/*
- * Propagate counter elapsed time into the generic counter.
- * Can only be executed on the CPU where the counter is active.
- * Returns the delta events processed.
- */
-static u64
-x86_perf_counter_update(struct perf_counter *counter,
-			struct hw_perf_counter *hwc, int idx)
-{
-	int shift = 64 - x86_pmu.counter_bits;
-	u64 prev_raw_count, new_raw_count;
-	s64 delta;
-
-	if (idx == X86_PMC_IDX_FIXED_BTS)
-		return 0;
-
-	/*
-	 * Careful: an NMI might modify the previous counter value.
-	 *
-	 * Our tactic to handle this is to first atomically read and
-	 * exchange a new raw count - then add that new-prev delta
-	 * count to the generic counter atomically:
-	 */
-again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
-	rdmsrl(hwc->counter_base + idx, new_raw_count);
-
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
-					new_raw_count) != prev_raw_count)
-		goto again;
-
-	/*
-	 * Now we have the new raw value and have updated the prev
-	 * timestamp already. We can now calculate the elapsed delta
-	 * (counter-)time and add that to the generic counter.
-	 *
-	 * Careful, not all hw sign-extends above the physical width
-	 * of the count.
-	 */
-	delta = (new_raw_count << shift) - (prev_raw_count << shift);
-	delta >>= shift;
-
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &hwc->period_left);
-
-	return new_raw_count;
-}
-
-static atomic_t active_counters;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-static bool reserve_pmc_hardware(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	int i;
-
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		disable_lapic_nmi_watchdog();
-
-	for (i = 0; i < x86_pmu.num_counters; i++) {
-		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
-			goto perfctr_fail;
-	}
-
-	for (i = 0; i < x86_pmu.num_counters; i++) {
-		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
-			goto eventsel_fail;
-	}
-#endif
-
-	return true;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-eventsel_fail:
-	for (i--; i >= 0; i--)
-		release_evntsel_nmi(x86_pmu.eventsel + i);
-
-	i = x86_pmu.num_counters;
-
-perfctr_fail:
-	for (i--; i >= 0; i--)
-		release_perfctr_nmi(x86_pmu.perfctr + i);
-
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		enable_lapic_nmi_watchdog();
-
-	return false;
-#endif
-}
-
-static void release_pmc_hardware(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	int i;
-
-	for (i = 0; i < x86_pmu.num_counters; i++) {
-		release_perfctr_nmi(x86_pmu.perfctr + i);
-		release_evntsel_nmi(x86_pmu.eventsel + i);
-	}
-
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		enable_lapic_nmi_watchdog();
-#endif
-}
-
-static inline bool bts_available(void)
-{
-	return x86_pmu.enable_bts != NULL;
-}
-
-static inline void init_debug_store_on_cpu(int cpu)
-{
-	struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
-
-	if (!ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-		     (u32)((u64)(unsigned long)ds),
-		     (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static inline void fini_debug_store_on_cpu(int cpu)
-{
-	if (!per_cpu(cpu_hw_counters, cpu).ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_bts_hardware(void)
-{
-	int cpu;
-
-	if (!bts_available())
-		return;
-
-	get_online_cpus();
-
-	for_each_online_cpu(cpu)
-		fini_debug_store_on_cpu(cpu);
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
-
-		if (!ds)
-			continue;
-
-		per_cpu(cpu_hw_counters, cpu).ds = NULL;
-
-		kfree((void *)(unsigned long)ds->bts_buffer_base);
-		kfree(ds);
-	}
-
-	put_online_cpus();
-}
-
-static int reserve_bts_hardware(void)
-{
-	int cpu, err = 0;
-
-	if (!bts_available())
-		return 0;
-
-	get_online_cpus();
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds;
-		void *buffer;
-
-		err = -ENOMEM;
-		buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
-		if (unlikely(!buffer))
-			break;
-
-		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-		if (unlikely(!ds)) {
-			kfree(buffer);
-			break;
-		}
-
-		ds->bts_buffer_base = (u64)(unsigned long)buffer;
-		ds->bts_index = ds->bts_buffer_base;
-		ds->bts_absolute_maximum =
-			ds->bts_buffer_base + BTS_BUFFER_SIZE;
-		ds->bts_interrupt_threshold =
-			ds->bts_absolute_maximum - BTS_OVFL_TH;
-
-		per_cpu(cpu_hw_counters, cpu).ds = ds;
-		err = 0;
-	}
-
-	if (err)
-		release_bts_hardware();
-	else {
-		for_each_online_cpu(cpu)
-			init_debug_store_on_cpu(cpu);
-	}
-
-	put_online_cpus();
-
-	return err;
-}
-
-static void hw_perf_counter_destroy(struct perf_counter *counter)
-{
-	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
-		release_pmc_hardware();
-		release_bts_hardware();
-		mutex_unlock(&pmc_reserve_mutex);
-	}
-}
-
-static inline int x86_pmu_initialized(void)
-{
-	return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
-{
-	unsigned int cache_type, cache_op, cache_result;
-	u64 config, val;
-
-	config = attr->config;
-
-	cache_type = (config >>  0) & 0xff;
-	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
-		return -EINVAL;
-
-	cache_op = (config >>  8) & 0xff;
-	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
-		return -EINVAL;
-
-	cache_result = (config >> 16) & 0xff;
-	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-		return -EINVAL;
-
-	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
-	if (val == 0)
-		return -ENOENT;
-
-	if (val == -1)
-		return -EINVAL;
-
-	hwc->config |= val;
-
-	return 0;
-}
-
-static void intel_pmu_enable_bts(u64 config)
-{
-	unsigned long debugctlmsr;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr |= X86_DEBUGCTL_TR;
-	debugctlmsr |= X86_DEBUGCTL_BTS;
-	debugctlmsr |= X86_DEBUGCTL_BTINT;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	unsigned long debugctlmsr;
-
-	if (!cpuc->ds)
-		return;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr &=
-		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __hw_perf_counter_init(struct perf_counter *counter)
-{
-	struct perf_counter_attr *attr = &counter->attr;
-	struct hw_perf_counter *hwc = &counter->hw;
-	u64 config;
-	int err;
-
-	if (!x86_pmu_initialized())
-		return -ENODEV;
-
-	err = 0;
-	if (!atomic_inc_not_zero(&active_counters)) {
-		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&active_counters) == 0) {
-			if (!reserve_pmc_hardware())
-				err = -EBUSY;
-			else
-				err = reserve_bts_hardware();
-		}
-		if (!err)
-			atomic_inc(&active_counters);
-		mutex_unlock(&pmc_reserve_mutex);
-	}
-	if (err)
-		return err;
-
-	counter->destroy = hw_perf_counter_destroy;
-
-	/*
-	 * Generate PMC IRQs:
-	 * (keep 'enabled' bit clear for now)
-	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
-	/*
-	 * Count user and OS events unless requested not to.
-	 */
-	if (!attr->exclude_user)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!attr->exclude_kernel)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-
-	if (!hwc->sample_period) {
-		hwc->sample_period = x86_pmu.max_period;
-		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
-	} else {
-		/*
-		 * If we have a PMU initialized but no APIC
-		 * interrupts, we cannot sample hardware
-		 * counters (user-space has to fall back and
-		 * sample via a hrtimer based software counter):
-		 */
-		if (!x86_pmu.apic)
-			return -EOPNOTSUPP;
-	}
-
-	/*
-	 * Raw hw_event type provide the config in the hw_event structure
-	 */
-	if (attr->type == PERF_TYPE_RAW) {
-		hwc->config |= x86_pmu.raw_event(attr->config);
-		return 0;
-	}
-
-	if (attr->type == PERF_TYPE_HW_CACHE)
-		return set_ext_hw_attr(hwc, attr);
-
-	if (attr->config >= x86_pmu.max_events)
-		return -EINVAL;
-
-	/*
-	 * The generic map:
-	 */
-	config = x86_pmu.event_map(attr->config);
-
-	if (config == 0)
-		return -ENOENT;
-
-	if (config == -1LL)
-		return -EINVAL;
-
-	/*
-	 * Branch tracing:
-	 */
-	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
-	    (hwc->sample_period == 1)) {
-		/* BTS is not supported by this architecture. */
-		if (!bts_available())
-			return -EOPNOTSUPP;
-
-		/* BTS is currently only allowed for user-mode. */
-		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
-			return -EOPNOTSUPP;
-	}
-
-	hwc->config |= config;
-
-	return 0;
-}
-
-static void p6_pmu_disable_all(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	u64 val;
-
-	if (!cpuc->enabled)
-		return;
-
-	cpuc->enabled = 0;
-	barrier();
-
-	/* p6 only has one enable register */
-	rdmsrl(MSR_P6_EVNTSEL0, val);
-	val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void intel_pmu_disable_all(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	if (!cpuc->enabled)
-		return;
-
-	cpuc->enabled = 0;
-	barrier();
-
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-		intel_pmu_disable_bts();
-}
-
-static void amd_pmu_disable_all(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	int idx;
-
-	if (!cpuc->enabled)
-		return;
-
-	cpuc->enabled = 0;
-	/*
-	 * ensure we write the disable before we start disabling the
-	 * counters proper, so that amd_pmu_enable_counter() does the
-	 * right thing.
-	 */
-	barrier();
-
-	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		u64 val;
-
-		if (!test_bit(idx, cpuc->active_mask))
-			continue;
-		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
-			continue;
-		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-	}
-}
-
-void hw_perf_disable(void)
-{
-	if (!x86_pmu_initialized())
-		return;
-	return x86_pmu.disable_all();
-}
-
-static void p6_pmu_enable_all(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	unsigned long val;
-
-	if (cpuc->enabled)
-		return;
-
-	cpuc->enabled = 1;
-	barrier();
-
-	/* p6 only has one enable register */
-	rdmsrl(MSR_P6_EVNTSEL0, val);
-	val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void intel_pmu_enable_all(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	if (cpuc->enabled)
-		return;
-
-	cpuc->enabled = 1;
-	barrier();
-
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
-
-	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
-		struct perf_counter *counter =
-			cpuc->counters[X86_PMC_IDX_FIXED_BTS];
-
-		if (WARN_ON_ONCE(!counter))
-			return;
-
-		intel_pmu_enable_bts(counter->hw.config);
-	}
-}
-
-static void amd_pmu_enable_all(void)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	int idx;
-
-	if (cpuc->enabled)
-		return;
-
-	cpuc->enabled = 1;
-	barrier();
-
-	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		struct perf_counter *counter = cpuc->counters[idx];
-		u64 val;
-
-		if (!test_bit(idx, cpuc->active_mask))
-			continue;
-
-		val = counter->hw.config;
-		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-	}
-}
-
-void hw_perf_enable(void)
-{
-	if (!x86_pmu_initialized())
-		return;
-	x86_pmu.enable_all();
-}
-
-static inline u64 intel_pmu_get_status(void)
-{
-	u64 status;
-
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-	return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
-	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	(void)checking_wrmsrl(hwc->config_base + idx,
-			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
-
-static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	(void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
-}
-
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
-{
-	int idx = __idx - X86_PMC_IDX_FIXED;
-	u64 ctrl_val, mask;
-
-	mask = 0xfULL << (idx * 4);
-
-	rdmsrl(hwc->config_base, ctrl_val);
-	ctrl_val &= ~mask;
-	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static inline void
-p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	u64 val = P6_NOP_COUNTER;
-
-	if (cpuc->enabled)
-		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-	(void)checking_wrmsrl(hwc->config_base + idx, val);
-}
-
-static inline void
-intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
-		intel_pmu_disable_bts();
-		return;
-	}
-
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-		intel_pmu_disable_fixed(hwc, idx);
-		return;
-	}
-
-	x86_pmu_disable_counter(hwc, idx);
-}
-
-static inline void
-amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	x86_pmu_disable_counter(hwc, idx);
-}
-
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the counter disabled in hw:
- */
-static int
-x86_perf_counter_set_period(struct perf_counter *counter,
-			     struct hw_perf_counter *hwc, int idx)
-{
-	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
-	int err, ret = 0;
-
-	if (idx == X86_PMC_IDX_FIXED_BTS)
-		return 0;
-
-	/*
-	 * If we are way outside a reasoable range then just skip forward:
-	 */
-	if (unlikely(left <= -period)) {
-		left = period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-		ret = 1;
-	}
-
-	if (unlikely(left <= 0)) {
-		left += period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-		ret = 1;
-	}
-	/*
-	 * Quirk: certain CPUs dont like it if just 1 hw_event is left:
-	 */
-	if (unlikely(left < 2))
-		left = 2;
-
-	if (left > x86_pmu.max_period)
-		left = x86_pmu.max_period;
-
-	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
-
-	/*
-	 * The hw counter starts counting from this counter offset,
-	 * mark it to be able to extra future deltas:
-	 */
-	atomic64_set(&hwc->prev_count, (u64)-left);
-
-	err = checking_wrmsrl(hwc->counter_base + idx,
-			     (u64)(-left) & x86_pmu.counter_mask);
-
-	perf_counter_update_userpage(counter);
-
-	return ret;
-}
-
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
-{
-	int idx = __idx - X86_PMC_IDX_FIXED;
-	u64 ctrl_val, bits, mask;
-	int err;
-
-	/*
-	 * Enable IRQ generation (0x8),
-	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
-	 * if requested:
-	 */
-	bits = 0x8ULL;
-	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
-		bits |= 0x2;
-	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
-		bits |= 0x1;
-	bits <<= (idx * 4);
-	mask = 0xfULL << (idx * 4);
-
-	rdmsrl(hwc->config_base, ctrl_val);
-	ctrl_val &= ~mask;
-	ctrl_val |= bits;
-	err = checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	u64 val;
-
-	val = hwc->config;
-	if (cpuc->enabled)
-		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-	(void)checking_wrmsrl(hwc->config_base + idx, val);
-}
-
-
-static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
-		if (!__get_cpu_var(cpu_hw_counters).enabled)
-			return;
-
-		intel_pmu_enable_bts(hwc->config);
-		return;
-	}
-
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-		intel_pmu_enable_fixed(hwc, idx);
-		return;
-	}
-
-	x86_pmu_enable_counter(hwc, idx);
-}
-
-static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	if (cpuc->enabled)
-		x86_pmu_enable_counter(hwc, idx);
-}
-
-static int
-fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
-{
-	unsigned int hw_event;
-
-	hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
-	if (unlikely((hw_event ==
-		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
-		     (hwc->sample_period == 1)))
-		return X86_PMC_IDX_FIXED_BTS;
-
-	if (!x86_pmu.num_counters_fixed)
-		return -1;
-
-	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
-		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
-		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
-		return X86_PMC_IDX_FIXED_BUS_CYCLES;
-
-	return -1;
-}
-
-/*
- * Find a PMC slot for the freshly enabled / scheduled in counter:
- */
-static int x86_pmu_enable(struct perf_counter *counter)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	struct hw_perf_counter *hwc = &counter->hw;
-	int idx;
-
-	idx = fixed_mode_idx(counter, hwc);
-	if (idx == X86_PMC_IDX_FIXED_BTS) {
-		/* BTS is already occupied. */
-		if (test_and_set_bit(idx, cpuc->used_mask))
-			return -EAGAIN;
-
-		hwc->config_base	= 0;
-		hwc->counter_base	= 0;
-		hwc->idx		= idx;
-	} else if (idx >= 0) {
-		/*
-		 * Try to get the fixed counter, if that is already taken
-		 * then try to get a generic counter:
-		 */
-		if (test_and_set_bit(idx, cpuc->used_mask))
-			goto try_generic;
-
-		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-		/*
-		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
-		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
-		 */
-		hwc->counter_base =
-			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
-		hwc->idx = idx;
-	} else {
-		idx = hwc->idx;
-		/* Try to get the previous generic counter again */
-		if (test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
-			idx = find_first_zero_bit(cpuc->used_mask,
-						  x86_pmu.num_counters);
-			if (idx == x86_pmu.num_counters)
-				return -EAGAIN;
-
-			set_bit(idx, cpuc->used_mask);
-			hwc->idx = idx;
-		}
-		hwc->config_base  = x86_pmu.eventsel;
-		hwc->counter_base = x86_pmu.perfctr;
-	}
-
-	perf_counters_lapic_init();
-
-	x86_pmu.disable(hwc, idx);
-
-	cpuc->counters[idx] = counter;
-	set_bit(idx, cpuc->active_mask);
-
-	x86_perf_counter_set_period(counter, hwc, idx);
-	x86_pmu.enable(hwc, idx);
-
-	perf_counter_update_userpage(counter);
-
-	return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_counter *counter)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	struct hw_perf_counter *hwc = &counter->hw;
-
-	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
-				cpuc->counters[hwc->idx] != counter))
-		return;
-
-	x86_pmu.enable(hwc, hwc->idx);
-}
-
-void perf_counter_print_debug(void)
-{
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-	struct cpu_hw_counters *cpuc;
-	unsigned long flags;
-	int cpu, idx;
-
-	if (!x86_pmu.num_counters)
-		return;
-
-	local_irq_save(flags);
-
-	cpu = smp_processor_id();
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
-
-	if (x86_pmu.version >= 2) {
-		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-		pr_info("\n");
-		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
-		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
-		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-	}
-	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
-
-	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
-		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
-
-		prev_left = per_cpu(pmc_prev_left[idx], cpu);
-
-		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
-			cpu, idx, pmc_ctrl);
-		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
-			cpu, idx, pmc_count);
-		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
-			cpu, idx, prev_left);
-	}
-	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
-		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
-			cpu, idx, pmc_count);
-	}
-	local_irq_restore(flags);
-}
-
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
-{
-	struct debug_store *ds = cpuc->ds;
-	struct bts_record {
-		u64	from;
-		u64	to;
-		u64	flags;
-	};
-	struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
-	struct bts_record *at, *top;
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	struct perf_sample_data data;
-	struct pt_regs regs;
-
-	if (!counter)
-		return;
-
-	if (!ds)
-		return;
-
-	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-	top = (struct bts_record *)(unsigned long)ds->bts_index;
-
-	if (top <= at)
-		return;
-
-	ds->bts_index = ds->bts_buffer_base;
-
-
-	data.period	= counter->hw.last_period;
-	data.addr	= 0;
-	regs.ip		= 0;
-
-	/*
-	 * Prepare a generic sample, i.e. fill in the invariant fields.
-	 * We will overwrite the from and to address before we output
-	 * the sample.
-	 */
-	perf_prepare_sample(&header, &data, counter, &regs);
-
-	if (perf_output_begin(&handle, counter,
-			      header.size * (top - at), 1, 1))
-		return;
-
-	for (; at < top; at++) {
-		data.ip		= at->from;
-		data.addr	= at->to;
-
-		perf_output_sample(&handle, &header, &data, counter);
-	}
-
-	perf_output_end(&handle);
-
-	/* There's new data available. */
-	counter->hw.interrupts++;
-	counter->pending_kill = POLL_IN;
-}
-
-static void x86_pmu_disable(struct perf_counter *counter)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	struct hw_perf_counter *hwc = &counter->hw;
-	int idx = hwc->idx;
-
-	/*
-	 * Must be done before we disable, otherwise the nmi handler
-	 * could reenable again:
-	 */
-	clear_bit(idx, cpuc->active_mask);
-	x86_pmu.disable(hwc, idx);
-
-	/*
-	 * Make sure the cleared pointer becomes visible before we
-	 * (potentially) free the counter:
-	 */
-	barrier();
-
-	/*
-	 * Drain the remaining delta count out of a counter
-	 * that we are disabling:
-	 */
-	x86_perf_counter_update(counter, hwc, idx);
-
-	/* Drain the remaining BTS records. */
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
-		intel_pmu_drain_bts_buffer(cpuc);
-
-	cpuc->counters[idx] = NULL;
-	clear_bit(idx, cpuc->used_mask);
-
-	perf_counter_update_userpage(counter);
-}
-
-/*
- * Save and restart an expired counter. Called by NMI contexts,
- * so it has to be careful about preempting normal counter ops:
- */
-static int intel_pmu_save_and_restart(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	int idx = hwc->idx;
-	int ret;
-
-	x86_perf_counter_update(counter, hwc, idx);
-	ret = x86_perf_counter_set_period(counter, hwc, idx);
-
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		intel_pmu_enable_counter(hwc, idx);
-
-	return ret;
-}
-
-static void intel_pmu_reset(void)
-{
-	struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
-	unsigned long flags;
-	int idx;
-
-	if (!x86_pmu.num_counters)
-		return;
-
-	local_irq_save(flags);
-
-	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
-
-	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
-		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
-	}
-	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-	}
-	if (ds)
-		ds->bts_index = ds->bts_buffer_base;
-
-	local_irq_restore(flags);
-}
-
-static int p6_pmu_handle_irq(struct pt_regs *regs)
-{
-	struct perf_sample_data data;
-	struct cpu_hw_counters *cpuc;
-	struct perf_counter *counter;
-	struct hw_perf_counter *hwc;
-	int idx, handled = 0;
-	u64 val;
-
-	data.addr = 0;
-
-	cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		if (!test_bit(idx, cpuc->active_mask))
-			continue;
-
-		counter = cpuc->counters[idx];
-		hwc = &counter->hw;
-
-		val = x86_perf_counter_update(counter, hwc, idx);
-		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			continue;
-
-		/*
-		 * counter overflow
-		 */
-		handled		= 1;
-		data.period	= counter->hw.last_period;
-
-		if (!x86_perf_counter_set_period(counter, hwc, idx))
-			continue;
-
-		if (perf_counter_overflow(counter, 1, &data, regs))
-			p6_pmu_disable_counter(hwc, idx);
-	}
-
-	if (handled)
-		inc_irq_stat(apic_perf_irqs);
-
-	return handled;
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
-	struct perf_sample_data data;
-	struct cpu_hw_counters *cpuc;
-	int bit, loops;
-	u64 ack, status;
-
-	data.addr = 0;
-
-	cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	perf_disable();
-	intel_pmu_drain_bts_buffer(cpuc);
-	status = intel_pmu_get_status();
-	if (!status) {
-		perf_enable();
-		return 0;
-	}
-
-	loops = 0;
-again:
-	if (++loops > 100) {
-		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
-		perf_counter_print_debug();
-		intel_pmu_reset();
-		perf_enable();
-		return 1;
-	}
-
-	inc_irq_stat(apic_perf_irqs);
-	ack = status;
-	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-		struct perf_counter *counter = cpuc->counters[bit];
-
-		clear_bit(bit, (unsigned long *) &status);
-		if (!test_bit(bit, cpuc->active_mask))
-			continue;
-
-		if (!intel_pmu_save_and_restart(counter))
-			continue;
-
-		data.period = counter->hw.last_period;
-
-		if (perf_counter_overflow(counter, 1, &data, regs))
-			intel_pmu_disable_counter(&counter->hw, bit);
-	}
-
-	intel_pmu_ack_status(ack);
-
-	/*
-	 * Repeat if there is more work to be done:
-	 */
-	status = intel_pmu_get_status();
-	if (status)
-		goto again;
-
-	perf_enable();
-
-	return 1;
-}
-
-static int amd_pmu_handle_irq(struct pt_regs *regs)
-{
-	struct perf_sample_data data;
-	struct cpu_hw_counters *cpuc;
-	struct perf_counter *counter;
-	struct hw_perf_counter *hwc;
-	int idx, handled = 0;
-	u64 val;
-
-	data.addr = 0;
-
-	cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		if (!test_bit(idx, cpuc->active_mask))
-			continue;
-
-		counter = cpuc->counters[idx];
-		hwc = &counter->hw;
-
-		val = x86_perf_counter_update(counter, hwc, idx);
-		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			continue;
-
-		/*
-		 * counter overflow
-		 */
-		handled		= 1;
-		data.period	= counter->hw.last_period;
-
-		if (!x86_perf_counter_set_period(counter, hwc, idx))
-			continue;
-
-		if (perf_counter_overflow(counter, 1, &data, regs))
-			amd_pmu_disable_counter(hwc, idx);
-	}
-
-	if (handled)
-		inc_irq_stat(apic_perf_irqs);
-
-	return handled;
-}
-
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	ack_APIC_irq();
-	inc_irq_stat(apic_pending_irqs);
-	perf_counter_do_pending();
-	irq_exit();
-}
-
-void set_perf_counter_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
-void perf_counters_lapic_init(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (!x86_pmu.apic || !x86_pmu_initialized())
-		return;
-
-	/*
-	 * Always use NMI for PMU
-	 */
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
-}
-
-static int __kprobes
-perf_counter_nmi_handler(struct notifier_block *self,
-			 unsigned long cmd, void *__args)
-{
-	struct die_args *args = __args;
-	struct pt_regs *regs;
-
-	if (!atomic_read(&active_counters))
-		return NOTIFY_DONE;
-
-	switch (cmd) {
-	case DIE_NMI:
-	case DIE_NMI_IPI:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
-	regs = args->regs;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
-	/*
-	 * Can't rely on the handled return value to say it was our NMI, two
-	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
-	 *
-	 * If the first NMI handles both, the latter will be empty and daze
-	 * the CPU.
-	 */
-	x86_pmu.handle_irq(regs);
-
-	return NOTIFY_STOP;
-}
-
-static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-	.notifier_call		= perf_counter_nmi_handler,
-	.next			= NULL,
-	.priority		= 1
-};
-
-static struct x86_pmu p6_pmu = {
-	.name			= "p6",
-	.handle_irq		= p6_pmu_handle_irq,
-	.disable_all		= p6_pmu_disable_all,
-	.enable_all		= p6_pmu_enable_all,
-	.enable			= p6_pmu_enable_counter,
-	.disable		= p6_pmu_disable_counter,
-	.eventsel		= MSR_P6_EVNTSEL0,
-	.perfctr		= MSR_P6_PERFCTR0,
-	.event_map		= p6_pmu_event_map,
-	.raw_event		= p6_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
-	.apic			= 1,
-	.max_period		= (1ULL << 31) - 1,
-	.version		= 0,
-	.num_counters		= 2,
-	/*
-	 * Counters have 40 bits implemented. However they are designed such
-	 * that bits [32-39] are sign extensions of bit 31. As such the
-	 * effective width of a counter for P6-like PMU is 32 bits only.
-	 *
-	 * See IA-32 Intel Architecture Software developer manual Vol 3B
-	 */
-	.counter_bits		= 32,
-	.counter_mask		= (1ULL << 32) - 1,
-};
-
-static struct x86_pmu intel_pmu = {
-	.name			= "Intel",
-	.handle_irq		= intel_pmu_handle_irq,
-	.disable_all		= intel_pmu_disable_all,
-	.enable_all		= intel_pmu_enable_all,
-	.enable			= intel_pmu_enable_counter,
-	.disable		= intel_pmu_disable_counter,
-	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
-	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
-	.event_map		= intel_pmu_event_map,
-	.raw_event		= intel_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
-	.apic			= 1,
-	/*
-	 * Intel PMCs cannot be accessed sanely above 32 bit width,
-	 * so we install an artificial 1<<31 period regardless of
-	 * the generic counter period:
-	 */
-	.max_period		= (1ULL << 31) - 1,
-	.enable_bts		= intel_pmu_enable_bts,
-	.disable_bts		= intel_pmu_disable_bts,
-};
-
-static struct x86_pmu amd_pmu = {
-	.name			= "AMD",
-	.handle_irq		= amd_pmu_handle_irq,
-	.disable_all		= amd_pmu_disable_all,
-	.enable_all		= amd_pmu_enable_all,
-	.enable			= amd_pmu_enable_counter,
-	.disable		= amd_pmu_disable_counter,
-	.eventsel		= MSR_K7_EVNTSEL0,
-	.perfctr		= MSR_K7_PERFCTR0,
-	.event_map		= amd_pmu_event_map,
-	.raw_event		= amd_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_counters		= 4,
-	.counter_bits		= 48,
-	.counter_mask		= (1ULL << 48) - 1,
-	.apic			= 1,
-	/* use highest bit to detect overflow */
-	.max_period		= (1ULL << 47) - 1,
-};
-
-static int p6_pmu_init(void)
-{
-	switch (boot_cpu_data.x86_model) {
-	case 1:
-	case 3:  /* Pentium Pro */
-	case 5:
-	case 6:  /* Pentium II */
-	case 7:
-	case 8:
-	case 11: /* Pentium III */
-		break;
-	case 9:
-	case 13:
-		/* Pentium M */
-		break;
-	default:
-		pr_cont("unsupported p6 CPU model %d ",
-			boot_cpu_data.x86_model);
-		return -ENODEV;
-	}
-
-	x86_pmu = p6_pmu;
-
-	if (!cpu_has_apic) {
-		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
-		pr_info("no hardware sampling interrupt available.\n");
-		x86_pmu.apic = 0;
-	}
-
-	return 0;
-}
-
-static int intel_pmu_init(void)
-{
-	union cpuid10_edx edx;
-	union cpuid10_eax eax;
-	unsigned int unused;
-	unsigned int ebx;
-	int version;
-
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-		/* check for P6 processor family */
-	   if (boot_cpu_data.x86 == 6) {
-		return p6_pmu_init();
-	   } else {
-		return -ENODEV;
-	   }
-	}
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Branch Misses Retired hw_event or not.
-	 */
-	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
-	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-		return -ENODEV;
-
-	version = eax.split.version_id;
-	if (version < 2)
-		return -ENODEV;
-
-	x86_pmu				= intel_pmu;
-	x86_pmu.version			= version;
-	x86_pmu.num_counters		= eax.split.num_counters;
-	x86_pmu.counter_bits		= eax.split.bit_width;
-	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;
-
-	/*
-	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
-	 * assume at least 3 counters:
-	 */
-	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);
-
-	/*
-	 * Install the hw-cache-events table:
-	 */
-	switch (boot_cpu_data.x86_model) {
-	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
-	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
-	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
-	case 29: /* six-core 45 nm xeon "Dunnington" */
-		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-		       sizeof(hw_cache_event_ids));
-
-		pr_cont("Core2 events, ");
-		break;
-	default:
-	case 26:
-		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-		       sizeof(hw_cache_event_ids));
-
-		pr_cont("Nehalem/Corei7 events, ");
-		break;
-	case 28:
-		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-		       sizeof(hw_cache_event_ids));
-
-		pr_cont("Atom events, ");
-		break;
-	}
-	return 0;
-}
-
-static int amd_pmu_init(void)
-{
-	/* Performance-monitoring supported from K7 and later: */
-	if (boot_cpu_data.x86 < 6)
-		return -ENODEV;
-
-	x86_pmu = amd_pmu;
-
-	/* Events are common for all AMDs */
-	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
-	       sizeof(hw_cache_event_ids));
-
-	return 0;
-}
-
-void __init init_hw_perf_counters(void)
-{
-	int err;
-
-	pr_info("Performance Counters: ");
-
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_INTEL:
-		err = intel_pmu_init();
-		break;
-	case X86_VENDOR_AMD:
-		err = amd_pmu_init();
-		break;
-	default:
-		return;
-	}
-	if (err != 0) {
-		pr_cont("no PMU driver, software counters only.\n");
-		return;
-	}
-
-	pr_cont("%s PMU driver.\n", x86_pmu.name);
-
-	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
-		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
-		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
-	}
-	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
-	perf_max_counters = x86_pmu.num_counters;
-
-	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
-		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
-		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
-	}
-
-	perf_counter_mask |=
-		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
-	x86_pmu.intel_ctrl = perf_counter_mask;
-
-	perf_counters_lapic_init();
-	register_die_notifier(&perf_counter_nmi_notifier);
-
-	pr_info("... version:                 %d\n",     x86_pmu.version);
-	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
-	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
-	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
-	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
-	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
-	pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
-}
-
-static inline void x86_pmu_read(struct perf_counter *counter)
-{
-	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-}
-
-static const struct pmu pmu = {
-	.enable		= x86_pmu_enable,
-	.disable	= x86_pmu_disable,
-	.read		= x86_pmu_read,
-	.unthrottle	= x86_pmu_unthrottle,
-};
-
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-	int err;
-
-	err = __hw_perf_counter_init(counter);
-	if (err) {
-		if (counter->destroy)
-			counter->destroy(counter);
-		return ERR_PTR(err);
-	}
-
-	return &pmu;
-}
-
-/*
- * callchain support
- */
-
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-	if (entry->nr < PERF_MAX_STACK_DEPTH)
-		entry->ip[entry->nr++] = ip;
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
-static DEFINE_PER_CPU(int, in_nmi_frame);
-
-
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	/* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
-	/* Ignore warnings */
-}
-
-static int backtrace_stack(void *data, char *name)
-{
-	per_cpu(in_nmi_frame, smp_processor_id()) =
-			x86_is_stack_id(NMI_STACK, name);
-
-	return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
-	struct perf_callchain_entry *entry = data;
-
-	if (per_cpu(in_nmi_frame, smp_processor_id()))
-		return;
-
-	if (reliable)
-		callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
-	.warning		= backtrace_warning,
-	.warning_symbol		= backtrace_warning_symbol,
-	.stack			= backtrace_stack,
-	.address		= backtrace_address,
-};
-
-#include "../dumpstack.h"
-
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-	callchain_store(entry, PERF_CONTEXT_KERNEL);
-	callchain_store(entry, regs->ip);
-
-	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
-}
-
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-	unsigned long offset, addr = (unsigned long)from;
-	int type = in_nmi() ? KM_NMI : KM_IRQ0;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
-
-		map = kmap_atomic(page, type);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map, type);
-		put_page(page);
-
-		len  += size;
-		to   += size;
-		addr += size;
-
-	} while (len < n);
-
-	return len;
-}
-
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
-{
-	unsigned long bytes;
-
-	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
-
-	return bytes == sizeof(*frame);
-}
-
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-	struct stack_frame frame;
-	const void __user *fp;
-
-	if (!user_mode(regs))
-		regs = task_pt_regs(current);
-
-	fp = (void __user *)regs->bp;
-
-	callchain_store(entry, PERF_CONTEXT_USER);
-	callchain_store(entry, regs->ip);
-
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
-		frame.next_frame	     = NULL;
-		frame.return_address = 0;
-
-		if (!copy_stack_frame(fp, &frame))
-			break;
-
-		if ((unsigned long)fp < regs->sp)
-			break;
-
-		callchain_store(entry, frame.return_address);
-		fp = frame.next_frame;
-	}
-}
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-	int is_user;
-
-	if (!regs)
-		return;
-
-	is_user = user_mode(regs);
-
-	if (!current || current->pid == 0)
-		return;
-
-	if (is_user && current->state != TASK_RUNNING)
-		return;
-
-	if (!is_user)
-		perf_callchain_kernel(regs, entry);
-
-	if (current->mm)
-		perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-	struct perf_callchain_entry *entry;
-
-	if (in_nmi())
-		entry = &__get_cpu_var(pmc_nmi_entry);
-	else
-		entry = &__get_cpu_var(pmc_irq_entry);
-
-	entry->nr = 0;
-
-	perf_do_callchain(regs, entry);
-
-	return entry;
-}
-
-void hw_perf_counter_setup_online(int cpu)
-{
-	init_debug_store_on_cpu(cpu);
-}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
new file mode 100644
index 000000000000..0d03629fb1a5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -0,0 +1,2298 @@
+/*
+ * Performance events x86 architecture code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+static u64 perf_event_mask __read_mostly;
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS	4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE		24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR			(1 << 6)
+#define X86_DEBUGCTL_BTS		(1 << 7)
+#define X86_DEBUGCTL_BTINT		(1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+struct cpu_hw_events {
+	struct perf_event	*events[X86_PMC_IDX_MAX];
+	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		interrupts;
+	int			enabled;
+	struct debug_store	*ds;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+	const char	*name;
+	int		version;
+	int		(*handle_irq)(struct pt_regs *);
+	void		(*disable_all)(void);
+	void		(*enable_all)(void);
+	void		(*enable)(struct hw_perf_event *, int);
+	void		(*disable)(struct hw_perf_event *, int);
+	unsigned	eventsel;
+	unsigned	perfctr;
+	u64		(*event_map)(int);
+	u64		(*raw_event)(u64);
+	int		max_events;
+	int		num_events;
+	int		num_events_fixed;
+	int		event_bits;
+	u64		event_mask;
+	int		apic;
+	u64		max_period;
+	u64		intel_ctrl;
+	void		(*enable_bts)(u64 config);
+	void		(*disable_bts)(void);
+};
+
+static struct x86_pmu x86_pmu __read_mostly;
+
+static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+	.enabled = 1,
+};
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0079,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x012e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+	return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT			0x0000002EULL
+
+static u64 p6_pmu_raw_event(u64 hw_event)
+{
+#define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK		0x00040000ULL
+#define P6_EVNTSEL_INV_MASK		0x00800000ULL
+#define P6_EVNTSEL_REG_MASK		0xFF000000ULL
+
+#define P6_EVNTSEL_MASK			\
+	(P6_EVNTSEL_EVENT_MASK |	\
+	 P6_EVNTSEL_UNIT_MASK  |	\
+	 P6_EVNTSEL_EDGE_MASK  |	\
+	 P6_EVNTSEL_INV_MASK   |	\
+	 P6_EVNTSEL_REG_MASK)
+
+	return hw_event & P6_EVNTSEL_MASK;
+}
+
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+	return intel_perfmon_event_map[hw_event];
+}
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
+		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
+		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
+		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static const u64 atom_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 hw_event)
+{
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
+#define CORE_EVNTSEL_REG_MASK	0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK		\
+	(CORE_EVNTSEL_EVENT_MASK |	\
+	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_EDGE_MASK  |	\
+	 CORE_EVNTSEL_INV_MASK  |	\
+	 CORE_EVNTSEL_REG_MASK)
+
+	return hw_event & CORE_EVNTSEL_MASK;
+}
+
+static const u64 amd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+	return amd_perfmon_event_map[hw_event];
+}
+
+static u64 amd_pmu_raw_event(u64 hw_event)
+{
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
+#define K7_EVNTSEL_INV_MASK	0x000800000ULL
+#define K7_EVNTSEL_REG_MASK	0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK			\
+	(K7_EVNTSEL_EVENT_MASK |	\
+	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_EDGE_MASK  |	\
+	 K7_EVNTSEL_INV_MASK   |	\
+	 K7_EVNTSEL_REG_MASK)
+
+	return hw_event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_event_update(struct perf_event *event,
+			struct hw_perf_event *hwc, int idx)
+{
+	int shift = 64 - x86_pmu.event_bits;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
+
+	if (idx == X86_PMC_IDX_FIXED_BTS)
+		return 0;
+
+	/*
+	 * Careful: an NMI might modify the previous event value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic event atomically:
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	rdmsrl(hwc->event_base + idx, new_raw_count);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (event-)time and add that to the generic event.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count.
+	 */
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	atomic64_add(delta, &event->count);
+	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
+}
+
+static atomic_t active_events;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	int i;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		disable_lapic_nmi_watchdog();
+
+	for (i = 0; i < x86_pmu.num_events; i++) {
+		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+			goto perfctr_fail;
+	}
+
+	for (i = 0; i < x86_pmu.num_events; i++) {
+		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+			goto eventsel_fail;
+	}
+#endif
+
+	return true;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+eventsel_fail:
+	for (i--; i >= 0; i--)
+		release_evntsel_nmi(x86_pmu.eventsel + i);
+
+	i = x86_pmu.num_events;
+
+perfctr_fail:
+	for (i--; i >= 0; i--)
+		release_perfctr_nmi(x86_pmu.perfctr + i);
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+
+	return false;
+#endif
+}
+
+static void release_pmc_hardware(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	int i;
+
+	for (i = 0; i < x86_pmu.num_events; i++) {
+		release_perfctr_nmi(x86_pmu.perfctr + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+#endif
+}
+
+static inline bool bts_available(void)
+{
+	return x86_pmu.enable_bts != NULL;
+}
+
+static inline void init_debug_store_on_cpu(int cpu)
+{
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+	if (!ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+		     (u32)((u64)(unsigned long)ds),
+		     (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+	if (!per_cpu(cpu_hw_events, cpu).ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_bts_hardware(void)
+{
+	int cpu;
+
+	if (!bts_available())
+		return;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu)
+		fini_debug_store_on_cpu(cpu);
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+		if (!ds)
+			continue;
+
+		per_cpu(cpu_hw_events, cpu).ds = NULL;
+
+		kfree((void *)(unsigned long)ds->bts_buffer_base);
+		kfree(ds);
+	}
+
+	put_online_cpus();
+}
+
+static int reserve_bts_hardware(void)
+{
+	int cpu, err = 0;
+
+	if (!bts_available())
+		return 0;
+
+	get_online_cpus();
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds;
+		void *buffer;
+
+		err = -ENOMEM;
+		buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+		if (unlikely(!buffer))
+			break;
+
+		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+		if (unlikely(!ds)) {
+			kfree(buffer);
+			break;
+		}
+
+		ds->bts_buffer_base = (u64)(unsigned long)buffer;
+		ds->bts_index = ds->bts_buffer_base;
+		ds->bts_absolute_maximum =
+			ds->bts_buffer_base + BTS_BUFFER_SIZE;
+		ds->bts_interrupt_threshold =
+			ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+		per_cpu(cpu_hw_events, cpu).ds = ds;
+		err = 0;
+	}
+
+	if (err)
+		release_bts_hardware();
+	else {
+		for_each_online_cpu(cpu)
+			init_debug_store_on_cpu(cpu);
+	}
+
+	put_online_cpus();
+
+	return err;
+}
+
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
+		release_pmc_hardware();
+		release_bts_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
+static inline int x86_pmu_initialized(void)
+{
+	return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
+{
+	unsigned int cache_type, cache_op, cache_result;
+	u64 config, val;
+
+	config = attr->config;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+	if (val == 0)
+		return -ENOENT;
+
+	if (val == -1)
+		return -EINVAL;
+
+	hwc->config |= val;
+
+	return 0;
+}
+
+static void intel_pmu_enable_bts(u64 config)
+{
+	unsigned long debugctlmsr;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr |= X86_DEBUGCTL_TR;
+	debugctlmsr |= X86_DEBUGCTL_BTS;
+	debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long debugctlmsr;
+
+	if (!cpuc->ds)
+		return;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr &=
+		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+	int err;
+
+	if (!x86_pmu_initialized())
+		return -ENODEV;
+
+	err = 0;
+	if (!atomic_inc_not_zero(&active_events)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&active_events) == 0) {
+			if (!reserve_pmc_hardware())
+				err = -EBUSY;
+			else
+				err = reserve_bts_hardware();
+		}
+		if (!err)
+			atomic_inc(&active_events);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	if (err)
+		return err;
+
+	event->destroy = hw_perf_event_destroy;
+
+	/*
+	 * Generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * Count user and OS events unless requested not to.
+	 */
+	if (!attr->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!attr->exclude_kernel)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	} else {
+		/*
+		 * If we have a PMU initialized but no APIC
+		 * interrupts, we cannot sample hardware
+		 * events (user-space has to fall back and
+		 * sample via a hrtimer based software event):
+		 */
+		if (!x86_pmu.apic)
+			return -EOPNOTSUPP;
+	}
+
+	/*
+	 * Raw hw_event type provide the config in the hw_event structure
+	 */
+	if (attr->type == PERF_TYPE_RAW) {
+		hwc->config |= x86_pmu.raw_event(attr->config);
+		return 0;
+	}
+
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+
+	/*
+	 * The generic map:
+	 */
+	config = x86_pmu.event_map(attr->config);
+
+	if (config == 0)
+		return -ENOENT;
+
+	if (config == -1LL)
+		return -EINVAL;
+
+	/*
+	 * Branch tracing:
+	 */
+	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+	    (hwc->sample_period == 1)) {
+		/* BTS is not supported by this architecture. */
+		if (!bts_available())
+			return -EOPNOTSUPP;
+
+		/* BTS is currently only allowed for user-mode. */
+		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+			return -EOPNOTSUPP;
+	}
+
+	hwc->config |= config;
+
+	return 0;
+}
+
+static void p6_pmu_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val;
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+	barrier();
+
+	/* p6 only has one enable register */
+	rdmsrl(MSR_P6_EVNTSEL0, val);
+	val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void intel_pmu_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+	barrier();
+
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+		intel_pmu_disable_bts();
+}
+
+static void amd_pmu_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+	/*
+	 * ensure we write the disable before we start disabling the
+	 * events proper, so that amd_pmu_enable_event() does the
+	 * right thing.
+	 */
+	barrier();
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		u64 val;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+			continue;
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+	}
+}
+
+void hw_perf_disable(void)
+{
+	if (!x86_pmu_initialized())
+		return;
+	return x86_pmu.disable_all();
+}
+
+static void p6_pmu_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long val;
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
+	/* p6 only has one enable register */
+	rdmsrl(MSR_P6_EVNTSEL0, val);
+	val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void intel_pmu_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+		struct perf_event *event =
+			cpuc->events[X86_PMC_IDX_FIXED_BTS];
+
+		if (WARN_ON_ONCE(!event))
+			return;
+
+		intel_pmu_enable_bts(event->hw.config);
+	}
+}
+
+static void amd_pmu_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		u64 val;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		val = event->hw.config;
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+	}
+}
+
+void hw_perf_enable(void)
+{
+	if (!x86_pmu_initialized())
+		return;
+	x86_pmu.enable_all();
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+	(void)checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+	(void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val = P6_NOP_EVENT;
+
+	if (cpuc->enabled)
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	(void)checking_wrmsrl(hwc->config_base + idx, val);
+}
+
+static inline void
+intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+		intel_pmu_disable_bts();
+		return;
+	}
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_disable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_disable_event(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+	x86_pmu_disable_event(hwc, idx);
+}
+
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+static int
+x86_perf_event_set_period(struct perf_event *event,
+			     struct hw_perf_event *hwc, int idx)
+{
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int err, ret = 0;
+
+	if (idx == X86_PMC_IDX_FIXED_BTS)
+		return 0;
+
+	/*
+	 * If we are way outside a reasoable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+	/*
+	 * Quirk: certain CPUs dont like it if just 1 hw_event is left:
+	 */
+	if (unlikely(left < 2))
+		left = 2;
+
+	if (left > x86_pmu.max_period)
+		left = x86_pmu.max_period;
+
+	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+
+	/*
+	 * The hw event starts counting from this event offset,
+	 * mark it to be able to extra future deltas:
+	 */
+	atomic64_set(&hwc->prev_count, (u64)-left);
+
+	err = checking_wrmsrl(hwc->event_base + idx,
+			     (u64)(-left) & x86_pmu.event_mask);
+
+	perf_event_update_userpage(event);
+
+	return ret;
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+	int err;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val;
+
+	val = hwc->config;
+	if (cpuc->enabled)
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	(void)checking_wrmsrl(hwc->config_base + idx, val);
+}
+
+
+static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+		if (!__get_cpu_var(cpu_hw_events).enabled)
+			return;
+
+		intel_pmu_enable_bts(hwc->config);
+		return;
+	}
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_enable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_enable_event(hwc, idx);
+}
+
+static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->enabled)
+		x86_pmu_enable_event(hwc, idx);
+}
+
+static int
+fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
+{
+	unsigned int hw_event;
+
+	hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+	if (unlikely((hw_event ==
+		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+		     (hwc->sample_period == 1)))
+		return X86_PMC_IDX_FIXED_BTS;
+
+	if (!x86_pmu.num_events_fixed)
+		return -1;
+
+	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+		return X86_PMC_IDX_FIXED_CPU_CYCLES;
+	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+		return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
+	return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in event:
+ */
+static int x86_pmu_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	idx = fixed_mode_idx(event, hwc);
+	if (idx == X86_PMC_IDX_FIXED_BTS) {
+		/* BTS is already occupied. */
+		if (test_and_set_bit(idx, cpuc->used_mask))
+			return -EAGAIN;
+
+		hwc->config_base	= 0;
+		hwc->event_base	= 0;
+		hwc->idx		= idx;
+	} else if (idx >= 0) {
+		/*
+		 * Try to get the fixed event, if that is already taken
+		 * then try to get a generic event:
+		 */
+		if (test_and_set_bit(idx, cpuc->used_mask))
+			goto try_generic;
+
+		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+		/*
+		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
+		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+		 */
+		hwc->event_base =
+			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+		hwc->idx = idx;
+	} else {
+		idx = hwc->idx;
+		/* Try to get the previous generic event again */
+		if (test_and_set_bit(idx, cpuc->used_mask)) {
+try_generic:
+			idx = find_first_zero_bit(cpuc->used_mask,
+						  x86_pmu.num_events);
+			if (idx == x86_pmu.num_events)
+				return -EAGAIN;
+
+			set_bit(idx, cpuc->used_mask);
+			hwc->idx = idx;
+		}
+		hwc->config_base  = x86_pmu.eventsel;
+		hwc->event_base = x86_pmu.perfctr;
+	}
+
+	perf_events_lapic_init();
+
+	x86_pmu.disable(hwc, idx);
+
+	cpuc->events[idx] = event;
+	set_bit(idx, cpuc->active_mask);
+
+	x86_perf_event_set_period(event, hwc, idx);
+	x86_pmu.enable(hwc, idx);
+
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+static void x86_pmu_unthrottle(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+				cpuc->events[hwc->idx] != event))
+		return;
+
+	x86_pmu.enable(hwc, hwc->idx);
+}
+
+void perf_event_print_debug(void)
+{
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+	struct cpu_hw_events *cpuc;
+	unsigned long flags;
+	int cpu, idx;
+
+	if (!x86_pmu.num_events)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_events, cpu);
+
+	if (x86_pmu.version >= 2) {
+		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+		pr_info("\n");
+		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+	}
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
+
+		prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+			cpu, idx, pmc_ctrl);
+		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
+			cpu, idx, prev_left);
+	}
+	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+	}
+	local_irq_restore(flags);
+}
+
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
+{
+	struct debug_store *ds = cpuc->ds;
+	struct bts_record {
+		u64	from;
+		u64	to;
+		u64	flags;
+	};
+	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+	struct bts_record *at, *top;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+
+	if (!event)
+		return;
+
+	if (!ds)
+		return;
+
+	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+	top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+	if (top <= at)
+		return;
+
+	ds->bts_index = ds->bts_buffer_base;
+
+
+	data.period	= event->hw.last_period;
+	data.addr	= 0;
+	regs.ip		= 0;
+
+	/*
+	 * Prepare a generic sample, i.e. fill in the invariant fields.
+	 * We will overwrite the from and to address before we output
+	 * the sample.
+	 */
+	perf_prepare_sample(&header, &data, event, &regs);
+
+	if (perf_output_begin(&handle, event,
+			      header.size * (top - at), 1, 1))
+		return;
+
+	for (; at < top; at++) {
+		data.ip		= at->from;
+		data.addr	= at->to;
+
+		perf_output_sample(&handle, &header, &data, event);
+	}
+
+	perf_output_end(&handle);
+
+	/* There's new data available. */
+	event->hw.interrupts++;
+	event->pending_kill = POLL_IN;
+}
+
+static void x86_pmu_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	/*
+	 * Must be done before we disable, otherwise the nmi handler
+	 * could reenable again:
+	 */
+	clear_bit(idx, cpuc->active_mask);
+	x86_pmu.disable(hwc, idx);
+
+	/*
+	 * Make sure the cleared pointer becomes visible before we
+	 * (potentially) free the event:
+	 */
+	barrier();
+
+	/*
+	 * Drain the remaining delta count out of a event
+	 * that we are disabling:
+	 */
+	x86_perf_event_update(event, hwc, idx);
+
+	/* Drain the remaining BTS records. */
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+		intel_pmu_drain_bts_buffer(cpuc);
+
+	cpuc->events[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+
+	perf_event_update_userpage(event);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+	int ret;
+
+	x86_perf_event_update(event, hwc, idx);
+	ret = x86_perf_event_set_period(event, hwc, idx);
+
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		intel_pmu_enable_event(hwc, idx);
+
+	return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+	struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
+	unsigned long flags;
+	int idx;
+
+	if (!x86_pmu.num_events)
+		return;
+
+	local_irq_save(flags);
+
+	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+	}
+	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+	}
+	if (ds)
+		ds->bts_index = ds->bts_buffer_base;
+
+	local_irq_restore(flags);
+}
+
+static int p6_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	int idx, handled = 0;
+	u64 val;
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		event = cpuc->events[idx];
+		hwc = &event->hw;
+
+		val = x86_perf_event_update(event, hwc, idx);
+		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+			continue;
+
+		/*
+		 * event overflow
+		 */
+		handled		= 1;
+		data.period	= event->hw.last_period;
+
+		if (!x86_perf_event_set_period(event, hwc, idx))
+			continue;
+
+		if (perf_event_overflow(event, 1, &data, regs))
+			p6_pmu_disable_event(hwc, idx);
+	}
+
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
+	return handled;
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	int bit, loops;
+	u64 ack, status;
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+
+	perf_disable();
+	intel_pmu_drain_bts_buffer(cpuc);
+	status = intel_pmu_get_status();
+	if (!status) {
+		perf_enable();
+		return 0;
+	}
+
+	loops = 0;
+again:
+	if (++loops > 100) {
+		WARN_ONCE(1, "perfevents: irq loop stuck!\n");
+		perf_event_print_debug();
+		intel_pmu_reset();
+		perf_enable();
+		return 1;
+	}
+
+	inc_irq_stat(apic_perf_irqs);
+	ack = status;
+	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = cpuc->events[bit];
+
+		clear_bit(bit, (unsigned long *) &status);
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		if (!intel_pmu_save_and_restart(event))
+			continue;
+
+		data.period = event->hw.last_period;
+
+		if (perf_event_overflow(event, 1, &data, regs))
+			intel_pmu_disable_event(&event->hw, bit);
+	}
+
+	intel_pmu_ack_status(ack);
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = intel_pmu_get_status();
+	if (status)
+		goto again;
+
+	perf_enable();
+
+	return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	int idx, handled = 0;
+	u64 val;
+
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		event = cpuc->events[idx];
+		hwc = &event->hw;
+
+		val = x86_perf_event_update(event, hwc, idx);
+		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+			continue;
+
+		/*
+		 * event overflow
+		 */
+		handled		= 1;
+		data.period	= event->hw.last_period;
+
+		if (!x86_perf_event_set_period(event, hwc, idx))
+			continue;
+
+		if (perf_event_overflow(event, 1, &data, regs))
+			amd_pmu_disable_event(hwc, idx);
+	}
+
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
+	return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_pending_irqs);
+	perf_event_do_pending();
+	irq_exit();
+}
+
+void set_perf_event_pending(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+#endif
+}
+
+void perf_events_lapic_init(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (!x86_pmu.apic || !x86_pmu_initialized())
+		return;
+
+	/*
+	 * Always use NMI for PMU
+	 */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
+}
+
+static int __kprobes
+perf_event_nmi_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+
+	if (!atomic_read(&active_events))
+		return NOTIFY_DONE;
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	regs = args->regs;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
+	/*
+	 * Can't rely on the handled return value to say it was our NMI, two
+	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
+	 *
+	 * If the first NMI handles both, the latter will be empty and daze
+	 * the CPU.
+	 */
+	x86_pmu.handle_irq(regs);
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_event_nmi_notifier = {
+	.notifier_call		= perf_event_nmi_handler,
+	.next			= NULL,
+	.priority		= 1
+};
+
+static struct x86_pmu p6_pmu = {
+	.name			= "p6",
+	.handle_irq		= p6_pmu_handle_irq,
+	.disable_all		= p6_pmu_disable_all,
+	.enable_all		= p6_pmu_enable_all,
+	.enable			= p6_pmu_enable_event,
+	.disable		= p6_pmu_disable_event,
+	.eventsel		= MSR_P6_EVNTSEL0,
+	.perfctr		= MSR_P6_PERFCTR0,
+	.event_map		= p6_pmu_event_map,
+	.raw_event		= p6_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
+	.apic			= 1,
+	.max_period		= (1ULL << 31) - 1,
+	.version		= 0,
+	.num_events		= 2,
+	/*
+	 * Events have 40 bits implemented. However they are designed such
+	 * that bits [32-39] are sign extensions of bit 31. As such the
+	 * effective width of a event for P6-like PMU is 32 bits only.
+	 *
+	 * See IA-32 Intel Architecture Software developer manual Vol 3B
+	 */
+	.event_bits		= 32,
+	.event_mask		= (1ULL << 32) - 1,
+};
+
+static struct x86_pmu intel_pmu = {
+	.name			= "Intel",
+	.handle_irq		= intel_pmu_handle_irq,
+	.disable_all		= intel_pmu_disable_all,
+	.enable_all		= intel_pmu_enable_all,
+	.enable			= intel_pmu_enable_event,
+	.disable		= intel_pmu_disable_event,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= intel_pmu_event_map,
+	.raw_event		= intel_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	.apic			= 1,
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic event period:
+	 */
+	.max_period		= (1ULL << 31) - 1,
+	.enable_bts		= intel_pmu_enable_bts,
+	.disable_bts		= intel_pmu_disable_bts,
+};
+
+static struct x86_pmu amd_pmu = {
+	.name			= "AMD",
+	.handle_irq		= amd_pmu_handle_irq,
+	.disable_all		= amd_pmu_disable_all,
+	.enable_all		= amd_pmu_enable_all,
+	.enable			= amd_pmu_enable_event,
+	.disable		= amd_pmu_disable_event,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.event_map		= amd_pmu_event_map,
+	.raw_event		= amd_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_events		= 4,
+	.event_bits		= 48,
+	.event_mask		= (1ULL << 48) - 1,
+	.apic			= 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
+};
+
+static int p6_pmu_init(void)
+{
+	switch (boot_cpu_data.x86_model) {
+	case 1:
+	case 3:  /* Pentium Pro */
+	case 5:
+	case 6:  /* Pentium II */
+	case 7:
+	case 8:
+	case 11: /* Pentium III */
+		break;
+	case 9:
+	case 13:
+		/* Pentium M */
+		break;
+	default:
+		pr_cont("unsupported p6 CPU model %d ",
+			boot_cpu_data.x86_model);
+		return -ENODEV;
+	}
+
+	x86_pmu = p6_pmu;
+
+	if (!cpu_has_apic) {
+		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+		pr_info("no hardware sampling interrupt available.\n");
+		x86_pmu.apic = 0;
+	}
+
+	return 0;
+}
+
+static int intel_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int ebx;
+	int version;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+		/* check for P6 processor family */
+	   if (boot_cpu_data.x86 == 6) {
+		return p6_pmu_init();
+	   } else {
+		return -ENODEV;
+	   }
+	}
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Branch Misses Retired hw_event or not.
+	 */
+	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version < 2)
+		return -ENODEV;
+
+	x86_pmu				= intel_pmu;
+	x86_pmu.version			= version;
+	x86_pmu.num_events		= eax.split.num_events;
+	x86_pmu.event_bits		= eax.split.bit_width;
+	x86_pmu.event_mask		= (1ULL << eax.split.bit_width) - 1;
+
+	/*
+	 * Quirk: v2 perfmon does not report fixed-purpose events, so
+	 * assume at least 3 events:
+	 */
+	x86_pmu.num_events_fixed	= max((int)edx.split.num_events_fixed, 3);
+
+	/*
+	 * Install the hw-cache-events table:
+	 */
+	switch (boot_cpu_data.x86_model) {
+	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+	case 29: /* six-core 45 nm xeon "Dunnington" */
+		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Core2 events, ");
+		break;
+	default:
+	case 26:
+		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Nehalem/Corei7 events, ");
+		break;
+	case 28:
+		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Atom events, ");
+		break;
+	}
+	return 0;
+}
+
+static int amd_pmu_init(void)
+{
+	/* Performance-monitoring supported from K7 and later: */
+	if (boot_cpu_data.x86 < 6)
+		return -ENODEV;
+
+	x86_pmu = amd_pmu;
+
+	/* Events are common for all AMDs */
+	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
+
+	return 0;
+}
+
+void __init init_hw_perf_events(void)
+{
+	int err;
+
+	pr_info("Performance Events: ");
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		err = intel_pmu_init();
+		break;
+	case X86_VENDOR_AMD:
+		err = amd_pmu_init();
+		break;
+	default:
+		return;
+	}
+	if (err != 0) {
+		pr_cont("no PMU driver, software events only.\n");
+		return;
+	}
+
+	pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
+		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+	}
+	perf_event_mask = (1 << x86_pmu.num_events) - 1;
+	perf_max_events = x86_pmu.num_events;
+
+	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
+		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+	}
+
+	perf_event_mask |=
+		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
+	x86_pmu.intel_ctrl = perf_event_mask;
+
+	perf_events_lapic_init();
+	register_die_notifier(&perf_event_nmi_notifier);
+
+	pr_info("... version:                 %d\n",     x86_pmu.version);
+	pr_info("... bit width:               %d\n",     x86_pmu.event_bits);
+	pr_info("... generic events:        %d\n",     x86_pmu.num_events);
+	pr_info("... value mask:              %016Lx\n", x86_pmu.event_mask);
+	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose events:  %d\n",     x86_pmu.num_events_fixed);
+	pr_info("... event mask:            %016Lx\n", perf_event_mask);
+}
+
+static inline void x86_pmu_read(struct perf_event *event)
+{
+	x86_perf_event_update(event, &event->hw, event->hw.idx);
+}
+
+static const struct pmu pmu = {
+	.enable		= x86_pmu_enable,
+	.disable	= x86_pmu_disable,
+	.read		= x86_pmu_read,
+	.unthrottle	= x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+	int err;
+
+	err = __hw_perf_event_init(event);
+	if (err) {
+		if (event->destroy)
+			event->destroy(event);
+		return ERR_PTR(err);
+	}
+
+	return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+	if (entry->nr < PERF_MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+static DEFINE_PER_CPU(int, in_nmi_frame);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	per_cpu(in_nmi_frame, smp_processor_id()) =
+			x86_is_stack_id(NMI_STACK, name);
+
+	return 0;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct perf_callchain_entry *entry = data;
+
+	if (per_cpu(in_nmi_frame, smp_processor_id()))
+		return;
+
+	if (reliable)
+		callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+#include "../dumpstack.h"
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	callchain_store(entry, PERF_CONTEXT_KERNEL);
+	callchain_store(entry, regs->ip);
+
+	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+}
+
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	int type = in_nmi() ? KM_NMI : KM_IRQ0;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
+	int ret;
+
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
+
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		map = kmap_atomic(page, type);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map, type);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+	} while (len < n);
+
+	return len;
+}
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	unsigned long bytes;
+
+	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
+
+	return bytes == sizeof(*frame);
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	struct stack_frame frame;
+	const void __user *fp;
+
+	if (!user_mode(regs))
+		regs = task_pt_regs(current);
+
+	fp = (void __user *)regs->bp;
+
+	callchain_store(entry, PERF_CONTEXT_USER);
+	callchain_store(entry, regs->ip);
+
+	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+		frame.next_frame	     = NULL;
+		frame.return_address = 0;
+
+		if (!copy_stack_frame(fp, &frame))
+			break;
+
+		if ((unsigned long)fp < regs->sp)
+			break;
+
+		callchain_store(entry, frame.return_address);
+		fp = frame.next_frame;
+	}
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+
+	if (current->mm)
+		perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry;
+
+	if (in_nmi())
+		entry = &__get_cpu_var(pmc_nmi_entry);
+	else
+		entry = &__get_cpu_var(pmc_irq_entry);
+
+	entry->nr = 0;
+
+	perf_do_callchain(regs, entry);
+
+	return entry;
+}
+
+void hw_perf_event_setup_online(int cpu)
+{
+	init_debug_store_on_cpu(cpu);
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 392bea43b890..fab786f60ed6 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
 #include <linux/kprobes.h>
 
 #include <asm/apic.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
 
 struct nmi_watchdog_ctlblk {
 	unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d59fe323807e..681c3fda7391 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1021,7 +1021,7 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
 apicinterrupt LOCAL_PENDING_VECTOR \
 	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 300883112e3d..40f30773fb29 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -208,7 +208,7 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
 	/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_COUNTERS
+# ifdef CONFIG_PERF_EVENTS
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321ddafda..0157cd26d7cc 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,4 +335,4 @@ ENTRY(sys_call_table)
 	.long sys_preadv
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
-	.long sys_perf_counter_open
+	.long sys_perf_event_open
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 775a020990a5..82728f2c6d55 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,7 +10,7 @@
 #include <linux/bootmem.h>		/* max_low_pfn			*/
 #include <linux/kprobes.h>		/* __kprobes, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
-#include <linux/perf_counter.h>		/* perf_swcounter_event		*/
+#include <linux/perf_event.h>		/* perf_sw_event		*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -1017,7 +1017,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1114,11 +1114,11 @@ good_area:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 4899215999de..8eb05878554c 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void)
 	if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
 		current_cpu_data.x86_model == 15) {
 		eax.split.version_id = 2;
-		eax.split.num_counters = 2;
+		eax.split.num_events = 2;
 		eax.split.bit_width = 40;
 	}
 
-	num_counters = eax.split.num_counters;
+	num_counters = eax.split.num_events;
 
 	op_arch_perfmon_spec.num_counters = num_counters;
 	op_arch_perfmon_spec.num_controls = num_counters;
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index b83776180c7f..7b8e75d16081 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -13,7 +13,7 @@
 #define OP_X86_MODEL_H
 
 #include <asm/types.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
 
 struct op_msr {
 	unsigned long	addr;
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 50eecfe1d724..44203ff599da 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -26,7 +26,7 @@
 #include <linux/proc_fs.h>
 #include <linux/nmi.h>
 #include <linux/quotaops.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -252,7 +252,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
 	struct pt_regs *regs = get_irq_regs();
 	if (regs)
 		show_regs(regs);
-	perf_counter_print_debug();
+	perf_event_print_debug();
 }
 static struct sysrq_key_op sysrq_showregs_op = {
 	.handler	= sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c
index 172ceb6edde4..434dba778ccc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
@@ -923,7 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	task_lock(tsk);
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
-	perf_counter_comm(tsk);
+	perf_event_comm(tsk);
 }
 
 int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +997,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 * security domain:
 	 */
 	if (!get_dumpable(current->mm))
-		perf_counter_exit_task(current);
+		perf_event_exit_task(current);
 
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index 1125e5a1ee5d..d76b66acea95 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -620,8 +620,8 @@ __SYSCALL(__NR_move_pages, sys_move_pages)
 
 #define __NR_rt_tgsigqueueinfo 240
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_counter_open 241
-__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
+#define __NR_perf_event_open 241
+__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 
 #undef __NR_syscalls
 #define __NR_syscalls 242
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9e7f2e8fc66e..21a6f5d9af22 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -106,13 +106,13 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
-#ifdef CONFIG_PERF_COUNTERS
-# define INIT_PERF_COUNTERS(tsk)					\
-	.perf_counter_mutex = 						\
-		 __MUTEX_INITIALIZER(tsk.perf_counter_mutex),		\
-	.perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list),
+#ifdef CONFIG_PERF_EVENTS
+# define INIT_PERF_EVENTS(tsk)					\
+	.perf_event_mutex = 						\
+		 __MUTEX_INITIALIZER(tsk.perf_event_mutex),		\
+	.perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
 #else
-# define INIT_PERF_COUNTERS(tsk)
+# define INIT_PERF_EVENTS(tsk)
 #endif
 
 /*
@@ -178,7 +178,7 @@ extern struct cred init_cred;
 	},								\
 	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_IDS							\
-	INIT_PERF_COUNTERS(tsk)						\
+	INIT_PERF_EVENTS(tsk)						\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
deleted file mode 100644
index f64862732673..000000000000
--- a/include/linux/perf_counter.h
+++ /dev/null
@@ -1,858 +0,0 @@
-/*
- *  Performance counters:
- *
- *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
- *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
- *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
- *
- *  Data type definitions, declarations, prototypes.
- *
- *    Started by: Thomas Gleixner and Ingo Molnar
- *
- *  For licencing details see kernel-base/COPYING
- */
-#ifndef _LINUX_PERF_COUNTER_H
-#define _LINUX_PERF_COUNTER_H
-
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include <asm/byteorder.h>
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * attr.type
- */
-enum perf_type_id {
-	PERF_TYPE_HARDWARE			= 0,
-	PERF_TYPE_SOFTWARE			= 1,
-	PERF_TYPE_TRACEPOINT			= 2,
-	PERF_TYPE_HW_CACHE			= 3,
-	PERF_TYPE_RAW				= 4,
-
-	PERF_TYPE_MAX,				/* non-ABI */
-};
-
-/*
- * Generalized performance counter event types, used by the
- * attr.event_id parameter of the sys_perf_counter_open()
- * syscall:
- */
-enum perf_hw_id {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_HW_CPU_CYCLES		= 0,
-	PERF_COUNT_HW_INSTRUCTIONS		= 1,
-	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
-	PERF_COUNT_HW_CACHE_MISSES		= 3,
-	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
-	PERF_COUNT_HW_BRANCH_MISSES		= 5,
-	PERF_COUNT_HW_BUS_CYCLES		= 6,
-
-	PERF_COUNT_HW_MAX,			/* non-ABI */
-};
-
-/*
- * Generalized hardware cache counters:
- *
- *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
- *       { read, write, prefetch } x
- *       { accesses, misses }
- */
-enum perf_hw_cache_id {
-	PERF_COUNT_HW_CACHE_L1D			= 0,
-	PERF_COUNT_HW_CACHE_L1I			= 1,
-	PERF_COUNT_HW_CACHE_LL			= 2,
-	PERF_COUNT_HW_CACHE_DTLB		= 3,
-	PERF_COUNT_HW_CACHE_ITLB		= 4,
-	PERF_COUNT_HW_CACHE_BPU			= 5,
-
-	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
-};
-
-enum perf_hw_cache_op_id {
-	PERF_COUNT_HW_CACHE_OP_READ		= 0,
-	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
-	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
-
-	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
-};
-
-enum perf_hw_cache_op_result_id {
-	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
-	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
-
-	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
-};
-
-/*
- * Special "software" counters provided by the kernel, even if the hardware
- * does not support performance counters. These counters measure various
- * physical and sw events of the kernel (and allow the profiling of them as
- * well):
- */
-enum perf_sw_ids {
-	PERF_COUNT_SW_CPU_CLOCK			= 0,
-	PERF_COUNT_SW_TASK_CLOCK		= 1,
-	PERF_COUNT_SW_PAGE_FAULTS		= 2,
-	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
-	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
-	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
-	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
-
-	PERF_COUNT_SW_MAX,			/* non-ABI */
-};
-
-/*
- * Bits that can be set in attr.sample_type to request information
- * in the overflow packets.
- */
-enum perf_counter_sample_format {
-	PERF_SAMPLE_IP				= 1U << 0,
-	PERF_SAMPLE_TID				= 1U << 1,
-	PERF_SAMPLE_TIME			= 1U << 2,
-	PERF_SAMPLE_ADDR			= 1U << 3,
-	PERF_SAMPLE_READ			= 1U << 4,
-	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
-	PERF_SAMPLE_ID				= 1U << 6,
-	PERF_SAMPLE_CPU				= 1U << 7,
-	PERF_SAMPLE_PERIOD			= 1U << 8,
-	PERF_SAMPLE_STREAM_ID			= 1U << 9,
-	PERF_SAMPLE_RAW				= 1U << 10,
-
-	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
-};
-
-/*
- * The format of the data returned by read() on a perf counter fd,
- * as specified by attr.read_format:
- *
- * struct read_format {
- * 	{ u64		value;
- * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
- * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
- * 	  { u64		id;           } && PERF_FORMAT_ID
- * 	} && !PERF_FORMAT_GROUP
- *
- * 	{ u64		nr;
- * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
- * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
- * 	  { u64		value;
- * 	    { u64	id;           } && PERF_FORMAT_ID
- * 	  }		cntr[nr];
- * 	} && PERF_FORMAT_GROUP
- * };
- */
-enum perf_counter_read_format {
-	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
-	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
-	PERF_FORMAT_ID				= 1U << 2,
-	PERF_FORMAT_GROUP			= 1U << 3,
-
-	PERF_FORMAT_MAX = 1U << 4, 		/* non-ABI */
-};
-
-#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_attr {
-
-	/*
-	 * Major type: hardware/software/tracepoint/etc.
-	 */
-	__u32			type;
-
-	/*
-	 * Size of the attr structure, for fwd/bwd compat.
-	 */
-	__u32			size;
-
-	/*
-	 * Type specific configuration information.
-	 */
-	__u64			config;
-
-	union {
-		__u64		sample_period;
-		__u64		sample_freq;
-	};
-
-	__u64			sample_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-				mmap           :  1, /* include mmap data     */
-				comm	       :  1, /* include comm data     */
-				freq           :  1, /* use freq, not period  */
-				inherit_stat   :  1, /* per task counts       */
-				enable_on_exec :  1, /* next exec enables     */
-				task           :  1, /* trace fork/exit       */
-				watermark      :  1, /* wakeup_watermark      */
-
-				__reserved_1   : 49;
-
-	union {
-		__u32		wakeup_events;	  /* wakeup every n events */
-		__u32		wakeup_watermark; /* bytes before wakeup   */
-	};
-	__u32			__reserved_2;
-
-	__u64			__reserved_3;
-};
-
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
-#define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2)
-#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
-#define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64)
-#define PERF_COUNTER_IOC_SET_OUTPUT	_IO ('$', 5)
-
-enum perf_counter_ioc_flags {
-	PERF_IOC_FLAG_GROUP		= 1U << 0,
-};
-
-/*
- * Structure of the page that can be mapped via mmap
- */
-struct perf_counter_mmap_page {
-	__u32	version;		/* version number of this structure */
-	__u32	compat_version;		/* lowest version this is compat with */
-
-	/*
-	 * Bits needed to read the hw counters in user-space.
-	 *
-	 *   u32 seq;
-	 *   s64 count;
-	 *
-	 *   do {
-	 *     seq = pc->lock;
-	 *
-	 *     barrier()
-	 *     if (pc->index) {
-	 *       count = pmc_read(pc->index - 1);
-	 *       count += pc->offset;
-	 *     } else
-	 *       goto regular_read;
-	 *
-	 *     barrier();
-	 *   } while (pc->lock != seq);
-	 *
-	 * NOTE: for obvious reason this only works on self-monitoring
-	 *       processes.
-	 */
-	__u32	lock;			/* seqlock for synchronization */
-	__u32	index;			/* hardware counter identifier */
-	__s64	offset;			/* add to hardware counter value */
-	__u64	time_enabled;		/* time counter active */
-	__u64	time_running;		/* time counter on cpu */
-
-		/*
-		 * Hole for extension of the self monitor capabilities
-		 */
-
-	__u64	__reserved[123];	/* align to 1k */
-
-	/*
-	 * Control data for the mmap() data buffer.
-	 *
-	 * User-space reading the @data_head value should issue an rmb(), on
-	 * SMP capable platforms, after reading this value -- see
-	 * perf_counter_wakeup().
-	 *
-	 * When the mapping is PROT_WRITE the @data_tail value should be
-	 * written by userspace to reflect the last read data. In this case
-	 * the kernel will not over-write unread data.
-	 */
-	__u64   data_head;		/* head in the data section */
-	__u64	data_tail;		/* user-space written tail */
-};
-
-#define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0)
-#define PERF_EVENT_MISC_CPUMODE_UNKNOWN		(0 << 0)
-#define PERF_EVENT_MISC_KERNEL			(1 << 0)
-#define PERF_EVENT_MISC_USER			(2 << 0)
-#define PERF_EVENT_MISC_HYPERVISOR		(3 << 0)
-
-struct perf_event_header {
-	__u32	type;
-	__u16	misc;
-	__u16	size;
-};
-
-enum perf_event_type {
-
-	/*
-	 * The MMAP events record the PROT_EXEC mappings so that we can
-	 * correlate userspace IPs to code. They have the following structure:
-	 *
-	 * struct {
-	 *	struct perf_event_header	header;
-	 *
-	 *	u32				pid, tid;
-	 *	u64				addr;
-	 *	u64				len;
-	 *	u64				pgoff;
-	 *	char				filename[];
-	 * };
-	 */
-	PERF_EVENT_MMAP			= 1,
-
-	/*
-	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u64				id;
-	 * 	u64				lost;
-	 * };
-	 */
-	PERF_EVENT_LOST			= 2,
-
-	/*
-	 * struct {
-	 *	struct perf_event_header	header;
-	 *
-	 *	u32				pid, tid;
-	 *	char				comm[];
-	 * };
-	 */
-	PERF_EVENT_COMM			= 3,
-
-	/*
-	 * struct {
-	 *	struct perf_event_header	header;
-	 *	u32				pid, ppid;
-	 *	u32				tid, ptid;
-	 *	u64				time;
-	 * };
-	 */
-	PERF_EVENT_EXIT			= 4,
-
-	/*
-	 * struct {
-	 *	struct perf_event_header	header;
-	 *	u64				time;
-	 *	u64				id;
-	 *	u64				stream_id;
-	 * };
-	 */
-	PERF_EVENT_THROTTLE		= 5,
-	PERF_EVENT_UNTHROTTLE		= 6,
-
-	/*
-	 * struct {
-	 *	struct perf_event_header	header;
-	 *	u32				pid, ppid;
-	 *	u32				tid, ptid;
-	 *	{ u64				time;     } && PERF_SAMPLE_TIME
-	 * };
-	 */
-	PERF_EVENT_FORK			= 7,
-
-	/*
-	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u32				pid, tid;
-	 *
-	 * 	struct read_format		values;
-	 * };
-	 */
-	PERF_EVENT_READ			= 8,
-
-	/*
-	 * struct {
-	 *	struct perf_event_header	header;
-	 *
-	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
-	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
-	 *	{ u64			time;     } && PERF_SAMPLE_TIME
-	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
-	 *	{ u64			id;	  } && PERF_SAMPLE_ID
-	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
-	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
-	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
-	 *
-	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
-	 *
-	 *	{ u64			nr,
-	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
-	 *
-	 * 	#
-	 * 	# The RAW record below is opaque data wrt the ABI
-	 * 	#
-	 * 	# That is, the ABI doesn't make any promises wrt to
-	 * 	# the stability of its content, it may vary depending
-	 * 	# on event, hardware, kernel version and phase of
-	 * 	# the moon.
-	 * 	#
-	 * 	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
-	 * 	#
-	 *
-	 *	{ u32			size;
-	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
-	 * };
-	 */
-	PERF_EVENT_SAMPLE		= 9,
-
-	PERF_EVENT_MAX,			/* non-ABI */
-};
-
-enum perf_callchain_context {
-	PERF_CONTEXT_HV			= (__u64)-32,
-	PERF_CONTEXT_KERNEL		= (__u64)-128,
-	PERF_CONTEXT_USER		= (__u64)-512,
-
-	PERF_CONTEXT_GUEST		= (__u64)-2048,
-	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
-	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,
-
-	PERF_CONTEXT_MAX		= (__u64)-4095,
-};
-
-#define PERF_FLAG_FD_NO_GROUP	(1U << 0)
-#define PERF_FLAG_FD_OUTPUT	(1U << 1)
-
-#ifdef __KERNEL__
-/*
- * Kernel-internal data types and definitions:
- */
-
-#ifdef CONFIG_PERF_COUNTERS
-# include <asm/perf_counter.h>
-#endif
-
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/spinlock.h>
-#include <linux/hrtimer.h>
-#include <linux/fs.h>
-#include <linux/pid_namespace.h>
-#include <asm/atomic.h>
-
-#define PERF_MAX_STACK_DEPTH		255
-
-struct perf_callchain_entry {
-	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
-};
-
-struct perf_raw_record {
-	u32				size;
-	void				*data;
-};
-
-struct task_struct;
-
-/**
- * struct hw_perf_counter - performance counter hardware details:
- */
-struct hw_perf_counter {
-#ifdef CONFIG_PERF_COUNTERS
-	union {
-		struct { /* hardware */
-			u64		config;
-			unsigned long	config_base;
-			unsigned long	counter_base;
-			int		idx;
-		};
-		union { /* software */
-			atomic64_t	count;
-			struct hrtimer	hrtimer;
-		};
-	};
-	atomic64_t			prev_count;
-	u64				sample_period;
-	u64				last_period;
-	atomic64_t			period_left;
-	u64				interrupts;
-
-	u64				freq_count;
-	u64				freq_interrupts;
-	u64				freq_stamp;
-#endif
-};
-
-struct perf_counter;
-
-/**
- * struct pmu - generic performance monitoring unit
- */
-struct pmu {
-	int (*enable)			(struct perf_counter *counter);
-	void (*disable)			(struct perf_counter *counter);
-	void (*read)			(struct perf_counter *counter);
-	void (*unthrottle)		(struct perf_counter *counter);
-};
-
-/**
- * enum perf_counter_active_state - the states of a counter
- */
-enum perf_counter_active_state {
-	PERF_COUNTER_STATE_ERROR	= -2,
-	PERF_COUNTER_STATE_OFF		= -1,
-	PERF_COUNTER_STATE_INACTIVE	=  0,
-	PERF_COUNTER_STATE_ACTIVE	=  1,
-};
-
-struct file;
-
-struct perf_mmap_data {
-	struct rcu_head			rcu_head;
-	int				nr_pages;	/* nr of data pages  */
-	int				writable;	/* are we writable   */
-	int				nr_locked;	/* nr pages mlocked  */
-
-	atomic_t			poll;		/* POLL_ for wakeups */
-	atomic_t			events;		/* event limit       */
-
-	atomic_long_t			head;		/* write position    */
-	atomic_long_t			done_head;	/* completed head    */
-
-	atomic_t			lock;		/* concurrent writes */
-	atomic_t			wakeup;		/* needs a wakeup    */
-	atomic_t			lost;		/* nr records lost   */
-
-	long				watermark;	/* wakeup watermark  */
-
-	struct perf_counter_mmap_page   *user_page;
-	void				*data_pages[0];
-};
-
-struct perf_pending_entry {
-	struct perf_pending_entry *next;
-	void (*func)(struct perf_pending_entry *);
-};
-
-/**
- * struct perf_counter - performance counter kernel representation:
- */
-struct perf_counter {
-#ifdef CONFIG_PERF_COUNTERS
-	struct list_head		group_entry;
-	struct list_head		event_entry;
-	struct list_head		sibling_list;
-	int				nr_siblings;
-	struct perf_counter		*group_leader;
-	struct perf_counter		*output;
-	const struct pmu		*pmu;
-
-	enum perf_counter_active_state	state;
-	atomic64_t			count;
-
-	/*
-	 * These are the total time in nanoseconds that the counter
-	 * has been enabled (i.e. eligible to run, and the task has
-	 * been scheduled in, if this is a per-task counter)
-	 * and running (scheduled onto the CPU), respectively.
-	 *
-	 * They are computed from tstamp_enabled, tstamp_running and
-	 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
-	 */
-	u64				total_time_enabled;
-	u64				total_time_running;
-
-	/*
-	 * These are timestamps used for computing total_time_enabled
-	 * and total_time_running when the counter is in INACTIVE or
-	 * ACTIVE state, measured in nanoseconds from an arbitrary point
-	 * in time.
-	 * tstamp_enabled: the notional time when the counter was enabled
-	 * tstamp_running: the notional time when the counter was scheduled on
-	 * tstamp_stopped: in INACTIVE state, the notional time when the
-	 *	counter was scheduled off.
-	 */
-	u64				tstamp_enabled;
-	u64				tstamp_running;
-	u64				tstamp_stopped;
-
-	struct perf_counter_attr	attr;
-	struct hw_perf_counter		hw;
-
-	struct perf_counter_context	*ctx;
-	struct file			*filp;
-
-	/*
-	 * These accumulate total time (in nanoseconds) that children
-	 * counters have been enabled and running, respectively.
-	 */
-	atomic64_t			child_total_time_enabled;
-	atomic64_t			child_total_time_running;
-
-	/*
-	 * Protect attach/detach and child_list:
-	 */
-	struct mutex			child_mutex;
-	struct list_head		child_list;
-	struct perf_counter		*parent;
-
-	int				oncpu;
-	int				cpu;
-
-	struct list_head		owner_entry;
-	struct task_struct		*owner;
-
-	/* mmap bits */
-	struct mutex			mmap_mutex;
-	atomic_t			mmap_count;
-	struct perf_mmap_data		*data;
-
-	/* poll related */
-	wait_queue_head_t		waitq;
-	struct fasync_struct		*fasync;
-
-	/* delayed work for NMIs and such */
-	int				pending_wakeup;
-	int				pending_kill;
-	int				pending_disable;
-	struct perf_pending_entry	pending;
-
-	atomic_t			event_limit;
-
-	void (*destroy)(struct perf_counter *);
-	struct rcu_head			rcu_head;
-
-	struct pid_namespace		*ns;
-	u64				id;
-#endif
-};
-
-/**
- * struct perf_counter_context - counter context structure
- *
- * Used as a container for task counters and CPU counters as well:
- */
-struct perf_counter_context {
-	/*
-	 * Protect the states of the counters in the list,
-	 * nr_active, and the list:
-	 */
-	spinlock_t			lock;
-	/*
-	 * Protect the list of counters.  Locking either mutex or lock
-	 * is sufficient to ensure the list doesn't change; to change
-	 * the list you need to lock both the mutex and the spinlock.
-	 */
-	struct mutex			mutex;
-
-	struct list_head		group_list;
-	struct list_head		event_list;
-	int				nr_counters;
-	int				nr_active;
-	int				is_active;
-	int				nr_stat;
-	atomic_t			refcount;
-	struct task_struct		*task;
-
-	/*
-	 * Context clock, runs when context enabled.
-	 */
-	u64				time;
-	u64				timestamp;
-
-	/*
-	 * These fields let us detect when two contexts have both
-	 * been cloned (inherited) from a common ancestor.
-	 */
-	struct perf_counter_context	*parent_ctx;
-	u64				parent_gen;
-	u64				generation;
-	int				pin_count;
-	struct rcu_head			rcu_head;
-};
-
-/**
- * struct perf_counter_cpu_context - per cpu counter context structure
- */
-struct perf_cpu_context {
-	struct perf_counter_context	ctx;
-	struct perf_counter_context	*task_ctx;
-	int				active_oncpu;
-	int				max_pertask;
-	int				exclusive;
-
-	/*
-	 * Recursion avoidance:
-	 *
-	 * task, softirq, irq, nmi context
-	 */
-	int				recursion[4];
-};
-
-struct perf_output_handle {
-	struct perf_counter	*counter;
-	struct perf_mmap_data	*data;
-	unsigned long		head;
-	unsigned long		offset;
-	int			nmi;
-	int			sample;
-	int			locked;
-	unsigned long		flags;
-};
-
-#ifdef CONFIG_PERF_COUNTERS
-
-/*
- * Set by architecture code:
- */
-extern int perf_max_counters;
-
-extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
-
-extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
-extern void perf_counter_task_sched_out(struct task_struct *task,
-					struct task_struct *next, int cpu);
-extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern int perf_counter_init_task(struct task_struct *child);
-extern void perf_counter_exit_task(struct task_struct *child);
-extern void perf_counter_free_task(struct task_struct *task);
-extern void set_perf_counter_pending(void);
-extern void perf_counter_do_pending(void);
-extern void perf_counter_print_debug(void);
-extern void __perf_disable(void);
-extern bool __perf_enable(void);
-extern void perf_disable(void);
-extern void perf_enable(void);
-extern int perf_counter_task_disable(void);
-extern int perf_counter_task_enable(void);
-extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_counter_context *ctx, int cpu);
-extern void perf_counter_update_userpage(struct perf_counter *counter);
-
-struct perf_sample_data {
-	u64				type;
-
-	u64				ip;
-	struct {
-		u32	pid;
-		u32	tid;
-	}				tid_entry;
-	u64				time;
-	u64				addr;
-	u64				id;
-	u64				stream_id;
-	struct {
-		u32	cpu;
-		u32	reserved;
-	}				cpu_entry;
-	u64				period;
-	struct perf_callchain_entry	*callchain;
-	struct perf_raw_record		*raw;
-};
-
-extern void perf_output_sample(struct perf_output_handle *handle,
-			       struct perf_event_header *header,
-			       struct perf_sample_data *data,
-			       struct perf_counter *counter);
-extern void perf_prepare_sample(struct perf_event_header *header,
-				struct perf_sample_data *data,
-				struct perf_counter *counter,
-				struct pt_regs *regs);
-
-extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
-				 struct perf_sample_data *data,
-				 struct pt_regs *regs);
-
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return (counter->attr.type != PERF_TYPE_RAW) &&
-		(counter->attr.type != PERF_TYPE_HARDWARE) &&
-		(counter->attr.type != PERF_TYPE_HW_CACHE);
-}
-
-extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
-
-extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
-
-static inline void
-perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
-{
-	if (atomic_read(&perf_swcounter_enabled[event]))
-		__perf_swcounter_event(event, nr, nmi, regs, addr);
-}
-
-extern void __perf_counter_mmap(struct vm_area_struct *vma);
-
-static inline void perf_counter_mmap(struct vm_area_struct *vma)
-{
-	if (vma->vm_flags & VM_EXEC)
-		__perf_counter_mmap(vma);
-}
-
-extern void perf_counter_comm(struct task_struct *tsk);
-extern void perf_counter_fork(struct task_struct *tsk);
-
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
-
-extern int sysctl_perf_counter_paranoid;
-extern int sysctl_perf_counter_mlock;
-extern int sysctl_perf_counter_sample_rate;
-
-extern void perf_counter_init(void);
-extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
-				 void *record, int entry_size);
-
-#ifndef perf_misc_flags
-#define perf_misc_flags(regs)	(user_mode(regs) ? PERF_EVENT_MISC_USER : \
-				 PERF_EVENT_MISC_KERNEL)
-#define perf_instruction_pointer(regs)	instruction_pointer(regs)
-#endif
-
-extern int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int sample);
-extern void perf_output_end(struct perf_output_handle *handle);
-extern void perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len);
-#else
-static inline void
-perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
-static inline void
-perf_counter_task_sched_out(struct task_struct *task,
-			    struct task_struct *next, int cpu)		{ }
-static inline void
-perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline int perf_counter_init_task(struct task_struct *child)	{ return 0; }
-static inline void perf_counter_exit_task(struct task_struct *child)	{ }
-static inline void perf_counter_free_task(struct task_struct *task)	{ }
-static inline void perf_counter_do_pending(void)			{ }
-static inline void perf_counter_print_debug(void)			{ }
-static inline void perf_disable(void)					{ }
-static inline void perf_enable(void)					{ }
-static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
-static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
-
-static inline void
-perf_swcounter_event(u32 event, u64 nr, int nmi,
-		     struct pt_regs *regs, u64 addr)			{ }
-
-static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ }
-static inline void perf_counter_comm(struct task_struct *tsk)		{ }
-static inline void perf_counter_fork(struct task_struct *tsk)		{ }
-static inline void perf_counter_init(void)				{ }
-
-#endif
-
-#define perf_output_put(handle, x) \
-	perf_output_copy((handle), &(x), sizeof(x))
-
-#endif /* __KERNEL__ */
-#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
new file mode 100644
index 000000000000..ae9d9ed6df2a
--- /dev/null
+++ b/include/linux/perf_event.h
@@ -0,0 +1,858 @@
+/*
+ *  Performance events:
+ *
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *    Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_EVENT_H
+#define _LINUX_PERF_EVENT_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <asm/byteorder.h>
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * attr.type
+ */
+enum perf_type_id {
+	PERF_TYPE_HARDWARE			= 0,
+	PERF_TYPE_SOFTWARE			= 1,
+	PERF_TYPE_TRACEPOINT			= 2,
+	PERF_TYPE_HW_CACHE			= 3,
+	PERF_TYPE_RAW				= 4,
+
+	PERF_TYPE_MAX,				/* non-ABI */
+};
+
+/*
+ * Generalized performance event event_id types, used by the
+ * attr.event_id parameter of the sys_perf_event_open()
+ * syscall:
+ */
+enum perf_hw_id {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES		= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
+
+	PERF_COUNT_HW_MAX,			/* non-ABI */
+};
+
+/*
+ * Generalized hardware cache events:
+ *
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum perf_hw_cache_id {
+	PERF_COUNT_HW_CACHE_L1D			= 0,
+	PERF_COUNT_HW_CACHE_L1I			= 1,
+	PERF_COUNT_HW_CACHE_LL			= 2,
+	PERF_COUNT_HW_CACHE_DTLB		= 3,
+	PERF_COUNT_HW_CACHE_ITLB		= 4,
+	PERF_COUNT_HW_CACHE_BPU			= 5,
+
+	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
+};
+
+enum perf_hw_cache_op_id {
+	PERF_COUNT_HW_CACHE_OP_READ		= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
+
+	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
+};
+
+enum perf_hw_cache_op_result_id {
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
+	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
+
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
+};
+
+/*
+ * Special "software" events provided by the kernel, even if the hardware
+ * does not support performance events. These events measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum perf_sw_ids {
+	PERF_COUNT_SW_CPU_CLOCK			= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+
+	PERF_COUNT_SW_MAX,			/* non-ABI */
+};
+
+/*
+ * Bits that can be set in attr.sample_type to request information
+ * in the overflow packets.
+ */
+enum perf_event_sample_format {
+	PERF_SAMPLE_IP				= 1U << 0,
+	PERF_SAMPLE_TID				= 1U << 1,
+	PERF_SAMPLE_TIME			= 1U << 2,
+	PERF_SAMPLE_ADDR			= 1U << 3,
+	PERF_SAMPLE_READ			= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
+	PERF_SAMPLE_ID				= 1U << 6,
+	PERF_SAMPLE_CPU				= 1U << 7,
+	PERF_SAMPLE_PERIOD			= 1U << 8,
+	PERF_SAMPLE_STREAM_ID			= 1U << 9,
+	PERF_SAMPLE_RAW				= 1U << 10,
+
+	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
+};
+
+/*
+ * The format of the data returned by read() on a perf event fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ * 	{ u64		value;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		id;           } && PERF_FORMAT_ID
+ * 	} && !PERF_FORMAT_GROUP
+ *
+ * 	{ u64		nr;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		value;
+ * 	    { u64	id;           } && PERF_FORMAT_ID
+ * 	  }		cntr[nr];
+ * 	} && PERF_FORMAT_GROUP
+ * };
+ */
+enum perf_event_read_format {
+	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
+	PERF_FORMAT_ID				= 1U << 2,
+	PERF_FORMAT_GROUP			= 1U << 3,
+
+	PERF_FORMAT_MAX = 1U << 4, 		/* non-ABI */
+};
+
+#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
+
+/*
+ * Hardware event_id to monitor via a performance monitoring event:
+ */
+struct perf_event_attr {
+
+	/*
+	 * Major type: hardware/software/tracepoint/etc.
+	 */
+	__u32			type;
+
+	/*
+	 * Size of the attr structure, for fwd/bwd compat.
+	 */
+	__u32			size;
+
+	/*
+	 * Type specific configuration information.
+	 */
+	__u64			config;
+
+	union {
+		__u64		sample_period;
+		__u64		sample_freq;
+	};
+
+	__u64			sample_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+				mmap           :  1, /* include mmap data     */
+				comm	       :  1, /* include comm data     */
+				freq           :  1, /* use freq, not period  */
+				inherit_stat   :  1, /* per task counts       */
+				enable_on_exec :  1, /* next exec enables     */
+				task           :  1, /* trace fork/exit       */
+				watermark      :  1, /* wakeup_watermark      */
+
+				__reserved_1   : 49;
+
+	union {
+		__u32		wakeup_events;	  /* wakeup every n events */
+		__u32		wakeup_watermark; /* bytes before wakeup   */
+	};
+	__u32			__reserved_2;
+
+	__u64			__reserved_3;
+};
+
+/*
+ * Ioctls that can be done on a perf event fd:
+ */
+#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
+#define PERF_EVENT_IOC_DISABLE	_IO ('$', 1)
+#define PERF_EVENT_IOC_REFRESH	_IO ('$', 2)
+#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
+#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, u64)
+#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
+
+enum perf_event_ioc_flags {
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_event_mmap_page {
+	__u32	version;		/* version number of this structure */
+	__u32	compat_version;		/* lowest version this is compat with */
+
+	/*
+	 * Bits needed to read the hw events in user-space.
+	 *
+	 *   u32 seq;
+	 *   s64 count;
+	 *
+	 *   do {
+	 *     seq = pc->lock;
+	 *
+	 *     barrier()
+	 *     if (pc->index) {
+	 *       count = pmc_read(pc->index - 1);
+	 *       count += pc->offset;
+	 *     } else
+	 *       goto regular_read;
+	 *
+	 *     barrier();
+	 *   } while (pc->lock != seq);
+	 *
+	 * NOTE: for obvious reason this only works on self-monitoring
+	 *       processes.
+	 */
+	__u32	lock;			/* seqlock for synchronization */
+	__u32	index;			/* hardware event identifier */
+	__s64	offset;			/* add to hardware event value */
+	__u64	time_enabled;		/* time event active */
+	__u64	time_running;		/* time event on cpu */
+
+		/*
+		 * Hole for extension of the self monitor capabilities
+		 */
+
+	__u64	__reserved[123];	/* align to 1k */
+
+	/*
+	 * Control data for the mmap() data buffer.
+	 *
+	 * User-space reading the @data_head value should issue an rmb(), on
+	 * SMP capable platforms, after reading this value -- see
+	 * perf_event_wakeup().
+	 *
+	 * When the mapping is PROT_WRITE the @data_tail value should be
+	 * written by userspace to reflect the last read data. In this case
+	 * the kernel will not over-write unread data.
+	 */
+	__u64   data_head;		/* head in the data section */
+	__u64	data_tail;		/* user-space written tail */
+};
+
+#define PERF_RECORD_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_RECORD_MISC_CPUMODE_UNKNOWN		(0 << 0)
+#define PERF_RECORD_MISC_KERNEL			(1 << 0)
+#define PERF_RECORD_MISC_USER			(2 << 0)
+#define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
+
+struct perf_event_header {
+	__u32	type;
+	__u16	misc;
+	__u16	size;
+};
+
+enum perf_event_type {
+
+	/*
+	 * The MMAP events record the PROT_EXEC mappings so that we can
+	 * correlate userspace IPs to code. They have the following structure:
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	u64				addr;
+	 *	u64				len;
+	 *	u64				pgoff;
+	 *	char				filename[];
+	 * };
+	 */
+	PERF_RECORD_MMAP			= 1,
+
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u64				id;
+	 * 	u64				lost;
+	 * };
+	 */
+	PERF_RECORD_LOST			= 2,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	char				comm[];
+	 * };
+	 */
+	PERF_RECORD_COMM			= 3,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
+	 *	u32				tid, ptid;
+	 *	u64				time;
+	 * };
+	 */
+	PERF_RECORD_EXIT			= 4,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				time;
+	 *	u64				id;
+	 *	u64				stream_id;
+	 * };
+	 */
+	PERF_RECORD_THROTTLE		= 5,
+	PERF_RECORD_UNTHROTTLE		= 6,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
+	 *	u32				tid, ptid;
+	 *	{ u64				time;     } && PERF_SAMPLE_TIME
+	 * };
+	 */
+	PERF_RECORD_FORK			= 7,
+
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u32				pid, tid;
+	 *
+	 * 	struct read_format		values;
+	 * };
+	 */
+	PERF_RECORD_READ			= 8,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
+	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
+	 *	{ u64			time;     } && PERF_SAMPLE_TIME
+	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
+	 *	{ u64			id;	  } && PERF_SAMPLE_ID
+	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
+	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
+	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
+	 *
+	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
+	 *
+	 *	{ u64			nr,
+	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *
+	 * 	#
+	 * 	# The RAW record below is opaque data wrt the ABI
+	 * 	#
+	 * 	# That is, the ABI doesn't make any promises wrt to
+	 * 	# the stability of its content, it may vary depending
+	 * 	# on event_id, hardware, kernel version and phase of
+	 * 	# the moon.
+	 * 	#
+	 * 	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 * 	#
+	 *
+	 *	{ u32			size;
+	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
+	 * };
+	 */
+	PERF_RECORD_SAMPLE		= 9,
+
+	PERF_RECORD_MAX,			/* non-ABI */
+};
+
+enum perf_callchain_context {
+	PERF_CONTEXT_HV			= (__u64)-32,
+	PERF_CONTEXT_KERNEL		= (__u64)-128,
+	PERF_CONTEXT_USER		= (__u64)-512,
+
+	PERF_CONTEXT_GUEST		= (__u64)-2048,
+	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
+	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,
+
+	PERF_CONTEXT_MAX		= (__u64)-4095,
+};
+
+#define PERF_FLAG_FD_NO_GROUP	(1U << 0)
+#define PERF_FLAG_FD_OUTPUT	(1U << 1)
+
+#ifdef __KERNEL__
+/*
+ * Kernel-internal data types and definitions:
+ */
+
+#ifdef CONFIG_PERF_EVENTS
+# include <asm/perf_event.h>
+#endif
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/fs.h>
+#include <linux/pid_namespace.h>
+#include <asm/atomic.h>
+
+#define PERF_MAX_STACK_DEPTH		255
+
+struct perf_callchain_entry {
+	__u64				nr;
+	__u64				ip[PERF_MAX_STACK_DEPTH];
+};
+
+struct perf_raw_record {
+	u32				size;
+	void				*data;
+};
+
+struct task_struct;
+
+/**
+ * struct hw_perf_event - performance event hardware details:
+ */
+struct hw_perf_event {
+#ifdef CONFIG_PERF_EVENTS
+	union {
+		struct { /* hardware */
+			u64		config;
+			unsigned long	config_base;
+			unsigned long	event_base;
+			int		idx;
+		};
+		union { /* software */
+			atomic64_t	count;
+			struct hrtimer	hrtimer;
+		};
+	};
+	atomic64_t			prev_count;
+	u64				sample_period;
+	u64				last_period;
+	atomic64_t			period_left;
+	u64				interrupts;
+
+	u64				freq_count;
+	u64				freq_interrupts;
+	u64				freq_stamp;
+#endif
+};
+
+struct perf_event;
+
+/**
+ * struct pmu - generic performance monitoring unit
+ */
+struct pmu {
+	int (*enable)			(struct perf_event *event);
+	void (*disable)			(struct perf_event *event);
+	void (*read)			(struct perf_event *event);
+	void (*unthrottle)		(struct perf_event *event);
+};
+
+/**
+ * enum perf_event_active_state - the states of a event
+ */
+enum perf_event_active_state {
+	PERF_EVENT_STATE_ERROR	= -2,
+	PERF_EVENT_STATE_OFF		= -1,
+	PERF_EVENT_STATE_INACTIVE	=  0,
+	PERF_EVENT_STATE_ACTIVE	=  1,
+};
+
+struct file;
+
+struct perf_mmap_data {
+	struct rcu_head			rcu_head;
+	int				nr_pages;	/* nr of data pages  */
+	int				writable;	/* are we writable   */
+	int				nr_locked;	/* nr pages mlocked  */
+
+	atomic_t			poll;		/* POLL_ for wakeups */
+	atomic_t			events;		/* event_id limit       */
+
+	atomic_long_t			head;		/* write position    */
+	atomic_long_t			done_head;	/* completed head    */
+
+	atomic_t			lock;		/* concurrent writes */
+	atomic_t			wakeup;		/* needs a wakeup    */
+	atomic_t			lost;		/* nr records lost   */
+
+	long				watermark;	/* wakeup watermark  */
+
+	struct perf_event_mmap_page   *user_page;
+	void				*data_pages[0];
+};
+
+struct perf_pending_entry {
+	struct perf_pending_entry *next;
+	void (*func)(struct perf_pending_entry *);
+};
+
+/**
+ * struct perf_event - performance event kernel representation:
+ */
+struct perf_event {
+#ifdef CONFIG_PERF_EVENTS
+	struct list_head		group_entry;
+	struct list_head		event_entry;
+	struct list_head		sibling_list;
+	int				nr_siblings;
+	struct perf_event		*group_leader;
+	struct perf_event		*output;
+	const struct pmu		*pmu;
+
+	enum perf_event_active_state	state;
+	atomic64_t			count;
+
+	/*
+	 * These are the total time in nanoseconds that the event
+	 * has been enabled (i.e. eligible to run, and the task has
+	 * been scheduled in, if this is a per-task event)
+	 * and running (scheduled onto the CPU), respectively.
+	 *
+	 * They are computed from tstamp_enabled, tstamp_running and
+	 * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
+	 */
+	u64				total_time_enabled;
+	u64				total_time_running;
+
+	/*
+	 * These are timestamps used for computing total_time_enabled
+	 * and total_time_running when the event is in INACTIVE or
+	 * ACTIVE state, measured in nanoseconds from an arbitrary point
+	 * in time.
+	 * tstamp_enabled: the notional time when the event was enabled
+	 * tstamp_running: the notional time when the event was scheduled on
+	 * tstamp_stopped: in INACTIVE state, the notional time when the
+	 *	event was scheduled off.
+	 */
+	u64				tstamp_enabled;
+	u64				tstamp_running;
+	u64				tstamp_stopped;
+
+	struct perf_event_attr	attr;
+	struct hw_perf_event		hw;
+
+	struct perf_event_context	*ctx;
+	struct file			*filp;
+
+	/*
+	 * These accumulate total time (in nanoseconds) that children
+	 * events have been enabled and running, respectively.
+	 */
+	atomic64_t			child_total_time_enabled;
+	atomic64_t			child_total_time_running;
+
+	/*
+	 * Protect attach/detach and child_list:
+	 */
+	struct mutex			child_mutex;
+	struct list_head		child_list;
+	struct perf_event		*parent;
+
+	int				oncpu;
+	int				cpu;
+
+	struct list_head		owner_entry;
+	struct task_struct		*owner;
+
+	/* mmap bits */
+	struct mutex			mmap_mutex;
+	atomic_t			mmap_count;
+	struct perf_mmap_data		*data;
+
+	/* poll related */
+	wait_queue_head_t		waitq;
+	struct fasync_struct		*fasync;
+
+	/* delayed work for NMIs and such */
+	int				pending_wakeup;
+	int				pending_kill;
+	int				pending_disable;
+	struct perf_pending_entry	pending;
+
+	atomic_t			event_limit;
+
+	void (*destroy)(struct perf_event *);
+	struct rcu_head			rcu_head;
+
+	struct pid_namespace		*ns;
+	u64				id;
+#endif
+};
+
+/**
+ * struct perf_event_context - event context structure
+ *
+ * Used as a container for task events and CPU events as well:
+ */
+struct perf_event_context {
+	/*
+	 * Protect the states of the events in the list,
+	 * nr_active, and the list:
+	 */
+	spinlock_t			lock;
+	/*
+	 * Protect the list of events.  Locking either mutex or lock
+	 * is sufficient to ensure the list doesn't change; to change
+	 * the list you need to lock both the mutex and the spinlock.
+	 */
+	struct mutex			mutex;
+
+	struct list_head		group_list;
+	struct list_head		event_list;
+	int				nr_events;
+	int				nr_active;
+	int				is_active;
+	int				nr_stat;
+	atomic_t			refcount;
+	struct task_struct		*task;
+
+	/*
+	 * Context clock, runs when context enabled.
+	 */
+	u64				time;
+	u64				timestamp;
+
+	/*
+	 * These fields let us detect when two contexts have both
+	 * been cloned (inherited) from a common ancestor.
+	 */
+	struct perf_event_context	*parent_ctx;
+	u64				parent_gen;
+	u64				generation;
+	int				pin_count;
+	struct rcu_head			rcu_head;
+};
+
+/**
+ * struct perf_event_cpu_context - per cpu event context structure
+ */
+struct perf_cpu_context {
+	struct perf_event_context	ctx;
+	struct perf_event_context	*task_ctx;
+	int				active_oncpu;
+	int				max_pertask;
+	int				exclusive;
+
+	/*
+	 * Recursion avoidance:
+	 *
+	 * task, softirq, irq, nmi context
+	 */
+	int				recursion[4];
+};
+
+struct perf_output_handle {
+	struct perf_event	*event;
+	struct perf_mmap_data	*data;
+	unsigned long		head;
+	unsigned long		offset;
+	int			nmi;
+	int			sample;
+	int			locked;
+	unsigned long		flags;
+};
+
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Set by architecture code:
+ */
+extern int perf_max_events;
+
+extern const struct pmu *hw_perf_event_init(struct perf_event *event);
+
+extern void perf_event_task_sched_in(struct task_struct *task, int cpu);
+extern void perf_event_task_sched_out(struct task_struct *task,
+					struct task_struct *next, int cpu);
+extern void perf_event_task_tick(struct task_struct *task, int cpu);
+extern int perf_event_init_task(struct task_struct *child);
+extern void perf_event_exit_task(struct task_struct *child);
+extern void perf_event_free_task(struct task_struct *task);
+extern void set_perf_event_pending(void);
+extern void perf_event_do_pending(void);
+extern void perf_event_print_debug(void);
+extern void __perf_disable(void);
+extern bool __perf_enable(void);
+extern void perf_disable(void);
+extern void perf_enable(void);
+extern int perf_event_task_disable(void);
+extern int perf_event_task_enable(void);
+extern int hw_perf_group_sched_in(struct perf_event *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx, int cpu);
+extern void perf_event_update_userpage(struct perf_event *event);
+
+struct perf_sample_data {
+	u64				type;
+
+	u64				ip;
+	struct {
+		u32	pid;
+		u32	tid;
+	}				tid_entry;
+	u64				time;
+	u64				addr;
+	u64				id;
+	u64				stream_id;
+	struct {
+		u32	cpu;
+		u32	reserved;
+	}				cpu_entry;
+	u64				period;
+	struct perf_callchain_entry	*callchain;
+	struct perf_raw_record		*raw;
+};
+
+extern void perf_output_sample(struct perf_output_handle *handle,
+			       struct perf_event_header *header,
+			       struct perf_sample_data *data,
+			       struct perf_event *event);
+extern void perf_prepare_sample(struct perf_event_header *header,
+				struct perf_sample_data *data,
+				struct perf_event *event,
+				struct pt_regs *regs);
+
+extern int perf_event_overflow(struct perf_event *event, int nmi,
+				 struct perf_sample_data *data,
+				 struct pt_regs *regs);
+
+/*
+ * Return 1 for a software event, 0 for a hardware event
+ */
+static inline int is_software_event(struct perf_event *event)
+{
+	return (event->attr.type != PERF_TYPE_RAW) &&
+		(event->attr.type != PERF_TYPE_HARDWARE) &&
+		(event->attr.type != PERF_TYPE_HW_CACHE);
+}
+
+extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
+
+static inline void
+perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+{
+	if (atomic_read(&perf_swevent_enabled[event_id]))
+		__perf_sw_event(event_id, nr, nmi, regs, addr);
+}
+
+extern void __perf_event_mmap(struct vm_area_struct *vma);
+
+static inline void perf_event_mmap(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_EXEC)
+		__perf_event_mmap(vma);
+}
+
+extern void perf_event_comm(struct task_struct *tsk);
+extern void perf_event_fork(struct task_struct *tsk);
+
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+
+extern int sysctl_perf_event_paranoid;
+extern int sysctl_perf_event_mlock;
+extern int sysctl_perf_event_sample_rate;
+
+extern void perf_event_init(void);
+extern void perf_tp_event(int event_id, u64 addr, u64 count,
+				 void *record, int entry_size);
+
+#ifndef perf_misc_flags
+#define perf_misc_flags(regs)	(user_mode(regs) ? PERF_RECORD_MISC_USER : \
+				 PERF_RECORD_MISC_KERNEL)
+#define perf_instruction_pointer(regs)	instruction_pointer(regs)
+#endif
+
+extern int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_event *event, unsigned int size,
+			     int nmi, int sample);
+extern void perf_output_end(struct perf_output_handle *handle);
+extern void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len);
+#else
+static inline void
+perf_event_task_sched_in(struct task_struct *task, int cpu)		{ }
+static inline void
+perf_event_task_sched_out(struct task_struct *task,
+			    struct task_struct *next, int cpu)		{ }
+static inline void
+perf_event_task_tick(struct task_struct *task, int cpu)		{ }
+static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
+static inline void perf_event_exit_task(struct task_struct *child)	{ }
+static inline void perf_event_free_task(struct task_struct *task)	{ }
+static inline void perf_event_do_pending(void)			{ }
+static inline void perf_event_print_debug(void)			{ }
+static inline void perf_disable(void)					{ }
+static inline void perf_enable(void)					{ }
+static inline int perf_event_task_disable(void)	{ return -EINVAL; }
+static inline int perf_event_task_enable(void)	{ return -EINVAL; }
+
+static inline void
+perf_sw_event(u32 event_id, u64 nr, int nmi,
+		     struct pt_regs *regs, u64 addr)			{ }
+
+static inline void perf_event_mmap(struct vm_area_struct *vma)	{ }
+static inline void perf_event_comm(struct task_struct *tsk)		{ }
+static inline void perf_event_fork(struct task_struct *tsk)		{ }
+static inline void perf_event_init(void)				{ }
+
+#endif
+
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_PERF_EVENT_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index b00df4c79c63..07bff666e65b 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,7 +85,7 @@
 #define PR_SET_TIMERSLACK 29
 #define PR_GET_TIMERSLACK 30
 
-#define PR_TASK_PERF_COUNTERS_DISABLE		31
-#define PR_TASK_PERF_COUNTERS_ENABLE		32
+#define PR_TASK_PERF_EVENTS_DISABLE		31
+#define PR_TASK_PERF_EVENTS_ENABLE		32
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8af3d249170e..8b265a8986d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -100,7 +100,7 @@ struct robust_list_head;
 struct bio;
 struct fs_struct;
 struct bts_context;
-struct perf_counter_context;
+struct perf_event_context;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -701,7 +701,7 @@ struct user_struct {
 #endif
 #endif
 
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
 	atomic_long_t locked_vm;
 #endif
 };
@@ -1449,10 +1449,10 @@ struct task_struct {
 	struct list_head pi_state_list;
 	struct futex_pi_state *pi_state_cache;
 #endif
-#ifdef CONFIG_PERF_COUNTERS
-	struct perf_counter_context *perf_counter_ctxp;
-	struct mutex perf_counter_mutex;
-	struct list_head perf_counter_list;
+#ifdef CONFIG_PERF_EVENTS
+	struct perf_event_context *perf_event_ctxp;
+	struct mutex perf_event_mutex;
+	struct list_head perf_event_list;
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a8e37821cc60..02f19f9a76c6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,7 +55,7 @@ struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
-struct perf_counter_attr;
+struct perf_event_attr;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -885,7 +885,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 
-asmlinkage long sys_perf_counter_open(
-		struct perf_counter_attr __user *attr_uptr,
+asmlinkage long sys_perf_event_open(
+		struct perf_event_attr __user *attr_uptr,
 		pid_t pid, int cpu, int group_fd, unsigned long flags);
 #endif
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 72a3b437b829..ec91e78244f0 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -378,7 +378,7 @@ static inline int ftrace_get_offsets_##call(				\
 #ifdef CONFIG_EVENT_PROFILE
 
 /*
- * Generate the functions needed for tracepoint perf_counter support.
+ * Generate the functions needed for tracepoint perf_event support.
  *
  * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
@@ -656,7 +656,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  * {
  *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
  *	struct ftrace_event_call *event_call = &event_<call>;
- *	extern void perf_tpcounter_event(int, u64, u64, void *, int);
+ *	extern void perf_tp_event(int, u64, u64, void *, int);
  *	struct ftrace_raw_##call *entry;
  *	u64 __addr = 0, __count = 1;
  *	unsigned long irq_flags;
@@ -691,7 +691,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *
  *		<assign>  <- affect our values
  *
- *		perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *		perf_tp_event(event_call->id, __addr, __count, entry,
  *			     __entry_size);  <- submit them to perf counter
  *	} while (0);
  *
@@ -712,7 +712,7 @@ static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_event_call *event_call = &event_##call;		\
-	extern void perf_tpcounter_event(int, u64, u64, void *, int);	\
+	extern void perf_tp_event(int, u64, u64, void *, int);	\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
@@ -742,7 +742,7 @@ static void ftrace_profile_##call(proto)				\
 									\
 		{ assign; }						\
 									\
-		perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+		perf_tp_event(event_call->id, __addr, __count, entry,\
 			     __entry_size);				\
 	} while (0);							\
 									\
diff --git a/init/Kconfig b/init/Kconfig
index 8e8b76d8a272..cfdf5c322806 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -915,17 +915,17 @@ config AIO
           by some high performance threaded applications. Disabling
           this option saves about 7k.
 
-config HAVE_PERF_COUNTERS
+config HAVE_PERF_EVENTS
 	bool
 	help
 	  See tools/perf/design.txt for details.
 
 menu "Performance Counters"
 
-config PERF_COUNTERS
+config PERF_EVENTS
 	bool "Kernel Performance Counters"
 	default y if PROFILING
-	depends on HAVE_PERF_COUNTERS
+	depends on HAVE_PERF_EVENTS
 	select ANON_INODES
 	help
 	  Enable kernel support for performance counter hardware.
@@ -947,7 +947,7 @@ config PERF_COUNTERS
 
 config EVENT_PROFILE
 	bool "Tracepoint profiling sources"
-	depends on PERF_COUNTERS && EVENT_TRACING
+	depends on PERF_EVENTS && EVENT_TRACING
 	default y
 	help
 	 Allow the use of tracepoints as software performance counters.
diff --git a/kernel/Makefile b/kernel/Makefile
index 3d9c7e27e3f9..e26a546eac44 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -96,7 +96,7 @@ obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index ae5d8660ddff..e47ee8a06135 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,7 +47,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <trace/events/sched.h>
 
 #include <asm/uaccess.h>
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-#ifdef CONFIG_PERF_COUNTERS
-	WARN_ON_ONCE(tsk->perf_counter_ctxp);
+#ifdef CONFIG_PERF_EVENTS
+	WARN_ON_ONCE(tsk->perf_event_ctxp);
 #endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
@@ -981,7 +981,7 @@ NORET_TYPE void do_exit(long code)
 	 * Flush inherited counters to the parent - before the parent
 	 * gets woken up by child-exit notifications.
 	 */
-	perf_counter_exit_task(tsk);
+	perf_event_exit_task(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index bfee931ee3fb..2cebfb23b0b8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,7 +61,7 @@
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1078,7 +1078,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
-	retval = perf_counter_init_task(p);
+	retval = perf_event_init_task(p);
 	if (retval)
 		goto bad_fork_cleanup_policy;
 
@@ -1253,7 +1253,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
-	perf_counter_fork(p);
+	perf_event_fork(p);
 	return p;
 
 bad_fork_free_pid:
@@ -1280,7 +1280,7 @@ bad_fork_cleanup_semundo:
 bad_fork_cleanup_audit:
 	audit_free(p);
 bad_fork_cleanup_policy:
-	perf_counter_free_task(p);
+	perf_event_free_task(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
deleted file mode 100644
index 62de0db8092b..000000000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,5000 +0,0 @@
-/*
- * Performance counter core code
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- *  For licensing details see kernel-base/COPYING
- */
-
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/sysfs.h>
-#include <linux/dcache.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/vmstat.h>
-#include <linux/hardirq.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/anon_inodes.h>
-#include <linux/kernel_stat.h>
-#include <linux/perf_counter.h>
-
-#include <asm/irq_regs.h>
-
-/*
- * Each CPU has a list of per CPU counters:
- */
-DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
-int perf_max_counters __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
-static atomic_t nr_counters __read_mostly;
-static atomic_t nr_mmap_counters __read_mostly;
-static atomic_t nr_comm_counters __read_mostly;
-static atomic_t nr_task_counters __read_mostly;
-
-/*
- * perf counter paranoia level:
- *  -1 - not paranoid at all
- *   0 - disallow raw tracepoint access for unpriv
- *   1 - disallow cpu counters for unpriv
- *   2 - disallow kernel profiling for unpriv
- */
-int sysctl_perf_counter_paranoid __read_mostly = 1;
-
-static inline bool perf_paranoid_tracepoint_raw(void)
-{
-	return sysctl_perf_counter_paranoid > -1;
-}
-
-static inline bool perf_paranoid_cpu(void)
-{
-	return sysctl_perf_counter_paranoid > 0;
-}
-
-static inline bool perf_paranoid_kernel(void)
-{
-	return sysctl_perf_counter_paranoid > 1;
-}
-
-int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
-
-/*
- * max perf counter sample rate
- */
-int sysctl_perf_counter_sample_rate __read_mostly = 100000;
-
-static atomic64_t perf_counter_id;
-
-/*
- * Lock for (sysadmin-configurable) counter reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
-
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-	return NULL;
-}
-
-void __weak hw_perf_disable(void)		{ barrier(); }
-void __weak hw_perf_enable(void)		{ barrier(); }
-
-void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
-void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }
-
-int __weak
-hw_perf_group_sched_in(struct perf_counter *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_counter_context *ctx, int cpu)
-{
-	return 0;
-}
-
-void __weak perf_counter_print_debug(void)	{ }
-
-static DEFINE_PER_CPU(int, perf_disable_count);
-
-void __perf_disable(void)
-{
-	__get_cpu_var(perf_disable_count)++;
-}
-
-bool __perf_enable(void)
-{
-	return !--__get_cpu_var(perf_disable_count);
-}
-
-void perf_disable(void)
-{
-	__perf_disable();
-	hw_perf_disable();
-}
-
-void perf_enable(void)
-{
-	if (__perf_enable())
-		hw_perf_enable();
-}
-
-static void get_ctx(struct perf_counter_context *ctx)
-{
-	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
-}
-
-static void free_ctx(struct rcu_head *head)
-{
-	struct perf_counter_context *ctx;
-
-	ctx = container_of(head, struct perf_counter_context, rcu_head);
-	kfree(ctx);
-}
-
-static void put_ctx(struct perf_counter_context *ctx)
-{
-	if (atomic_dec_and_test(&ctx->refcount)) {
-		if (ctx->parent_ctx)
-			put_ctx(ctx->parent_ctx);
-		if (ctx->task)
-			put_task_struct(ctx->task);
-		call_rcu(&ctx->rcu_head, free_ctx);
-	}
-}
-
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
-	if (ctx->parent_ctx) {
-		put_ctx(ctx->parent_ctx);
-		ctx->parent_ctx = NULL;
-	}
-}
-
-/*
- * If we inherit counters we want to return the parent counter id
- * to userspace.
- */
-static u64 primary_counter_id(struct perf_counter *counter)
-{
-	u64 id = counter->id;
-
-	if (counter->parent)
-		id = counter->parent->id;
-
-	return id;
-}
-
-/*
- * Get the perf_counter_context for a task and lock it.
- * This has to cope with with the fact that until it is locked,
- * the context could get moved to another task.
- */
-static struct perf_counter_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
-{
-	struct perf_counter_context *ctx;
-
-	rcu_read_lock();
- retry:
-	ctx = rcu_dereference(task->perf_counter_ctxp);
-	if (ctx) {
-		/*
-		 * If this context is a clone of another, it might
-		 * get swapped for another underneath us by
-		 * perf_counter_task_sched_out, though the
-		 * rcu_read_lock() protects us from any context
-		 * getting freed.  Lock the context and check if it
-		 * got swapped before we could get the lock, and retry
-		 * if so.  If we locked the right context, then it
-		 * can't get swapped on us any more.
-		 */
-		spin_lock_irqsave(&ctx->lock, *flags);
-		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
-			spin_unlock_irqrestore(&ctx->lock, *flags);
-			goto retry;
-		}
-
-		if (!atomic_inc_not_zero(&ctx->refcount)) {
-			spin_unlock_irqrestore(&ctx->lock, *flags);
-			ctx = NULL;
-		}
-	}
-	rcu_read_unlock();
-	return ctx;
-}
-
-/*
- * Get the context for a task and increment its pin_count so it
- * can't get swapped to another task.  This also increments its
- * reference count so that the context can't get freed.
- */
-static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
-{
-	struct perf_counter_context *ctx;
-	unsigned long flags;
-
-	ctx = perf_lock_task_context(task, &flags);
-	if (ctx) {
-		++ctx->pin_count;
-		spin_unlock_irqrestore(&ctx->lock, flags);
-	}
-	return ctx;
-}
-
-static void perf_unpin_context(struct perf_counter_context *ctx)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ctx->lock, flags);
-	--ctx->pin_count;
-	spin_unlock_irqrestore(&ctx->lock, flags);
-	put_ctx(ctx);
-}
-
-/*
- * Add a counter from the lists for its context.
- * Must be called with ctx->mutex and ctx->lock held.
- */
-static void
-list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-{
-	struct perf_counter *group_leader = counter->group_leader;
-
-	/*
-	 * Depending on whether it is a standalone or sibling counter,
-	 * add it straight to the context's counter list, or to the group
-	 * leader's sibling list:
-	 */
-	if (group_leader == counter)
-		list_add_tail(&counter->group_entry, &ctx->group_list);
-	else {
-		list_add_tail(&counter->group_entry, &group_leader->sibling_list);
-		group_leader->nr_siblings++;
-	}
-
-	list_add_rcu(&counter->event_entry, &ctx->event_list);
-	ctx->nr_counters++;
-	if (counter->attr.inherit_stat)
-		ctx->nr_stat++;
-}
-
-/*
- * Remove a counter from the lists for its context.
- * Must be called with ctx->mutex and ctx->lock held.
- */
-static void
-list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-{
-	struct perf_counter *sibling, *tmp;
-
-	if (list_empty(&counter->group_entry))
-		return;
-	ctx->nr_counters--;
-	if (counter->attr.inherit_stat)
-		ctx->nr_stat--;
-
-	list_del_init(&counter->group_entry);
-	list_del_rcu(&counter->event_entry);
-
-	if (counter->group_leader != counter)
-		counter->group_leader->nr_siblings--;
-
-	/*
-	 * If this was a group counter with sibling counters then
-	 * upgrade the siblings to singleton counters by adding them
-	 * to the context list directly:
-	 */
-	list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, group_entry) {
-
-		list_move_tail(&sibling->group_entry, &ctx->group_list);
-		sibling->group_leader = sibling;
-	}
-}
-
-static void
-counter_sched_out(struct perf_counter *counter,
-		  struct perf_cpu_context *cpuctx,
-		  struct perf_counter_context *ctx)
-{
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return;
-
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	if (counter->pending_disable) {
-		counter->pending_disable = 0;
-		counter->state = PERF_COUNTER_STATE_OFF;
-	}
-	counter->tstamp_stopped = ctx->time;
-	counter->pmu->disable(counter);
-	counter->oncpu = -1;
-
-	if (!is_software_counter(counter))
-		cpuctx->active_oncpu--;
-	ctx->nr_active--;
-	if (counter->attr.exclusive || !cpuctx->active_oncpu)
-		cpuctx->exclusive = 0;
-}
-
-static void
-group_sched_out(struct perf_counter *group_counter,
-		struct perf_cpu_context *cpuctx,
-		struct perf_counter_context *ctx)
-{
-	struct perf_counter *counter;
-
-	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return;
-
-	counter_sched_out(group_counter, cpuctx, ctx);
-
-	/*
-	 * Schedule out siblings (if any):
-	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, group_entry)
-		counter_sched_out(counter, cpuctx, ctx);
-
-	if (group_counter->attr.exclusive)
-		cpuctx->exclusive = 0;
-}
-
-/*
- * Cross CPU call to remove a performance counter
- *
- * We disable the counter on the hardware level first. After that we
- * remove it from the context list.
- */
-static void __perf_counter_remove_from_context(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter *counter = info;
-	struct perf_counter_context *ctx = counter->ctx;
-
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
-
-	spin_lock(&ctx->lock);
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * counters on a global level.
-	 */
-	perf_disable();
-
-	counter_sched_out(counter, cpuctx, ctx);
-
-	list_del_counter(counter, ctx);
-
-	if (!ctx->task) {
-		/*
-		 * Allow more per task counters with respect to the
-		 * reservation:
-		 */
-		cpuctx->max_pertask =
-			min(perf_max_counters - ctx->nr_counters,
-			    perf_max_counters - perf_reserved_percpu);
-	}
-
-	perf_enable();
-	spin_unlock(&ctx->lock);
-}
-
-
-/*
- * Remove the counter from a task's (or a CPU's) list of counters.
- *
- * Must be called with ctx->mutex held.
- *
- * CPU counters are removed with a smp call. For task counters we only
- * call when the task is on a CPU.
- *
- * If counter->ctx is a cloned context, callers must make sure that
- * every task struct that counter->ctx->task could possibly point to
- * remains valid.  This is OK when called from perf_release since
- * that only calls us on the top-level context, which can't be a clone.
- * When called from perf_counter_exit_task, it's OK because the
- * context has been detached from its task.
- */
-static void perf_counter_remove_from_context(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		/*
-		 * Per cpu counters are removed via an smp call and
-		 * the removal is always sucessful.
-		 */
-		smp_call_function_single(counter->cpu,
-					 __perf_counter_remove_from_context,
-					 counter, 1);
-		return;
-	}
-
-retry:
-	task_oncpu_function_call(task, __perf_counter_remove_from_context,
-				 counter);
-
-	spin_lock_irq(&ctx->lock);
-	/*
-	 * If the context is active we need to retry the smp call.
-	 */
-	if (ctx->nr_active && !list_empty(&counter->group_entry)) {
-		spin_unlock_irq(&ctx->lock);
-		goto retry;
-	}
-
-	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can remove the counter safely, if the call above did not
-	 * succeed.
-	 */
-	if (!list_empty(&counter->group_entry)) {
-		list_del_counter(counter, ctx);
-	}
-	spin_unlock_irq(&ctx->lock);
-}
-
-static inline u64 perf_clock(void)
-{
-	return cpu_clock(smp_processor_id());
-}
-
-/*
- * Update the record of the current time in a context.
- */
-static void update_context_time(struct perf_counter_context *ctx)
-{
-	u64 now = perf_clock();
-
-	ctx->time += now - ctx->timestamp;
-	ctx->timestamp = now;
-}
-
-/*
- * Update the total_time_enabled and total_time_running fields for a counter.
- */
-static void update_counter_times(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	u64 run_end;
-
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-	    counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
-		return;
-
-	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
-
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
-		run_end = counter->tstamp_stopped;
-	else
-		run_end = ctx->time;
-
-	counter->total_time_running = run_end - counter->tstamp_running;
-}
-
-/*
- * Update total_time_enabled and total_time_running for all counters in a group.
- */
-static void update_group_times(struct perf_counter *leader)
-{
-	struct perf_counter *counter;
-
-	update_counter_times(leader);
-	list_for_each_entry(counter, &leader->sibling_list, group_entry)
-		update_counter_times(counter);
-}
-
-/*
- * Cross CPU call to disable a performance counter
- */
-static void __perf_counter_disable(void *info)
-{
-	struct perf_counter *counter = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter_context *ctx = counter->ctx;
-
-	/*
-	 * If this is a per-task counter, need to check whether this
-	 * counter's task is the current task on this cpu.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
-
-	spin_lock(&ctx->lock);
-
-	/*
-	 * If the counter is on, turn it off.
-	 * If it is in error state, leave it in error state.
-	 */
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
-		update_context_time(ctx);
-		update_group_times(counter);
-		if (counter == counter->group_leader)
-			group_sched_out(counter, cpuctx, ctx);
-		else
-			counter_sched_out(counter, cpuctx, ctx);
-		counter->state = PERF_COUNTER_STATE_OFF;
-	}
-
-	spin_unlock(&ctx->lock);
-}
-
-/*
- * Disable a counter.
- *
- * If counter->ctx is a cloned context, callers must make sure that
- * every task struct that counter->ctx->task could possibly point to
- * remains valid.  This condition is satisifed when called through
- * perf_counter_for_each_child or perf_counter_for_each because they
- * hold the top-level counter's child_mutex, so any descendant that
- * goes to exit will block in sync_child_counter.
- * When called from perf_pending_counter it's OK because counter->ctx
- * is the current context on this CPU and preemption is disabled,
- * hence we can't get into perf_counter_task_sched_out for this context.
- */
-static void perf_counter_disable(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		/*
-		 * Disable the counter on the cpu that it's on
-		 */
-		smp_call_function_single(counter->cpu, __perf_counter_disable,
-					 counter, 1);
-		return;
-	}
-
- retry:
-	task_oncpu_function_call(task, __perf_counter_disable, counter);
-
-	spin_lock_irq(&ctx->lock);
-	/*
-	 * If the counter is still active, we need to retry the cross-call.
-	 */
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		spin_unlock_irq(&ctx->lock);
-		goto retry;
-	}
-
-	/*
-	 * Since we have the lock this context can't be scheduled
-	 * in, so we can change the state safely.
-	 */
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-		update_group_times(counter);
-		counter->state = PERF_COUNTER_STATE_OFF;
-	}
-
-	spin_unlock_irq(&ctx->lock);
-}
-
-static int
-counter_sched_in(struct perf_counter *counter,
-		 struct perf_cpu_context *cpuctx,
-		 struct perf_counter_context *ctx,
-		 int cpu)
-{
-	if (counter->state <= PERF_COUNTER_STATE_OFF)
-		return 0;
-
-	counter->state = PERF_COUNTER_STATE_ACTIVE;
-	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
-	/*
-	 * The new state must be visible before we turn it on in the hardware:
-	 */
-	smp_wmb();
-
-	if (counter->pmu->enable(counter)) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->oncpu = -1;
-		return -EAGAIN;
-	}
-
-	counter->tstamp_running += ctx->time - counter->tstamp_stopped;
-
-	if (!is_software_counter(counter))
-		cpuctx->active_oncpu++;
-	ctx->nr_active++;
-
-	if (counter->attr.exclusive)
-		cpuctx->exclusive = 1;
-
-	return 0;
-}
-
-static int
-group_sched_in(struct perf_counter *group_counter,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_counter_context *ctx,
-	       int cpu)
-{
-	struct perf_counter *counter, *partial_group;
-	int ret;
-
-	if (group_counter->state == PERF_COUNTER_STATE_OFF)
-		return 0;
-
-	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
-	if (ret)
-		return ret < 0 ? ret : 0;
-
-	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
-		return -EAGAIN;
-
-	/*
-	 * Schedule in siblings as one group (if any):
-	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, group_entry) {
-		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
-			partial_group = counter;
-			goto group_error;
-		}
-	}
-
-	return 0;
-
-group_error:
-	/*
-	 * Groups can be scheduled in as one unit only, so undo any
-	 * partial group before returning:
-	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, group_entry) {
-		if (counter == partial_group)
-			break;
-		counter_sched_out(counter, cpuctx, ctx);
-	}
-	counter_sched_out(group_counter, cpuctx, ctx);
-
-	return -EAGAIN;
-}
-
-/*
- * Return 1 for a group consisting entirely of software counters,
- * 0 if the group contains any hardware counters.
- */
-static int is_software_only_group(struct perf_counter *leader)
-{
-	struct perf_counter *counter;
-
-	if (!is_software_counter(leader))
-		return 0;
-
-	list_for_each_entry(counter, &leader->sibling_list, group_entry)
-		if (!is_software_counter(counter))
-			return 0;
-
-	return 1;
-}
-
-/*
- * Work out whether we can put this counter group on the CPU now.
- */
-static int group_can_go_on(struct perf_counter *counter,
-			   struct perf_cpu_context *cpuctx,
-			   int can_add_hw)
-{
-	/*
-	 * Groups consisting entirely of software counters can always go on.
-	 */
-	if (is_software_only_group(counter))
-		return 1;
-	/*
-	 * If an exclusive group is already on, no other hardware
-	 * counters can go on.
-	 */
-	if (cpuctx->exclusive)
-		return 0;
-	/*
-	 * If this group is exclusive and there are already
-	 * counters on the CPU, it can't go on.
-	 */
-	if (counter->attr.exclusive && cpuctx->active_oncpu)
-		return 0;
-	/*
-	 * Otherwise, try to add it if all previous groups were able
-	 * to go on.
-	 */
-	return can_add_hw;
-}
-
-static void add_counter_to_ctx(struct perf_counter *counter,
-			       struct perf_counter_context *ctx)
-{
-	list_add_counter(counter, ctx);
-	counter->tstamp_enabled = ctx->time;
-	counter->tstamp_running = ctx->time;
-	counter->tstamp_stopped = ctx->time;
-}
-
-/*
- * Cross CPU call to install and enable a performance counter
- *
- * Must be called with ctx->mutex held
- */
-static void __perf_install_in_context(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter *counter = info;
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_counter *leader = counter->group_leader;
-	int cpu = smp_processor_id();
-	int err;
-
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 * Or possibly this is the right context but it isn't
-	 * on this cpu because it had no counters.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
-
-	spin_lock(&ctx->lock);
-	ctx->is_active = 1;
-	update_context_time(ctx);
-
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * counters on a global level. NOP for non NMI based counters.
-	 */
-	perf_disable();
-
-	add_counter_to_ctx(counter, ctx);
-
-	/*
-	 * Don't put the counter on if it is disabled or if
-	 * it is in a group and the group isn't on.
-	 */
-	if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
-	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
-		goto unlock;
-
-	/*
-	 * An exclusive counter can't go on if there are already active
-	 * hardware counters, and no hardware counter can go on if there
-	 * is already an exclusive counter on.
-	 */
-	if (!group_can_go_on(counter, cpuctx, 1))
-		err = -EEXIST;
-	else
-		err = counter_sched_in(counter, cpuctx, ctx, cpu);
-
-	if (err) {
-		/*
-		 * This counter couldn't go on.  If it is in a group
-		 * then we have to pull the whole group off.
-		 * If the counter group is pinned then put it in error state.
-		 */
-		if (leader != counter)
-			group_sched_out(leader, cpuctx, ctx);
-		if (leader->attr.pinned) {
-			update_group_times(leader);
-			leader->state = PERF_COUNTER_STATE_ERROR;
-		}
-	}
-
-	if (!err && !ctx->task && cpuctx->max_pertask)
-		cpuctx->max_pertask--;
-
- unlock:
-	perf_enable();
-
-	spin_unlock(&ctx->lock);
-}
-
-/*
- * Attach a performance counter to a context
- *
- * First we add the counter to the list with the hardware enable bit
- * in counter->hw_config cleared.
- *
- * If the counter is attached to a task which is on a CPU we use a smp
- * call to enable it in the task context. The task might have been
- * scheduled away, but we check this in the smp call again.
- *
- * Must be called with ctx->mutex held.
- */
-static void
-perf_install_in_context(struct perf_counter_context *ctx,
-			struct perf_counter *counter,
-			int cpu)
-{
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		/*
-		 * Per cpu counters are installed via an smp call and
-		 * the install is always sucessful.
-		 */
-		smp_call_function_single(cpu, __perf_install_in_context,
-					 counter, 1);
-		return;
-	}
-
-retry:
-	task_oncpu_function_call(task, __perf_install_in_context,
-				 counter);
-
-	spin_lock_irq(&ctx->lock);
-	/*
-	 * we need to retry the smp call.
-	 */
-	if (ctx->is_active && list_empty(&counter->group_entry)) {
-		spin_unlock_irq(&ctx->lock);
-		goto retry;
-	}
-
-	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can add the counter safely, if it the call above did not
-	 * succeed.
-	 */
-	if (list_empty(&counter->group_entry))
-		add_counter_to_ctx(counter, ctx);
-	spin_unlock_irq(&ctx->lock);
-}
-
-/*
- * Put a counter into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_counter_mark_enabled(struct perf_counter *counter,
-					struct perf_counter_context *ctx)
-{
-	struct perf_counter *sub;
-
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
-	list_for_each_entry(sub, &counter->sibling_list, group_entry)
-		if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
-			sub->tstamp_enabled =
-				ctx->time - sub->total_time_enabled;
-}
-
-/*
- * Cross CPU call to enable a performance counter
- */
-static void __perf_counter_enable(void *info)
-{
-	struct perf_counter *counter = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_counter *leader = counter->group_leader;
-	int err;
-
-	/*
-	 * If this is a per-task counter, need to check whether this
-	 * counter's task is the current task on this cpu.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
-
-	spin_lock(&ctx->lock);
-	ctx->is_active = 1;
-	update_context_time(ctx);
-
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		goto unlock;
-	__perf_counter_mark_enabled(counter, ctx);
-
-	/*
-	 * If the counter is in a group and isn't the group leader,
-	 * then don't put it on unless the group is on.
-	 */
-	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
-		goto unlock;
-
-	if (!group_can_go_on(counter, cpuctx, 1)) {
-		err = -EEXIST;
-	} else {
-		perf_disable();
-		if (counter == leader)
-			err = group_sched_in(counter, cpuctx, ctx,
-					     smp_processor_id());
-		else
-			err = counter_sched_in(counter, cpuctx, ctx,
-					       smp_processor_id());
-		perf_enable();
-	}
-
-	if (err) {
-		/*
-		 * If this counter can't go on and it's part of a
-		 * group, then the whole group has to come off.
-		 */
-		if (leader != counter)
-			group_sched_out(leader, cpuctx, ctx);
-		if (leader->attr.pinned) {
-			update_group_times(leader);
-			leader->state = PERF_COUNTER_STATE_ERROR;
-		}
-	}
-
- unlock:
-	spin_unlock(&ctx->lock);
-}
-
-/*
- * Enable a counter.
- *
- * If counter->ctx is a cloned context, callers must make sure that
- * every task struct that counter->ctx->task could possibly point to
- * remains valid.  This condition is satisfied when called through
- * perf_counter_for_each_child or perf_counter_for_each as described
- * for perf_counter_disable.
- */
-static void perf_counter_enable(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		/*
-		 * Enable the counter on the cpu that it's on
-		 */
-		smp_call_function_single(counter->cpu, __perf_counter_enable,
-					 counter, 1);
-		return;
-	}
-
-	spin_lock_irq(&ctx->lock);
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		goto out;
-
-	/*
-	 * If the counter is in error state, clear that first.
-	 * That way, if we see the counter in error state below, we
-	 * know that it has gone back into error state, as distinct
-	 * from the task having been scheduled away before the
-	 * cross-call arrived.
-	 */
-	if (counter->state == PERF_COUNTER_STATE_ERROR)
-		counter->state = PERF_COUNTER_STATE_OFF;
-
- retry:
-	spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_counter_enable, counter);
-
-	spin_lock_irq(&ctx->lock);
-
-	/*
-	 * If the context is active and the counter is still off,
-	 * we need to retry the cross-call.
-	 */
-	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
-		goto retry;
-
-	/*
-	 * Since we have the lock this context can't be scheduled
-	 * in, so we can change the state safely.
-	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF)
-		__perf_counter_mark_enabled(counter, ctx);
-
- out:
-	spin_unlock_irq(&ctx->lock);
-}
-
-static int perf_counter_refresh(struct perf_counter *counter, int refresh)
-{
-	/*
-	 * not supported on inherited counters
-	 */
-	if (counter->attr.inherit)
-		return -EINVAL;
-
-	atomic_add(refresh, &counter->event_limit);
-	perf_counter_enable(counter);
-
-	return 0;
-}
-
-void __perf_counter_sched_out(struct perf_counter_context *ctx,
-			      struct perf_cpu_context *cpuctx)
-{
-	struct perf_counter *counter;
-
-	spin_lock(&ctx->lock);
-	ctx->is_active = 0;
-	if (likely(!ctx->nr_counters))
-		goto out;
-	update_context_time(ctx);
-
-	perf_disable();
-	if (ctx->nr_active) {
-		list_for_each_entry(counter, &ctx->group_list, group_entry) {
-			if (counter != counter->group_leader)
-				counter_sched_out(counter, cpuctx, ctx);
-			else
-				group_sched_out(counter, cpuctx, ctx);
-		}
-	}
-	perf_enable();
- out:
-	spin_unlock(&ctx->lock);
-}
-
-/*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled counters.
- * If the number of enabled counters is the same, then the set
- * of enabled counters should be the same, because these are both
- * inherited contexts, therefore we can't access individual counters
- * in them directly with an fd; we can only enable/disable all
- * counters via prctl, or enable/disable all counters in a family
- * via ioctl, which will have the same effect on both contexts.
- */
-static int context_equiv(struct perf_counter_context *ctx1,
-			 struct perf_counter_context *ctx2)
-{
-	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen
-		&& !ctx1->pin_count && !ctx2->pin_count;
-}
-
-static void __perf_counter_read(void *counter);
-
-static void __perf_counter_sync_stat(struct perf_counter *counter,
-				     struct perf_counter *next_counter)
-{
-	u64 value;
-
-	if (!counter->attr.inherit_stat)
-		return;
-
-	/*
-	 * Update the counter value, we cannot use perf_counter_read()
-	 * because we're in the middle of a context switch and have IRQs
-	 * disabled, which upsets smp_call_function_single(), however
-	 * we know the counter must be on the current CPU, therefore we
-	 * don't need to use it.
-	 */
-	switch (counter->state) {
-	case PERF_COUNTER_STATE_ACTIVE:
-		__perf_counter_read(counter);
-		break;
-
-	case PERF_COUNTER_STATE_INACTIVE:
-		update_counter_times(counter);
-		break;
-
-	default:
-		break;
-	}
-
-	/*
-	 * In order to keep per-task stats reliable we need to flip the counter
-	 * values when we flip the contexts.
-	 */
-	value = atomic64_read(&next_counter->count);
-	value = atomic64_xchg(&counter->count, value);
-	atomic64_set(&next_counter->count, value);
-
-	swap(counter->total_time_enabled, next_counter->total_time_enabled);
-	swap(counter->total_time_running, next_counter->total_time_running);
-
-	/*
-	 * Since we swizzled the values, update the user visible data too.
-	 */
-	perf_counter_update_userpage(counter);
-	perf_counter_update_userpage(next_counter);
-}
-
-#define list_next_entry(pos, member) \
-	list_entry(pos->member.next, typeof(*pos), member)
-
-static void perf_counter_sync_stat(struct perf_counter_context *ctx,
-				   struct perf_counter_context *next_ctx)
-{
-	struct perf_counter *counter, *next_counter;
-
-	if (!ctx->nr_stat)
-		return;
-
-	counter = list_first_entry(&ctx->event_list,
-				   struct perf_counter, event_entry);
-
-	next_counter = list_first_entry(&next_ctx->event_list,
-					struct perf_counter, event_entry);
-
-	while (&counter->event_entry != &ctx->event_list &&
-	       &next_counter->event_entry != &next_ctx->event_list) {
-
-		__perf_counter_sync_stat(counter, next_counter);
-
-		counter = list_next_entry(counter, event_entry);
-		next_counter = list_next_entry(next_counter, event_entry);
-	}
-}
-
-/*
- * Called from scheduler to remove the counters of the current task,
- * with interrupts disabled.
- *
- * We stop each counter and update the counter value in counter->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * not restart the counter.
- */
-void perf_counter_task_sched_out(struct task_struct *task,
-				 struct task_struct *next, int cpu)
-{
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = task->perf_counter_ctxp;
-	struct perf_counter_context *next_ctx;
-	struct perf_counter_context *parent;
-	struct pt_regs *regs;
-	int do_switch = 1;
-
-	regs = task_pt_regs(task);
-	perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
-
-	if (likely(!ctx || !cpuctx->task_ctx))
-		return;
-
-	update_context_time(ctx);
-
-	rcu_read_lock();
-	parent = rcu_dereference(ctx->parent_ctx);
-	next_ctx = next->perf_counter_ctxp;
-	if (parent && next_ctx &&
-	    rcu_dereference(next_ctx->parent_ctx) == parent) {
-		/*
-		 * Looks like the two contexts are clones, so we might be
-		 * able to optimize the context switch.  We lock both
-		 * contexts and check that they are clones under the
-		 * lock (including re-checking that neither has been
-		 * uncloned in the meantime).  It doesn't matter which
-		 * order we take the locks because no other cpu could
-		 * be trying to lock both of these tasks.
-		 */
-		spin_lock(&ctx->lock);
-		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
-		if (context_equiv(ctx, next_ctx)) {
-			/*
-			 * XXX do we need a memory barrier of sorts
-			 * wrt to rcu_dereference() of perf_counter_ctxp
-			 */
-			task->perf_counter_ctxp = next_ctx;
-			next->perf_counter_ctxp = ctx;
-			ctx->task = next;
-			next_ctx->task = task;
-			do_switch = 0;
-
-			perf_counter_sync_stat(ctx, next_ctx);
-		}
-		spin_unlock(&next_ctx->lock);
-		spin_unlock(&ctx->lock);
-	}
-	rcu_read_unlock();
-
-	if (do_switch) {
-		__perf_counter_sched_out(ctx, cpuctx);
-		cpuctx->task_ctx = NULL;
-	}
-}
-
-/*
- * Called with IRQs disabled
- */
-static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-
-	if (!cpuctx->task_ctx)
-		return;
-
-	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
-		return;
-
-	__perf_counter_sched_out(ctx, cpuctx);
-	cpuctx->task_ctx = NULL;
-}
-
-/*
- * Called with IRQs disabled
- */
-static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
-{
-	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
-}
-
-static void
-__perf_counter_sched_in(struct perf_counter_context *ctx,
-			struct perf_cpu_context *cpuctx, int cpu)
-{
-	struct perf_counter *counter;
-	int can_add_hw = 1;
-
-	spin_lock(&ctx->lock);
-	ctx->is_active = 1;
-	if (likely(!ctx->nr_counters))
-		goto out;
-
-	ctx->timestamp = perf_clock();
-
-	perf_disable();
-
-	/*
-	 * First go through the list and put on any pinned groups
-	 * in order to give them the best chance of going on.
-	 */
-	list_for_each_entry(counter, &ctx->group_list, group_entry) {
-		if (counter->state <= PERF_COUNTER_STATE_OFF ||
-		    !counter->attr.pinned)
-			continue;
-		if (counter->cpu != -1 && counter->cpu != cpu)
-			continue;
-
-		if (counter != counter->group_leader)
-			counter_sched_in(counter, cpuctx, ctx, cpu);
-		else {
-			if (group_can_go_on(counter, cpuctx, 1))
-				group_sched_in(counter, cpuctx, ctx, cpu);
-		}
-
-		/*
-		 * If this pinned group hasn't been scheduled,
-		 * put it in error state.
-		 */
-		if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-			update_group_times(counter);
-			counter->state = PERF_COUNTER_STATE_ERROR;
-		}
-	}
-
-	list_for_each_entry(counter, &ctx->group_list, group_entry) {
-		/*
-		 * Ignore counters in OFF or ERROR state, and
-		 * ignore pinned counters since we did them already.
-		 */
-		if (counter->state <= PERF_COUNTER_STATE_OFF ||
-		    counter->attr.pinned)
-			continue;
-
-		/*
-		 * Listen to the 'cpu' scheduling filter constraint
-		 * of counters:
-		 */
-		if (counter->cpu != -1 && counter->cpu != cpu)
-			continue;
-
-		if (counter != counter->group_leader) {
-			if (counter_sched_in(counter, cpuctx, ctx, cpu))
-				can_add_hw = 0;
-		} else {
-			if (group_can_go_on(counter, cpuctx, can_add_hw)) {
-				if (group_sched_in(counter, cpuctx, ctx, cpu))
-					can_add_hw = 0;
-			}
-		}
-	}
-	perf_enable();
- out:
-	spin_unlock(&ctx->lock);
-}
-
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
-{
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = task->perf_counter_ctxp;
-
-	if (likely(!ctx))
-		return;
-	if (cpuctx->task_ctx == ctx)
-		return;
-	__perf_counter_sched_in(ctx, cpuctx, cpu);
-	cpuctx->task_ctx = ctx;
-}
-
-static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
-{
-	struct perf_counter_context *ctx = &cpuctx->ctx;
-
-	__perf_counter_sched_in(ctx, cpuctx, cpu);
-}
-
-#define MAX_INTERRUPTS (~0ULL)
-
-static void perf_log_throttle(struct perf_counter *counter, int enable);
-
-static void perf_adjust_period(struct perf_counter *counter, u64 events)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	u64 period, sample_period;
-	s64 delta;
-
-	events *= hwc->sample_period;
-	period = div64_u64(events, counter->attr.sample_freq);
-
-	delta = (s64)(period - hwc->sample_period);
-	delta = (delta + 7) / 8; /* low pass filter */
-
-	sample_period = hwc->sample_period + delta;
-
-	if (!sample_period)
-		sample_period = 1;
-
-	hwc->sample_period = sample_period;
-}
-
-static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
-{
-	struct perf_counter *counter;
-	struct hw_perf_counter *hwc;
-	u64 interrupts, freq;
-
-	spin_lock(&ctx->lock);
-	list_for_each_entry(counter, &ctx->group_list, group_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-			continue;
-
-		hwc = &counter->hw;
-
-		interrupts = hwc->interrupts;
-		hwc->interrupts = 0;
-
-		/*
-		 * unthrottle counters on the tick
-		 */
-		if (interrupts == MAX_INTERRUPTS) {
-			perf_log_throttle(counter, 1);
-			counter->pmu->unthrottle(counter);
-			interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
-		}
-
-		if (!counter->attr.freq || !counter->attr.sample_freq)
-			continue;
-
-		/*
-		 * if the specified freq < HZ then we need to skip ticks
-		 */
-		if (counter->attr.sample_freq < HZ) {
-			freq = counter->attr.sample_freq;
-
-			hwc->freq_count += freq;
-			hwc->freq_interrupts += interrupts;
-
-			if (hwc->freq_count < HZ)
-				continue;
-
-			interrupts = hwc->freq_interrupts;
-			hwc->freq_interrupts = 0;
-			hwc->freq_count -= HZ;
-		} else
-			freq = HZ;
-
-		perf_adjust_period(counter, freq * interrupts);
-
-		/*
-		 * In order to avoid being stalled by an (accidental) huge
-		 * sample period, force reset the sample period if we didn't
-		 * get any events in this freq period.
-		 */
-		if (!interrupts) {
-			perf_disable();
-			counter->pmu->disable(counter);
-			atomic64_set(&hwc->period_left, 0);
-			counter->pmu->enable(counter);
-			perf_enable();
-		}
-	}
-	spin_unlock(&ctx->lock);
-}
-
-/*
- * Round-robin a context's counters:
- */
-static void rotate_ctx(struct perf_counter_context *ctx)
-{
-	struct perf_counter *counter;
-
-	if (!ctx->nr_counters)
-		return;
-
-	spin_lock(&ctx->lock);
-	/*
-	 * Rotate the first entry last (works just fine for group counters too):
-	 */
-	perf_disable();
-	list_for_each_entry(counter, &ctx->group_list, group_entry) {
-		list_move_tail(&counter->group_entry, &ctx->group_list);
-		break;
-	}
-	perf_enable();
-
-	spin_unlock(&ctx->lock);
-}
-
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
-
-	if (!atomic_read(&nr_counters))
-		return;
-
-	cpuctx = &per_cpu(perf_cpu_context, cpu);
-	ctx = curr->perf_counter_ctxp;
-
-	perf_ctx_adjust_freq(&cpuctx->ctx);
-	if (ctx)
-		perf_ctx_adjust_freq(ctx);
-
-	perf_counter_cpu_sched_out(cpuctx);
-	if (ctx)
-		__perf_counter_task_sched_out(ctx);
-
-	rotate_ctx(&cpuctx->ctx);
-	if (ctx)
-		rotate_ctx(ctx);
-
-	perf_counter_cpu_sched_in(cpuctx, cpu);
-	if (ctx)
-		perf_counter_task_sched_in(curr, cpu);
-}
-
-/*
- * Enable all of a task's counters that have been marked enable-on-exec.
- * This expects task == current.
- */
-static void perf_counter_enable_on_exec(struct task_struct *task)
-{
-	struct perf_counter_context *ctx;
-	struct perf_counter *counter;
-	unsigned long flags;
-	int enabled = 0;
-
-	local_irq_save(flags);
-	ctx = task->perf_counter_ctxp;
-	if (!ctx || !ctx->nr_counters)
-		goto out;
-
-	__perf_counter_task_sched_out(ctx);
-
-	spin_lock(&ctx->lock);
-
-	list_for_each_entry(counter, &ctx->group_list, group_entry) {
-		if (!counter->attr.enable_on_exec)
-			continue;
-		counter->attr.enable_on_exec = 0;
-		if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-			continue;
-		__perf_counter_mark_enabled(counter, ctx);
-		enabled = 1;
-	}
-
-	/*
-	 * Unclone this context if we enabled any counter.
-	 */
-	if (enabled)
-		unclone_ctx(ctx);
-
-	spin_unlock(&ctx->lock);
-
-	perf_counter_task_sched_in(task, smp_processor_id());
- out:
-	local_irq_restore(flags);
-}
-
-/*
- * Cross CPU call to read the hardware counter
- */
-static void __perf_counter_read(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter *counter = info;
-	struct perf_counter_context *ctx = counter->ctx;
-	unsigned long flags;
-
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu.  If not it has been
-	 * scheduled out before the smp call arrived.  In that case
-	 * counter->count would have been updated to a recent sample
-	 * when the counter was scheduled out.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
-
-	local_irq_save(flags);
-	if (ctx->is_active)
-		update_context_time(ctx);
-	counter->pmu->read(counter);
-	update_counter_times(counter);
-	local_irq_restore(flags);
-}
-
-static u64 perf_counter_read(struct perf_counter *counter)
-{
-	/*
-	 * If counter is enabled and currently active on a CPU, update the
-	 * value in the counter structure:
-	 */
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		smp_call_function_single(counter->oncpu,
-					 __perf_counter_read, counter, 1);
-	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-		update_counter_times(counter);
-	}
-
-	return atomic64_read(&counter->count);
-}
-
-/*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-			    struct task_struct *task)
-{
-	memset(ctx, 0, sizeof(*ctx));
-	spin_lock_init(&ctx->lock);
-	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->group_list);
-	INIT_LIST_HEAD(&ctx->event_list);
-	atomic_set(&ctx->refcount, 1);
-	ctx->task = task;
-}
-
-static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
-{
-	struct perf_counter_context *ctx;
-	struct perf_cpu_context *cpuctx;
-	struct task_struct *task;
-	unsigned long flags;
-	int err;
-
-	/*
-	 * If cpu is not a wildcard then this is a percpu counter:
-	 */
-	if (cpu != -1) {
-		/* Must be root to operate on a CPU counter: */
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-			return ERR_PTR(-EACCES);
-
-		if (cpu < 0 || cpu > num_possible_cpus())
-			return ERR_PTR(-EINVAL);
-
-		/*
-		 * We could be clever and allow to attach a counter to an
-		 * offline CPU and activate it when the CPU comes up, but
-		 * that's for later.
-		 */
-		if (!cpu_isset(cpu, cpu_online_map))
-			return ERR_PTR(-ENODEV);
-
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		ctx = &cpuctx->ctx;
-		get_ctx(ctx);
-
-		return ctx;
-	}
-
-	rcu_read_lock();
-	if (!pid)
-		task = current;
-	else
-		task = find_task_by_vpid(pid);
-	if (task)
-		get_task_struct(task);
-	rcu_read_unlock();
-
-	if (!task)
-		return ERR_PTR(-ESRCH);
-
-	/*
-	 * Can't attach counters to a dying task.
-	 */
-	err = -ESRCH;
-	if (task->flags & PF_EXITING)
-		goto errout;
-
-	/* Reuse ptrace permission checks for now. */
-	err = -EACCES;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto errout;
-
- retry:
-	ctx = perf_lock_task_context(task, &flags);
-	if (ctx) {
-		unclone_ctx(ctx);
-		spin_unlock_irqrestore(&ctx->lock, flags);
-	}
-
-	if (!ctx) {
-		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-		err = -ENOMEM;
-		if (!ctx)
-			goto errout;
-		__perf_counter_init_context(ctx, task);
-		get_ctx(ctx);
-		if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
-			/*
-			 * We raced with some other task; use
-			 * the context they set.
-			 */
-			kfree(ctx);
-			goto retry;
-		}
-		get_task_struct(task);
-	}
-
-	put_task_struct(task);
-	return ctx;
-
- errout:
-	put_task_struct(task);
-	return ERR_PTR(err);
-}
-
-static void free_counter_rcu(struct rcu_head *head)
-{
-	struct perf_counter *counter;
-
-	counter = container_of(head, struct perf_counter, rcu_head);
-	if (counter->ns)
-		put_pid_ns(counter->ns);
-	kfree(counter);
-}
-
-static void perf_pending_sync(struct perf_counter *counter);
-
-static void free_counter(struct perf_counter *counter)
-{
-	perf_pending_sync(counter);
-
-	if (!counter->parent) {
-		atomic_dec(&nr_counters);
-		if (counter->attr.mmap)
-			atomic_dec(&nr_mmap_counters);
-		if (counter->attr.comm)
-			atomic_dec(&nr_comm_counters);
-		if (counter->attr.task)
-			atomic_dec(&nr_task_counters);
-	}
-
-	if (counter->output) {
-		fput(counter->output->filp);
-		counter->output = NULL;
-	}
-
-	if (counter->destroy)
-		counter->destroy(counter);
-
-	put_ctx(counter->ctx);
-	call_rcu(&counter->rcu_head, free_counter_rcu);
-}
-
-/*
- * Called when the last reference to the file is gone.
- */
-static int perf_release(struct inode *inode, struct file *file)
-{
-	struct perf_counter *counter = file->private_data;
-	struct perf_counter_context *ctx = counter->ctx;
-
-	file->private_data = NULL;
-
-	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	perf_counter_remove_from_context(counter);
-	mutex_unlock(&ctx->mutex);
-
-	mutex_lock(&counter->owner->perf_counter_mutex);
-	list_del_init(&counter->owner_entry);
-	mutex_unlock(&counter->owner->perf_counter_mutex);
-	put_task_struct(counter->owner);
-
-	free_counter(counter);
-
-	return 0;
-}
-
-static int perf_counter_read_size(struct perf_counter *counter)
-{
-	int entry = sizeof(u64); /* value */
-	int size = 0;
-	int nr = 1;
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		size += sizeof(u64);
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		size += sizeof(u64);
-
-	if (counter->attr.read_format & PERF_FORMAT_ID)
-		entry += sizeof(u64);
-
-	if (counter->attr.read_format & PERF_FORMAT_GROUP) {
-		nr += counter->group_leader->nr_siblings;
-		size += sizeof(u64);
-	}
-
-	size += entry * nr;
-
-	return size;
-}
-
-static u64 perf_counter_read_value(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-	u64 total = 0;
-
-	total += perf_counter_read(counter);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		total += perf_counter_read(child);
-
-	return total;
-}
-
-static int perf_counter_read_entry(struct perf_counter *counter,
-				   u64 read_format, char __user *buf)
-{
-	int n = 0, count = 0;
-	u64 values[2];
-
-	values[n++] = perf_counter_read_value(counter);
-	if (read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
-
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
-}
-
-static int perf_counter_read_group(struct perf_counter *counter,
-				   u64 read_format, char __user *buf)
-{
-	struct perf_counter *leader = counter->group_leader, *sub;
-	int n = 0, size = 0, err = -EFAULT;
-	u64 values[3];
-
-	values[n++] = 1 + leader->nr_siblings;
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = leader->total_time_enabled +
-			atomic64_read(&leader->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = leader->total_time_running +
-			atomic64_read(&leader->child_total_time_running);
-	}
-
-	size = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, size))
-		return -EFAULT;
-
-	err = perf_counter_read_entry(leader, read_format, buf + size);
-	if (err < 0)
-		return err;
-
-	size += err;
-
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		err = perf_counter_read_entry(sub, read_format,
-				buf + size);
-		if (err < 0)
-			return err;
-
-		size += err;
-	}
-
-	return size;
-}
-
-static int perf_counter_read_one(struct perf_counter *counter,
-				 u64 read_format, char __user *buf)
-{
-	u64 values[4];
-	int n = 0;
-
-	values[n++] = perf_counter_read_value(counter);
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-	}
-	if (read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
-
-	if (copy_to_user(buf, values, n * sizeof(u64)))
-		return -EFAULT;
-
-	return n * sizeof(u64);
-}
-
-/*
- * Read the performance counter - simple non blocking version for now
- */
-static ssize_t
-perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
-{
-	u64 read_format = counter->attr.read_format;
-	int ret;
-
-	/*
-	 * Return end-of-file for a read on a counter that is in
-	 * error state (i.e. because it was pinned but it couldn't be
-	 * scheduled on to the CPU at some point).
-	 */
-	if (counter->state == PERF_COUNTER_STATE_ERROR)
-		return 0;
-
-	if (count < perf_counter_read_size(counter))
-		return -ENOSPC;
-
-	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	mutex_lock(&counter->child_mutex);
-	if (read_format & PERF_FORMAT_GROUP)
-		ret = perf_counter_read_group(counter, read_format, buf);
-	else
-		ret = perf_counter_read_one(counter, read_format, buf);
-	mutex_unlock(&counter->child_mutex);
-
-	return ret;
-}
-
-static ssize_t
-perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
-	struct perf_counter *counter = file->private_data;
-
-	return perf_read_hw(counter, buf, count);
-}
-
-static unsigned int perf_poll(struct file *file, poll_table *wait)
-{
-	struct perf_counter *counter = file->private_data;
-	struct perf_mmap_data *data;
-	unsigned int events = POLL_HUP;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (data)
-		events = atomic_xchg(&data->poll, 0);
-	rcu_read_unlock();
-
-	poll_wait(file, &counter->waitq, wait);
-
-	return events;
-}
-
-static void perf_counter_reset(struct perf_counter *counter)
-{
-	(void)perf_counter_read(counter);
-	atomic64_set(&counter->count, 0);
-	perf_counter_update_userpage(counter);
-}
-
-/*
- * Holding the top-level counter's child_mutex means that any
- * descendant process that has inherited this counter will block
- * in sync_child_counter if it goes to exit, thus satisfying the
- * task existence requirements of perf_counter_enable/disable.
- */
-static void perf_counter_for_each_child(struct perf_counter *counter,
-					void (*func)(struct perf_counter *))
-{
-	struct perf_counter *child;
-
-	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	mutex_lock(&counter->child_mutex);
-	func(counter);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		func(child);
-	mutex_unlock(&counter->child_mutex);
-}
-
-static void perf_counter_for_each(struct perf_counter *counter,
-				  void (*func)(struct perf_counter *))
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_counter *sibling;
-
-	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	counter = counter->group_leader;
-
-	perf_counter_for_each_child(counter, func);
-	func(counter);
-	list_for_each_entry(sibling, &counter->sibling_list, group_entry)
-		perf_counter_for_each_child(counter, func);
-	mutex_unlock(&ctx->mutex);
-}
-
-static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	unsigned long size;
-	int ret = 0;
-	u64 value;
-
-	if (!counter->attr.sample_period)
-		return -EINVAL;
-
-	size = copy_from_user(&value, arg, sizeof(value));
-	if (size != sizeof(value))
-		return -EFAULT;
-
-	if (!value)
-		return -EINVAL;
-
-	spin_lock_irq(&ctx->lock);
-	if (counter->attr.freq) {
-		if (value > sysctl_perf_counter_sample_rate) {
-			ret = -EINVAL;
-			goto unlock;
-		}
-
-		counter->attr.sample_freq = value;
-	} else {
-		counter->attr.sample_period = value;
-		counter->hw.sample_period = value;
-	}
-unlock:
-	spin_unlock_irq(&ctx->lock);
-
-	return ret;
-}
-
-int perf_counter_set_output(struct perf_counter *counter, int output_fd);
-
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	struct perf_counter *counter = file->private_data;
-	void (*func)(struct perf_counter *);
-	u32 flags = arg;
-
-	switch (cmd) {
-	case PERF_COUNTER_IOC_ENABLE:
-		func = perf_counter_enable;
-		break;
-	case PERF_COUNTER_IOC_DISABLE:
-		func = perf_counter_disable;
-		break;
-	case PERF_COUNTER_IOC_RESET:
-		func = perf_counter_reset;
-		break;
-
-	case PERF_COUNTER_IOC_REFRESH:
-		return perf_counter_refresh(counter, arg);
-
-	case PERF_COUNTER_IOC_PERIOD:
-		return perf_counter_period(counter, (u64 __user *)arg);
-
-	case PERF_COUNTER_IOC_SET_OUTPUT:
-		return perf_counter_set_output(counter, arg);
-
-	default:
-		return -ENOTTY;
-	}
-
-	if (flags & PERF_IOC_FLAG_GROUP)
-		perf_counter_for_each(counter, func);
-	else
-		perf_counter_for_each_child(counter, func);
-
-	return 0;
-}
-
-int perf_counter_task_enable(void)
-{
-	struct perf_counter *counter;
-
-	mutex_lock(&current->perf_counter_mutex);
-	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-		perf_counter_for_each_child(counter, perf_counter_enable);
-	mutex_unlock(&current->perf_counter_mutex);
-
-	return 0;
-}
-
-int perf_counter_task_disable(void)
-{
-	struct perf_counter *counter;
-
-	mutex_lock(&current->perf_counter_mutex);
-	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-		perf_counter_for_each_child(counter, perf_counter_disable);
-	mutex_unlock(&current->perf_counter_mutex);
-
-	return 0;
-}
-
-#ifndef PERF_COUNTER_INDEX_OFFSET
-# define PERF_COUNTER_INDEX_OFFSET 0
-#endif
-
-static int perf_counter_index(struct perf_counter *counter)
-{
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return 0;
-
-	return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
-}
-
-/*
- * Callers need to ensure there can be no nesting of this function, otherwise
- * the seqlock logic goes bad. We can not serialize this because the arch
- * code calls this from NMI context.
- */
-void perf_counter_update_userpage(struct perf_counter *counter)
-{
-	struct perf_counter_mmap_page *userpg;
-	struct perf_mmap_data *data;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (!data)
-		goto unlock;
-
-	userpg = data->user_page;
-
-	/*
-	 * Disable preemption so as to not let the corresponding user-space
-	 * spin too long if we get preempted.
-	 */
-	preempt_disable();
-	++userpg->lock;
-	barrier();
-	userpg->index = perf_counter_index(counter);
-	userpg->offset = atomic64_read(&counter->count);
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		userpg->offset -= atomic64_read(&counter->hw.prev_count);
-
-	userpg->time_enabled = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-
-	userpg->time_running = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-
-	barrier();
-	++userpg->lock;
-	preempt_enable();
-unlock:
-	rcu_read_unlock();
-}
-
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct perf_counter *counter = vma->vm_file->private_data;
-	struct perf_mmap_data *data;
-	int ret = VM_FAULT_SIGBUS;
-
-	if (vmf->flags & FAULT_FLAG_MKWRITE) {
-		if (vmf->pgoff == 0)
-			ret = 0;
-		return ret;
-	}
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (!data)
-		goto unlock;
-
-	if (vmf->pgoff == 0) {
-		vmf->page = virt_to_page(data->user_page);
-	} else {
-		int nr = vmf->pgoff - 1;
-
-		if ((unsigned)nr > data->nr_pages)
-			goto unlock;
-
-		if (vmf->flags & FAULT_FLAG_WRITE)
-			goto unlock;
-
-		vmf->page = virt_to_page(data->data_pages[nr]);
-	}
-
-	get_page(vmf->page);
-	vmf->page->mapping = vma->vm_file->f_mapping;
-	vmf->page->index   = vmf->pgoff;
-
-	ret = 0;
-unlock:
-	rcu_read_unlock();
-
-	return ret;
-}
-
-static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
-{
-	struct perf_mmap_data *data;
-	unsigned long size;
-	int i;
-
-	WARN_ON(atomic_read(&counter->mmap_count));
-
-	size = sizeof(struct perf_mmap_data);
-	size += nr_pages * sizeof(void *);
-
-	data = kzalloc(size, GFP_KERNEL);
-	if (!data)
-		goto fail;
-
-	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!data->user_page)
-		goto fail_user_page;
-
-	for (i = 0; i < nr_pages; i++) {
-		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
-		if (!data->data_pages[i])
-			goto fail_data_pages;
-	}
-
-	data->nr_pages = nr_pages;
-	atomic_set(&data->lock, -1);
-
-	if (counter->attr.watermark) {
-		data->watermark = min_t(long, PAGE_SIZE * nr_pages,
-				      counter->attr.wakeup_watermark);
-	}
-	if (!data->watermark)
-		data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
-
-	rcu_assign_pointer(counter->data, data);
-
-	return 0;
-
-fail_data_pages:
-	for (i--; i >= 0; i--)
-		free_page((unsigned long)data->data_pages[i]);
-
-	free_page((unsigned long)data->user_page);
-
-fail_user_page:
-	kfree(data);
-
-fail:
-	return -ENOMEM;
-}
-
-static void perf_mmap_free_page(unsigned long addr)
-{
-	struct page *page = virt_to_page((void *)addr);
-
-	page->mapping = NULL;
-	__free_page(page);
-}
-
-static void __perf_mmap_data_free(struct rcu_head *rcu_head)
-{
-	struct perf_mmap_data *data;
-	int i;
-
-	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
-
-	perf_mmap_free_page((unsigned long)data->user_page);
-	for (i = 0; i < data->nr_pages; i++)
-		perf_mmap_free_page((unsigned long)data->data_pages[i]);
-
-	kfree(data);
-}
-
-static void perf_mmap_data_free(struct perf_counter *counter)
-{
-	struct perf_mmap_data *data = counter->data;
-
-	WARN_ON(atomic_read(&counter->mmap_count));
-
-	rcu_assign_pointer(counter->data, NULL);
-	call_rcu(&data->rcu_head, __perf_mmap_data_free);
-}
-
-static void perf_mmap_open(struct vm_area_struct *vma)
-{
-	struct perf_counter *counter = vma->vm_file->private_data;
-
-	atomic_inc(&counter->mmap_count);
-}
-
-static void perf_mmap_close(struct vm_area_struct *vma)
-{
-	struct perf_counter *counter = vma->vm_file->private_data;
-
-	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
-		struct user_struct *user = current_user();
-
-		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
-		vma->vm_mm->locked_vm -= counter->data->nr_locked;
-		perf_mmap_data_free(counter);
-		mutex_unlock(&counter->mmap_mutex);
-	}
-}
-
-static struct vm_operations_struct perf_mmap_vmops = {
-	.open		= perf_mmap_open,
-	.close		= perf_mmap_close,
-	.fault		= perf_mmap_fault,
-	.page_mkwrite	= perf_mmap_fault,
-};
-
-static int perf_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct perf_counter *counter = file->private_data;
-	unsigned long user_locked, user_lock_limit;
-	struct user_struct *user = current_user();
-	unsigned long locked, lock_limit;
-	unsigned long vma_size;
-	unsigned long nr_pages;
-	long user_extra, extra;
-	int ret = 0;
-
-	if (!(vma->vm_flags & VM_SHARED))
-		return -EINVAL;
-
-	vma_size = vma->vm_end - vma->vm_start;
-	nr_pages = (vma_size / PAGE_SIZE) - 1;
-
-	/*
-	 * If we have data pages ensure they're a power-of-two number, so we
-	 * can do bitmasks instead of modulo.
-	 */
-	if (nr_pages != 0 && !is_power_of_2(nr_pages))
-		return -EINVAL;
-
-	if (vma_size != PAGE_SIZE * (1 + nr_pages))
-		return -EINVAL;
-
-	if (vma->vm_pgoff != 0)
-		return -EINVAL;
-
-	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	mutex_lock(&counter->mmap_mutex);
-	if (counter->output) {
-		ret = -EINVAL;
-		goto unlock;
-	}
-
-	if (atomic_inc_not_zero(&counter->mmap_count)) {
-		if (nr_pages != counter->data->nr_pages)
-			ret = -EINVAL;
-		goto unlock;
-	}
-
-	user_extra = nr_pages + 1;
-	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
-
-	/*
-	 * Increase the limit linearly with more CPUs:
-	 */
-	user_lock_limit *= num_online_cpus();
-
-	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
-	extra = 0;
-	if (user_locked > user_lock_limit)
-		extra = user_locked - user_lock_limit;
-
-	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-	lock_limit >>= PAGE_SHIFT;
-	locked = vma->vm_mm->locked_vm + extra;
-
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
-		!capable(CAP_IPC_LOCK)) {
-		ret = -EPERM;
-		goto unlock;
-	}
-
-	WARN_ON(counter->data);
-	ret = perf_mmap_data_alloc(counter, nr_pages);
-	if (ret)
-		goto unlock;
-
-	atomic_set(&counter->mmap_count, 1);
-	atomic_long_add(user_extra, &user->locked_vm);
-	vma->vm_mm->locked_vm += extra;
-	counter->data->nr_locked = extra;
-	if (vma->vm_flags & VM_WRITE)
-		counter->data->writable = 1;
-
-unlock:
-	mutex_unlock(&counter->mmap_mutex);
-
-	vma->vm_flags |= VM_RESERVED;
-	vma->vm_ops = &perf_mmap_vmops;
-
-	return ret;
-}
-
-static int perf_fasync(int fd, struct file *filp, int on)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct perf_counter *counter = filp->private_data;
-	int retval;
-
-	mutex_lock(&inode->i_mutex);
-	retval = fasync_helper(fd, filp, on, &counter->fasync);
-	mutex_unlock(&inode->i_mutex);
-
-	if (retval < 0)
-		return retval;
-
-	return 0;
-}
-
-static const struct file_operations perf_fops = {
-	.release		= perf_release,
-	.read			= perf_read,
-	.poll			= perf_poll,
-	.unlocked_ioctl		= perf_ioctl,
-	.compat_ioctl		= perf_ioctl,
-	.mmap			= perf_mmap,
-	.fasync			= perf_fasync,
-};
-
-/*
- * Perf counter wakeup
- *
- * If there's data, ensure we set the poll() state and publish everything
- * to user-space before waking everybody up.
- */
-
-void perf_counter_wakeup(struct perf_counter *counter)
-{
-	wake_up_all(&counter->waitq);
-
-	if (counter->pending_kill) {
-		kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
-		counter->pending_kill = 0;
-	}
-}
-
-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
-static void perf_pending_counter(struct perf_pending_entry *entry)
-{
-	struct perf_counter *counter = container_of(entry,
-			struct perf_counter, pending);
-
-	if (counter->pending_disable) {
-		counter->pending_disable = 0;
-		__perf_counter_disable(counter);
-	}
-
-	if (counter->pending_wakeup) {
-		counter->pending_wakeup = 0;
-		perf_counter_wakeup(counter);
-	}
-}
-
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-	PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
-			       void (*func)(struct perf_pending_entry *))
-{
-	struct perf_pending_entry **head;
-
-	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-		return;
-
-	entry->func = func;
-
-	head = &get_cpu_var(perf_pending_head);
-
-	do {
-		entry->next = *head;
-	} while (cmpxchg(head, entry->next, entry) != entry->next);
-
-	set_perf_counter_pending();
-
-	put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
-	struct perf_pending_entry *list;
-	int nr = 0;
-
-	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-	while (list != PENDING_TAIL) {
-		void (*func)(struct perf_pending_entry *);
-		struct perf_pending_entry *entry = list;
-
-		list = list->next;
-
-		func = entry->func;
-		entry->next = NULL;
-		/*
-		 * Ensure we observe the unqueue before we issue the wakeup,
-		 * so that we won't be waiting forever.
-		 * -- see perf_not_pending().
-		 */
-		smp_wmb();
-
-		func(entry);
-		nr++;
-	}
-
-	return nr;
-}
-
-static inline int perf_not_pending(struct perf_counter *counter)
-{
-	/*
-	 * If we flush on whatever cpu we run, there is a chance we don't
-	 * need to wait.
-	 */
-	get_cpu();
-	__perf_pending_run();
-	put_cpu();
-
-	/*
-	 * Ensure we see the proper queue state before going to sleep
-	 * so that we do not miss the wakeup. -- see perf_pending_handle()
-	 */
-	smp_rmb();
-	return counter->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_counter *counter)
-{
-	wait_event(counter->waitq, perf_not_pending(counter));
-}
-
-void perf_counter_do_pending(void)
-{
-	__perf_pending_run();
-}
-
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-	return NULL;
-}
-
-/*
- * Output
- */
-static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
-			      unsigned long offset, unsigned long head)
-{
-	unsigned long mask;
-
-	if (!data->writable)
-		return true;
-
-	mask = (data->nr_pages << PAGE_SHIFT) - 1;
-
-	offset = (offset - tail) & mask;
-	head   = (head   - tail) & mask;
-
-	if ((int)(head - offset) < 0)
-		return false;
-
-	return true;
-}
-
-static void perf_output_wakeup(struct perf_output_handle *handle)
-{
-	atomic_set(&handle->data->poll, POLL_IN);
-
-	if (handle->nmi) {
-		handle->counter->pending_wakeup = 1;
-		perf_pending_queue(&handle->counter->pending,
-				   perf_pending_counter);
-	} else
-		perf_counter_wakeup(handle->counter);
-}
-
-/*
- * Curious locking construct.
- *
- * We need to ensure a later event doesn't publish a head when a former
- * event isn't done writing. However since we need to deal with NMIs we
- * cannot fully serialize things.
- *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
- * We only publish the head (and generate a wakeup) when the outer-most
- * event completes.
- */
-static void perf_output_lock(struct perf_output_handle *handle)
-{
-	struct perf_mmap_data *data = handle->data;
-	int cpu;
-
-	handle->locked = 0;
-
-	local_irq_save(handle->flags);
-	cpu = smp_processor_id();
-
-	if (in_nmi() && atomic_read(&data->lock) == cpu)
-		return;
-
-	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-		cpu_relax();
-
-	handle->locked = 1;
-}
-
-static void perf_output_unlock(struct perf_output_handle *handle)
-{
-	struct perf_mmap_data *data = handle->data;
-	unsigned long head;
-	int cpu;
-
-	data->done_head = data->head;
-
-	if (!handle->locked)
-		goto out;
-
-again:
-	/*
-	 * The xchg implies a full barrier that ensures all writes are done
-	 * before we publish the new head, matched by a rmb() in userspace when
-	 * reading this position.
-	 */
-	while ((head = atomic_long_xchg(&data->done_head, 0)))
-		data->user_page->data_head = head;
-
-	/*
-	 * NMI can happen here, which means we can miss a done_head update.
-	 */
-
-	cpu = atomic_xchg(&data->lock, -1);
-	WARN_ON_ONCE(cpu != smp_processor_id());
-
-	/*
-	 * Therefore we have to validate we did not indeed do so.
-	 */
-	if (unlikely(atomic_long_read(&data->done_head))) {
-		/*
-		 * Since we had it locked, we can lock it again.
-		 */
-		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-			cpu_relax();
-
-		goto again;
-	}
-
-	if (atomic_xchg(&data->wakeup, 0))
-		perf_output_wakeup(handle);
-out:
-	local_irq_restore(handle->flags);
-}
-
-void perf_output_copy(struct perf_output_handle *handle,
-		      const void *buf, unsigned int len)
-{
-	unsigned int pages_mask;
-	unsigned int offset;
-	unsigned int size;
-	void **pages;
-
-	offset		= handle->offset;
-	pages_mask	= handle->data->nr_pages - 1;
-	pages		= handle->data->data_pages;
-
-	do {
-		unsigned int page_offset;
-		int nr;
-
-		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
-		page_offset = offset & (PAGE_SIZE - 1);
-		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
-
-		memcpy(pages[nr] + page_offset, buf, size);
-
-		len	    -= size;
-		buf	    += size;
-		offset	    += size;
-	} while (len);
-
-	handle->offset = offset;
-
-	/*
-	 * Check we didn't copy past our reservation window, taking the
-	 * possible unsigned int wrap into account.
-	 */
-	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
-}
-
-int perf_output_begin(struct perf_output_handle *handle,
-		      struct perf_counter *counter, unsigned int size,
-		      int nmi, int sample)
-{
-	struct perf_counter *output_counter;
-	struct perf_mmap_data *data;
-	unsigned long tail, offset, head;
-	int have_lost;
-	struct {
-		struct perf_event_header header;
-		u64			 id;
-		u64			 lost;
-	} lost_event;
-
-	rcu_read_lock();
-	/*
-	 * For inherited counters we send all the output towards the parent.
-	 */
-	if (counter->parent)
-		counter = counter->parent;
-
-	output_counter = rcu_dereference(counter->output);
-	if (output_counter)
-		counter = output_counter;
-
-	data = rcu_dereference(counter->data);
-	if (!data)
-		goto out;
-
-	handle->data	= data;
-	handle->counter	= counter;
-	handle->nmi	= nmi;
-	handle->sample	= sample;
-
-	if (!data->nr_pages)
-		goto fail;
-
-	have_lost = atomic_read(&data->lost);
-	if (have_lost)
-		size += sizeof(lost_event);
-
-	perf_output_lock(handle);
-
-	do {
-		/*
-		 * Userspace could choose to issue a mb() before updating the
-		 * tail pointer. So that all reads will be completed before the
-		 * write is issued.
-		 */
-		tail = ACCESS_ONCE(data->user_page->data_tail);
-		smp_rmb();
-		offset = head = atomic_long_read(&data->head);
-		head += size;
-		if (unlikely(!perf_output_space(data, tail, offset, head)))
-			goto fail;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
-
-	handle->offset	= offset;
-	handle->head	= head;
-
-	if (head - tail > data->watermark)
-		atomic_set(&data->wakeup, 1);
-
-	if (have_lost) {
-		lost_event.header.type = PERF_EVENT_LOST;
-		lost_event.header.misc = 0;
-		lost_event.header.size = sizeof(lost_event);
-		lost_event.id          = counter->id;
-		lost_event.lost        = atomic_xchg(&data->lost, 0);
-
-		perf_output_put(handle, lost_event);
-	}
-
-	return 0;
-
-fail:
-	atomic_inc(&data->lost);
-	perf_output_unlock(handle);
-out:
-	rcu_read_unlock();
-
-	return -ENOSPC;
-}
-
-void perf_output_end(struct perf_output_handle *handle)
-{
-	struct perf_counter *counter = handle->counter;
-	struct perf_mmap_data *data = handle->data;
-
-	int wakeup_events = counter->attr.wakeup_events;
-
-	if (handle->sample && wakeup_events) {
-		int events = atomic_inc_return(&data->events);
-		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &data->events);
-			atomic_set(&data->wakeup, 1);
-		}
-	}
-
-	perf_output_unlock(handle);
-	rcu_read_unlock();
-}
-
-static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
-{
-	/*
-	 * only top level counters have the pid namespace they were created in
-	 */
-	if (counter->parent)
-		counter = counter->parent;
-
-	return task_tgid_nr_ns(p, counter->ns);
-}
-
-static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
-{
-	/*
-	 * only top level counters have the pid namespace they were created in
-	 */
-	if (counter->parent)
-		counter = counter->parent;
-
-	return task_pid_nr_ns(p, counter->ns);
-}
-
-static void perf_output_read_one(struct perf_output_handle *handle,
-				 struct perf_counter *counter)
-{
-	u64 read_format = counter->attr.read_format;
-	u64 values[4];
-	int n = 0;
-
-	values[n++] = atomic64_read(&counter->count);
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-	}
-	if (read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
-
-	perf_output_copy(handle, values, n * sizeof(u64));
-}
-
-/*
- * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
- */
-static void perf_output_read_group(struct perf_output_handle *handle,
-			    struct perf_counter *counter)
-{
-	struct perf_counter *leader = counter->group_leader, *sub;
-	u64 read_format = counter->attr.read_format;
-	u64 values[5];
-	int n = 0;
-
-	values[n++] = 1 + leader->nr_siblings;
-
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = leader->total_time_enabled;
-
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = leader->total_time_running;
-
-	if (leader != counter)
-		leader->pmu->read(leader);
-
-	values[n++] = atomic64_read(&leader->count);
-	if (read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(leader);
-
-	perf_output_copy(handle, values, n * sizeof(u64));
-
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		n = 0;
-
-		if (sub != counter)
-			sub->pmu->read(sub);
-
-		values[n++] = atomic64_read(&sub->count);
-		if (read_format & PERF_FORMAT_ID)
-			values[n++] = primary_counter_id(sub);
-
-		perf_output_copy(handle, values, n * sizeof(u64));
-	}
-}
-
-static void perf_output_read(struct perf_output_handle *handle,
-			     struct perf_counter *counter)
-{
-	if (counter->attr.read_format & PERF_FORMAT_GROUP)
-		perf_output_read_group(handle, counter);
-	else
-		perf_output_read_one(handle, counter);
-}
-
-void perf_output_sample(struct perf_output_handle *handle,
-			struct perf_event_header *header,
-			struct perf_sample_data *data,
-			struct perf_counter *counter)
-{
-	u64 sample_type = data->type;
-
-	perf_output_put(handle, *header);
-
-	if (sample_type & PERF_SAMPLE_IP)
-		perf_output_put(handle, data->ip);
-
-	if (sample_type & PERF_SAMPLE_TID)
-		perf_output_put(handle, data->tid_entry);
-
-	if (sample_type & PERF_SAMPLE_TIME)
-		perf_output_put(handle, data->time);
-
-	if (sample_type & PERF_SAMPLE_ADDR)
-		perf_output_put(handle, data->addr);
-
-	if (sample_type & PERF_SAMPLE_ID)
-		perf_output_put(handle, data->id);
-
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		perf_output_put(handle, data->stream_id);
-
-	if (sample_type & PERF_SAMPLE_CPU)
-		perf_output_put(handle, data->cpu_entry);
-
-	if (sample_type & PERF_SAMPLE_PERIOD)
-		perf_output_put(handle, data->period);
-
-	if (sample_type & PERF_SAMPLE_READ)
-		perf_output_read(handle, counter);
-
-	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		if (data->callchain) {
-			int size = 1;
-
-			if (data->callchain)
-				size += data->callchain->nr;
-
-			size *= sizeof(u64);
-
-			perf_output_copy(handle, data->callchain, size);
-		} else {
-			u64 nr = 0;
-			perf_output_put(handle, nr);
-		}
-	}
-
-	if (sample_type & PERF_SAMPLE_RAW) {
-		if (data->raw) {
-			perf_output_put(handle, data->raw->size);
-			perf_output_copy(handle, data->raw->data,
-					 data->raw->size);
-		} else {
-			struct {
-				u32	size;
-				u32	data;
-			} raw = {
-				.size = sizeof(u32),
-				.data = 0,
-			};
-			perf_output_put(handle, raw);
-		}
-	}
-}
-
-void perf_prepare_sample(struct perf_event_header *header,
-			 struct perf_sample_data *data,
-			 struct perf_counter *counter,
-			 struct pt_regs *regs)
-{
-	u64 sample_type = counter->attr.sample_type;
-
-	data->type = sample_type;
-
-	header->type = PERF_EVENT_SAMPLE;
-	header->size = sizeof(*header);
-
-	header->misc = 0;
-	header->misc |= perf_misc_flags(regs);
-
-	if (sample_type & PERF_SAMPLE_IP) {
-		data->ip = perf_instruction_pointer(regs);
-
-		header->size += sizeof(data->ip);
-	}
-
-	if (sample_type & PERF_SAMPLE_TID) {
-		/* namespace issues */
-		data->tid_entry.pid = perf_counter_pid(counter, current);
-		data->tid_entry.tid = perf_counter_tid(counter, current);
-
-		header->size += sizeof(data->tid_entry);
-	}
-
-	if (sample_type & PERF_SAMPLE_TIME) {
-		data->time = perf_clock();
-
-		header->size += sizeof(data->time);
-	}
-
-	if (sample_type & PERF_SAMPLE_ADDR)
-		header->size += sizeof(data->addr);
-
-	if (sample_type & PERF_SAMPLE_ID) {
-		data->id = primary_counter_id(counter);
-
-		header->size += sizeof(data->id);
-	}
-
-	if (sample_type & PERF_SAMPLE_STREAM_ID) {
-		data->stream_id = counter->id;
-
-		header->size += sizeof(data->stream_id);
-	}
-
-	if (sample_type & PERF_SAMPLE_CPU) {
-		data->cpu_entry.cpu		= raw_smp_processor_id();
-		data->cpu_entry.reserved	= 0;
-
-		header->size += sizeof(data->cpu_entry);
-	}
-
-	if (sample_type & PERF_SAMPLE_PERIOD)
-		header->size += sizeof(data->period);
-
-	if (sample_type & PERF_SAMPLE_READ)
-		header->size += perf_counter_read_size(counter);
-
-	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		int size = 1;
-
-		data->callchain = perf_callchain(regs);
-
-		if (data->callchain)
-			size += data->callchain->nr;
-
-		header->size += size * sizeof(u64);
-	}
-
-	if (sample_type & PERF_SAMPLE_RAW) {
-		int size = sizeof(u32);
-
-		if (data->raw)
-			size += data->raw->size;
-		else
-			size += sizeof(u32);
-
-		WARN_ON_ONCE(size & (sizeof(u64)-1));
-		header->size += size;
-	}
-}
-
-static void perf_counter_output(struct perf_counter *counter, int nmi,
-				struct perf_sample_data *data,
-				struct pt_regs *regs)
-{
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-
-	perf_prepare_sample(&header, data, counter, regs);
-
-	if (perf_output_begin(&handle, counter, header.size, nmi, 1))
-		return;
-
-	perf_output_sample(&handle, &header, data, counter);
-
-	perf_output_end(&handle);
-}
-
-/*
- * read event
- */
-
-struct perf_read_event {
-	struct perf_event_header	header;
-
-	u32				pid;
-	u32				tid;
-};
-
-static void
-perf_counter_read_event(struct perf_counter *counter,
-			struct task_struct *task)
-{
-	struct perf_output_handle handle;
-	struct perf_read_event read_event = {
-		.header = {
-			.type = PERF_EVENT_READ,
-			.misc = 0,
-			.size = sizeof(read_event) + perf_counter_read_size(counter),
-		},
-		.pid = perf_counter_pid(counter, task),
-		.tid = perf_counter_tid(counter, task),
-	};
-	int ret;
-
-	ret = perf_output_begin(&handle, counter, read_event.header.size, 0, 0);
-	if (ret)
-		return;
-
-	perf_output_put(&handle, read_event);
-	perf_output_read(&handle, counter);
-
-	perf_output_end(&handle);
-}
-
-/*
- * task tracking -- fork/exit
- *
- * enabled by: attr.comm | attr.mmap | attr.task
- */
-
-struct perf_task_event {
-	struct task_struct		*task;
-	struct perf_counter_context	*task_ctx;
-
-	struct {
-		struct perf_event_header	header;
-
-		u32				pid;
-		u32				ppid;
-		u32				tid;
-		u32				ptid;
-		u64				time;
-	} event;
-};
-
-static void perf_counter_task_output(struct perf_counter *counter,
-				     struct perf_task_event *task_event)
-{
-	struct perf_output_handle handle;
-	int size;
-	struct task_struct *task = task_event->task;
-	int ret;
-
-	size  = task_event->event.header.size;
-	ret = perf_output_begin(&handle, counter, size, 0, 0);
-
-	if (ret)
-		return;
-
-	task_event->event.pid = perf_counter_pid(counter, task);
-	task_event->event.ppid = perf_counter_pid(counter, current);
-
-	task_event->event.tid = perf_counter_tid(counter, task);
-	task_event->event.ptid = perf_counter_tid(counter, current);
-
-	task_event->event.time = perf_clock();
-
-	perf_output_put(&handle, task_event->event);
-
-	perf_output_end(&handle);
-}
-
-static int perf_counter_task_match(struct perf_counter *counter)
-{
-	if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
-		return 1;
-
-	return 0;
-}
-
-static void perf_counter_task_ctx(struct perf_counter_context *ctx,
-				  struct perf_task_event *task_event)
-{
-	struct perf_counter *counter;
-
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_counter_task_match(counter))
-			perf_counter_task_output(counter, task_event);
-	}
-	rcu_read_unlock();
-}
-
-static void perf_counter_task_event(struct perf_task_event *task_event)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx = task_event->task_ctx;
-
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_counter_task_ctx(&cpuctx->ctx, task_event);
-	put_cpu_var(perf_cpu_context);
-
-	rcu_read_lock();
-	if (!ctx)
-		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
-	if (ctx)
-		perf_counter_task_ctx(ctx, task_event);
-	rcu_read_unlock();
-}
-
-static void perf_counter_task(struct task_struct *task,
-			      struct perf_counter_context *task_ctx,
-			      int new)
-{
-	struct perf_task_event task_event;
-
-	if (!atomic_read(&nr_comm_counters) &&
-	    !atomic_read(&nr_mmap_counters) &&
-	    !atomic_read(&nr_task_counters))
-		return;
-
-	task_event = (struct perf_task_event){
-		.task	  = task,
-		.task_ctx = task_ctx,
-		.event    = {
-			.header = {
-				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
-				.misc = 0,
-				.size = sizeof(task_event.event),
-			},
-			/* .pid  */
-			/* .ppid */
-			/* .tid  */
-			/* .ptid */
-		},
-	};
-
-	perf_counter_task_event(&task_event);
-}
-
-void perf_counter_fork(struct task_struct *task)
-{
-	perf_counter_task(task, NULL, 1);
-}
-
-/*
- * comm tracking
- */
-
-struct perf_comm_event {
-	struct task_struct	*task;
-	char			*comm;
-	int			comm_size;
-
-	struct {
-		struct perf_event_header	header;
-
-		u32				pid;
-		u32				tid;
-	} event;
-};
-
-static void perf_counter_comm_output(struct perf_counter *counter,
-				     struct perf_comm_event *comm_event)
-{
-	struct perf_output_handle handle;
-	int size = comm_event->event.header.size;
-	int ret = perf_output_begin(&handle, counter, size, 0, 0);
-
-	if (ret)
-		return;
-
-	comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
-	comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
-
-	perf_output_put(&handle, comm_event->event);
-	perf_output_copy(&handle, comm_event->comm,
-				   comm_event->comm_size);
-	perf_output_end(&handle);
-}
-
-static int perf_counter_comm_match(struct perf_counter *counter)
-{
-	if (counter->attr.comm)
-		return 1;
-
-	return 0;
-}
-
-static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
-				  struct perf_comm_event *comm_event)
-{
-	struct perf_counter *counter;
-
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_counter_comm_match(counter))
-			perf_counter_comm_output(counter, comm_event);
-	}
-	rcu_read_unlock();
-}
-
-static void perf_counter_comm_event(struct perf_comm_event *comm_event)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
-	unsigned int size;
-	char comm[TASK_COMM_LEN];
-
-	memset(comm, 0, sizeof(comm));
-	strncpy(comm, comm_event->task->comm, sizeof(comm));
-	size = ALIGN(strlen(comm)+1, sizeof(u64));
-
-	comm_event->comm = comm;
-	comm_event->comm_size = size;
-
-	comm_event->event.header.size = sizeof(comm_event->event) + size;
-
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
-	put_cpu_var(perf_cpu_context);
-
-	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
-	if (ctx)
-		perf_counter_comm_ctx(ctx, comm_event);
-	rcu_read_unlock();
-}
-
-void perf_counter_comm(struct task_struct *task)
-{
-	struct perf_comm_event comm_event;
-
-	if (task->perf_counter_ctxp)
-		perf_counter_enable_on_exec(task);
-
-	if (!atomic_read(&nr_comm_counters))
-		return;
-
-	comm_event = (struct perf_comm_event){
-		.task	= task,
-		/* .comm      */
-		/* .comm_size */
-		.event  = {
-			.header = {
-				.type = PERF_EVENT_COMM,
-				.misc = 0,
-				/* .size */
-			},
-			/* .pid */
-			/* .tid */
-		},
-	};
-
-	perf_counter_comm_event(&comm_event);
-}
-
-/*
- * mmap tracking
- */
-
-struct perf_mmap_event {
-	struct vm_area_struct	*vma;
-
-	const char		*file_name;
-	int			file_size;
-
-	struct {
-		struct perf_event_header	header;
-
-		u32				pid;
-		u32				tid;
-		u64				start;
-		u64				len;
-		u64				pgoff;
-	} event;
-};
-
-static void perf_counter_mmap_output(struct perf_counter *counter,
-				     struct perf_mmap_event *mmap_event)
-{
-	struct perf_output_handle handle;
-	int size = mmap_event->event.header.size;
-	int ret = perf_output_begin(&handle, counter, size, 0, 0);
-
-	if (ret)
-		return;
-
-	mmap_event->event.pid = perf_counter_pid(counter, current);
-	mmap_event->event.tid = perf_counter_tid(counter, current);
-
-	perf_output_put(&handle, mmap_event->event);
-	perf_output_copy(&handle, mmap_event->file_name,
-				   mmap_event->file_size);
-	perf_output_end(&handle);
-}
-
-static int perf_counter_mmap_match(struct perf_counter *counter,
-				   struct perf_mmap_event *mmap_event)
-{
-	if (counter->attr.mmap)
-		return 1;
-
-	return 0;
-}
-
-static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
-				  struct perf_mmap_event *mmap_event)
-{
-	struct perf_counter *counter;
-
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_counter_mmap_match(counter, mmap_event))
-			perf_counter_mmap_output(counter, mmap_event);
-	}
-	rcu_read_unlock();
-}
-
-static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
-	struct vm_area_struct *vma = mmap_event->vma;
-	struct file *file = vma->vm_file;
-	unsigned int size;
-	char tmp[16];
-	char *buf = NULL;
-	const char *name;
-
-	memset(tmp, 0, sizeof(tmp));
-
-	if (file) {
-		/*
-		 * d_path works from the end of the buffer backwards, so we
-		 * need to add enough zero bytes after the string to handle
-		 * the 64bit alignment we do later.
-		 */
-		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-		if (!buf) {
-			name = strncpy(tmp, "//enomem", sizeof(tmp));
-			goto got_name;
-		}
-		name = d_path(&file->f_path, buf, PATH_MAX);
-		if (IS_ERR(name)) {
-			name = strncpy(tmp, "//toolong", sizeof(tmp));
-			goto got_name;
-		}
-	} else {
-		if (arch_vma_name(mmap_event->vma)) {
-			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp));
-			goto got_name;
-		}
-
-		if (!vma->vm_mm) {
-			name = strncpy(tmp, "[vdso]", sizeof(tmp));
-			goto got_name;
-		}
-
-		name = strncpy(tmp, "//anon", sizeof(tmp));
-		goto got_name;
-	}
-
-got_name:
-	size = ALIGN(strlen(name)+1, sizeof(u64));
-
-	mmap_event->file_name = name;
-	mmap_event->file_size = size;
-
-	mmap_event->event.header.size = sizeof(mmap_event->event) + size;
-
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
-	put_cpu_var(perf_cpu_context);
-
-	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
-	if (ctx)
-		perf_counter_mmap_ctx(ctx, mmap_event);
-	rcu_read_unlock();
-
-	kfree(buf);
-}
-
-void __perf_counter_mmap(struct vm_area_struct *vma)
-{
-	struct perf_mmap_event mmap_event;
-
-	if (!atomic_read(&nr_mmap_counters))
-		return;
-
-	mmap_event = (struct perf_mmap_event){
-		.vma	= vma,
-		/* .file_name */
-		/* .file_size */
-		.event  = {
-			.header = {
-				.type = PERF_EVENT_MMAP,
-				.misc = 0,
-				/* .size */
-			},
-			/* .pid */
-			/* .tid */
-			.start  = vma->vm_start,
-			.len    = vma->vm_end - vma->vm_start,
-			.pgoff  = vma->vm_pgoff,
-		},
-	};
-
-	perf_counter_mmap_event(&mmap_event);
-}
-
-/*
- * IRQ throttle logging
- */
-
-static void perf_log_throttle(struct perf_counter *counter, int enable)
-{
-	struct perf_output_handle handle;
-	int ret;
-
-	struct {
-		struct perf_event_header	header;
-		u64				time;
-		u64				id;
-		u64				stream_id;
-	} throttle_event = {
-		.header = {
-			.type = PERF_EVENT_THROTTLE,
-			.misc = 0,
-			.size = sizeof(throttle_event),
-		},
-		.time		= perf_clock(),
-		.id		= primary_counter_id(counter),
-		.stream_id	= counter->id,
-	};
-
-	if (enable)
-		throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
-
-	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
-	if (ret)
-		return;
-
-	perf_output_put(&handle, throttle_event);
-	perf_output_end(&handle);
-}
-
-/*
- * Generic counter overflow handling, sampling.
- */
-
-static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
-				   int throttle, struct perf_sample_data *data,
-				   struct pt_regs *regs)
-{
-	int events = atomic_read(&counter->event_limit);
-	struct hw_perf_counter *hwc = &counter->hw;
-	int ret = 0;
-
-	throttle = (throttle && counter->pmu->unthrottle != NULL);
-
-	if (!throttle) {
-		hwc->interrupts++;
-	} else {
-		if (hwc->interrupts != MAX_INTERRUPTS) {
-			hwc->interrupts++;
-			if (HZ * hwc->interrupts >
-					(u64)sysctl_perf_counter_sample_rate) {
-				hwc->interrupts = MAX_INTERRUPTS;
-				perf_log_throttle(counter, 0);
-				ret = 1;
-			}
-		} else {
-			/*
-			 * Keep re-disabling counters even though on the previous
-			 * pass we disabled it - just in case we raced with a
-			 * sched-in and the counter got enabled again:
-			 */
-			ret = 1;
-		}
-	}
-
-	if (counter->attr.freq) {
-		u64 now = perf_clock();
-		s64 delta = now - hwc->freq_stamp;
-
-		hwc->freq_stamp = now;
-
-		if (delta > 0 && delta < TICK_NSEC)
-			perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
-	}
-
-	/*
-	 * XXX event_limit might not quite work as expected on inherited
-	 * counters
-	 */
-
-	counter->pending_kill = POLL_IN;
-	if (events && atomic_dec_and_test(&counter->event_limit)) {
-		ret = 1;
-		counter->pending_kill = POLL_HUP;
-		if (nmi) {
-			counter->pending_disable = 1;
-			perf_pending_queue(&counter->pending,
-					   perf_pending_counter);
-		} else
-			perf_counter_disable(counter);
-	}
-
-	perf_counter_output(counter, nmi, data, regs);
-	return ret;
-}
-
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
-			  struct perf_sample_data *data,
-			  struct pt_regs *regs)
-{
-	return __perf_counter_overflow(counter, nmi, 1, data, regs);
-}
-
-/*
- * Generic software counter infrastructure
- */
-
-/*
- * We directly increment counter->count and keep a second value in
- * counter->hw.period_left to count intervals. This period counter
- * is kept in the range [-sample_period, 0] so that we can use the
- * sign as trigger.
- */
-
-static u64 perf_swcounter_set_period(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	u64 period = hwc->last_period;
-	u64 nr, offset;
-	s64 old, val;
-
-	hwc->last_period = hwc->sample_period;
-
-again:
-	old = val = atomic64_read(&hwc->period_left);
-	if (val < 0)
-		return 0;
-
-	nr = div64_u64(period + val, period);
-	offset = nr * period;
-	val -= offset;
-	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
-		goto again;
-
-	return nr;
-}
-
-static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data,
-				    struct pt_regs *regs)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	int throttle = 0;
-	u64 overflow;
-
-	data->period = counter->hw.last_period;
-	overflow = perf_swcounter_set_period(counter);
-
-	if (hwc->interrupts == MAX_INTERRUPTS)
-		return;
-
-	for (; overflow; overflow--) {
-		if (__perf_counter_overflow(counter, nmi, throttle,
-					    data, regs)) {
-			/*
-			 * We inhibit the overflow from happening when
-			 * hwc->interrupts == MAX_INTERRUPTS.
-			 */
-			break;
-		}
-		throttle = 1;
-	}
-}
-
-static void perf_swcounter_unthrottle(struct perf_counter *counter)
-{
-	/*
-	 * Nothing to do, we already reset hwc->interrupts.
-	 */
-}
-
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data,
-			       struct pt_regs *regs)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-
-	atomic64_add(nr, &counter->count);
-
-	if (!hwc->sample_period)
-		return;
-
-	if (!regs)
-		return;
-
-	if (!atomic64_add_negative(nr, &hwc->period_left))
-		perf_swcounter_overflow(counter, nmi, data, regs);
-}
-
-static int perf_swcounter_is_counting(struct perf_counter *counter)
-{
-	/*
-	 * The counter is active, we're good!
-	 */
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		return 1;
-
-	/*
-	 * The counter is off/error, not counting.
-	 */
-	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
-		return 0;
-
-	/*
-	 * The counter is inactive, if the context is active
-	 * we're part of a group that didn't make it on the 'pmu',
-	 * not counting.
-	 */
-	if (counter->ctx->is_active)
-		return 0;
-
-	/*
-	 * We're inactive and the context is too, this means the
-	 * task is scheduled out, we're counting events that happen
-	 * to us, like migration events.
-	 */
-	return 1;
-}
-
-static int perf_swcounter_match(struct perf_counter *counter,
-				enum perf_type_id type,
-				u32 event_id, struct pt_regs *regs)
-{
-	if (!perf_swcounter_is_counting(counter))
-		return 0;
-
-	if (counter->attr.type != type)
-		return 0;
-	if (counter->attr.config != event_id)
-		return 0;
-
-	if (regs) {
-		if (counter->attr.exclude_user && user_mode(regs))
-			return 0;
-
-		if (counter->attr.exclude_kernel && !user_mode(regs))
-			return 0;
-	}
-
-	return 1;
-}
-
-static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-				     enum perf_type_id type,
-				     u32 event_id, u64 nr, int nmi,
-				     struct perf_sample_data *data,
-				     struct pt_regs *regs)
-{
-	struct perf_counter *counter;
-
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, type, event_id, regs))
-			perf_swcounter_add(counter, nr, nmi, data, regs);
-	}
-	rcu_read_unlock();
-}
-
-static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
-{
-	if (in_nmi())
-		return &cpuctx->recursion[3];
-
-	if (in_irq())
-		return &cpuctx->recursion[2];
-
-	if (in_softirq())
-		return &cpuctx->recursion[1];
-
-	return &cpuctx->recursion[0];
-}
-
-static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
-				    u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
-{
-	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
-	int *recursion = perf_swcounter_recursion_context(cpuctx);
-	struct perf_counter_context *ctx;
-
-	if (*recursion)
-		goto out;
-
-	(*recursion)++;
-	barrier();
-
-	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-				 nr, nmi, data, regs);
-	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
-	if (ctx)
-		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
-	rcu_read_unlock();
-
-	barrier();
-	(*recursion)--;
-
-out:
-	put_cpu_var(perf_cpu_context);
-}
-
-void __perf_swcounter_event(u32 event, u64 nr, int nmi,
-			    struct pt_regs *regs, u64 addr)
-{
-	struct perf_sample_data data = {
-		.addr = addr,
-	};
-
-	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
-				&data, regs);
-}
-
-static void perf_swcounter_read(struct perf_counter *counter)
-{
-}
-
-static int perf_swcounter_enable(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-
-	if (hwc->sample_period) {
-		hwc->last_period = hwc->sample_period;
-		perf_swcounter_set_period(counter);
-	}
-	return 0;
-}
-
-static void perf_swcounter_disable(struct perf_counter *counter)
-{
-}
-
-static const struct pmu perf_ops_generic = {
-	.enable		= perf_swcounter_enable,
-	.disable	= perf_swcounter_disable,
-	.read		= perf_swcounter_read,
-	.unthrottle	= perf_swcounter_unthrottle,
-};
-
-/*
- * hrtimer based swcounter callback
- */
-
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
-{
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct pt_regs *regs;
-	struct perf_counter *counter;
-	u64 period;
-
-	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->pmu->read(counter);
-
-	data.addr = 0;
-	regs = get_irq_regs();
-	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
-	 */
-	if ((counter->attr.exclude_kernel || !regs) &&
-			!counter->attr.exclude_user)
-		regs = task_pt_regs(current);
-
-	if (regs) {
-		if (perf_counter_overflow(counter, 0, &data, regs))
-			ret = HRTIMER_NORESTART;
-	}
-
-	period = max_t(u64, 10000, counter->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-	return ret;
-}
-
-/*
- * Software counter: cpu wall time clock
- */
-
-static void cpu_clock_perf_counter_update(struct perf_counter *counter)
-{
-	int cpu = raw_smp_processor_id();
-	s64 prev;
-	u64 now;
-
-	now = cpu_clock(cpu);
-	prev = atomic64_read(&counter->hw.prev_count);
-	atomic64_set(&counter->hw.prev_count, now);
-	atomic64_add(now - prev, &counter->count);
-}
-
-static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	int cpu = raw_smp_processor_id();
-
-	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swcounter_hrtimer;
-	if (hwc->sample_period) {
-		u64 period = max_t(u64, 10000, hwc->sample_period);
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
-
-	return 0;
-}
-
-static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
-{
-	if (counter->hw.sample_period)
-		hrtimer_cancel(&counter->hw.hrtimer);
-	cpu_clock_perf_counter_update(counter);
-}
-
-static void cpu_clock_perf_counter_read(struct perf_counter *counter)
-{
-	cpu_clock_perf_counter_update(counter);
-}
-
-static const struct pmu perf_ops_cpu_clock = {
-	.enable		= cpu_clock_perf_counter_enable,
-	.disable	= cpu_clock_perf_counter_disable,
-	.read		= cpu_clock_perf_counter_read,
-};
-
-/*
- * Software counter: task time clock
- */
-
-static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
-{
-	u64 prev;
-	s64 delta;
-
-	prev = atomic64_xchg(&counter->hw.prev_count, now);
-	delta = now - prev;
-	atomic64_add(delta, &counter->count);
-}
-
-static int task_clock_perf_counter_enable(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	u64 now;
-
-	now = counter->ctx->time;
-
-	atomic64_set(&hwc->prev_count, now);
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swcounter_hrtimer;
-	if (hwc->sample_period) {
-		u64 period = max_t(u64, 10000, hwc->sample_period);
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
-
-	return 0;
-}
-
-static void task_clock_perf_counter_disable(struct perf_counter *counter)
-{
-	if (counter->hw.sample_period)
-		hrtimer_cancel(&counter->hw.hrtimer);
-	task_clock_perf_counter_update(counter, counter->ctx->time);
-
-}
-
-static void task_clock_perf_counter_read(struct perf_counter *counter)
-{
-	u64 time;
-
-	if (!in_nmi()) {
-		update_context_time(counter->ctx);
-		time = counter->ctx->time;
-	} else {
-		u64 now = perf_clock();
-		u64 delta = now - counter->ctx->timestamp;
-		time = counter->ctx->time + delta;
-	}
-
-	task_clock_perf_counter_update(counter, time);
-}
-
-static const struct pmu perf_ops_task_clock = {
-	.enable		= task_clock_perf_counter_enable,
-	.disable	= task_clock_perf_counter_disable,
-	.read		= task_clock_perf_counter_read,
-};
-
-#ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
-			  int entry_size)
-{
-	struct perf_raw_record raw = {
-		.size = entry_size,
-		.data = record,
-	};
-
-	struct perf_sample_data data = {
-		.addr = addr,
-		.raw = &raw,
-	};
-
-	struct pt_regs *regs = get_irq_regs();
-
-	if (!regs)
-		regs = task_pt_regs(current);
-
-	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-				&data, regs);
-}
-EXPORT_SYMBOL_GPL(perf_tpcounter_event);
-
-extern int ftrace_profile_enable(int);
-extern void ftrace_profile_disable(int);
-
-static void tp_perf_counter_destroy(struct perf_counter *counter)
-{
-	ftrace_profile_disable(counter->attr.config);
-}
-
-static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
-{
-	/*
-	 * Raw tracepoint data is a severe data leak, only allow root to
-	 * have these.
-	 */
-	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
-			perf_paranoid_tracepoint_raw() &&
-			!capable(CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	if (ftrace_profile_enable(counter->attr.config))
-		return NULL;
-
-	counter->destroy = tp_perf_counter_destroy;
-
-	return &perf_ops_generic;
-}
-#else
-static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
-{
-	return NULL;
-}
-#endif
-
-atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
-
-static void sw_perf_counter_destroy(struct perf_counter *counter)
-{
-	u64 event_id = counter->attr.config;
-
-	WARN_ON(counter->parent);
-
-	atomic_dec(&perf_swcounter_enabled[event_id]);
-}
-
-static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
-{
-	const struct pmu *pmu = NULL;
-	u64 event_id = counter->attr.config;
-
-	/*
-	 * Software counters (currently) can't in general distinguish
-	 * between user, kernel and hypervisor events.
-	 * However, context switches and cpu migrations are considered
-	 * to be kernel events, and page faults are never hypervisor
-	 * events.
-	 */
-	switch (event_id) {
-	case PERF_COUNT_SW_CPU_CLOCK:
-		pmu = &perf_ops_cpu_clock;
-
-		break;
-	case PERF_COUNT_SW_TASK_CLOCK:
-		/*
-		 * If the user instantiates this as a per-cpu counter,
-		 * use the cpu_clock counter instead.
-		 */
-		if (counter->ctx->task)
-			pmu = &perf_ops_task_clock;
-		else
-			pmu = &perf_ops_cpu_clock;
-
-		break;
-	case PERF_COUNT_SW_PAGE_FAULTS:
-	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
-	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-	case PERF_COUNT_SW_CONTEXT_SWITCHES:
-	case PERF_COUNT_SW_CPU_MIGRATIONS:
-		if (!counter->parent) {
-			atomic_inc(&perf_swcounter_enabled[event_id]);
-			counter->destroy = sw_perf_counter_destroy;
-		}
-		pmu = &perf_ops_generic;
-		break;
-	}
-
-	return pmu;
-}
-
-/*
- * Allocate and initialize a counter structure
- */
-static struct perf_counter *
-perf_counter_alloc(struct perf_counter_attr *attr,
-		   int cpu,
-		   struct perf_counter_context *ctx,
-		   struct perf_counter *group_leader,
-		   struct perf_counter *parent_counter,
-		   gfp_t gfpflags)
-{
-	const struct pmu *pmu;
-	struct perf_counter *counter;
-	struct hw_perf_counter *hwc;
-	long err;
-
-	counter = kzalloc(sizeof(*counter), gfpflags);
-	if (!counter)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Single counters are their own group leaders, with an
-	 * empty sibling list:
-	 */
-	if (!group_leader)
-		group_leader = counter;
-
-	mutex_init(&counter->child_mutex);
-	INIT_LIST_HEAD(&counter->child_list);
-
-	INIT_LIST_HEAD(&counter->group_entry);
-	INIT_LIST_HEAD(&counter->event_entry);
-	INIT_LIST_HEAD(&counter->sibling_list);
-	init_waitqueue_head(&counter->waitq);
-
-	mutex_init(&counter->mmap_mutex);
-
-	counter->cpu		= cpu;
-	counter->attr		= *attr;
-	counter->group_leader	= group_leader;
-	counter->pmu		= NULL;
-	counter->ctx		= ctx;
-	counter->oncpu		= -1;
-
-	counter->parent		= parent_counter;
-
-	counter->ns		= get_pid_ns(current->nsproxy->pid_ns);
-	counter->id		= atomic64_inc_return(&perf_counter_id);
-
-	counter->state		= PERF_COUNTER_STATE_INACTIVE;
-
-	if (attr->disabled)
-		counter->state = PERF_COUNTER_STATE_OFF;
-
-	pmu = NULL;
-
-	hwc = &counter->hw;
-	hwc->sample_period = attr->sample_period;
-	if (attr->freq && attr->sample_freq)
-		hwc->sample_period = 1;
-	hwc->last_period = hwc->sample_period;
-
-	atomic64_set(&hwc->period_left, hwc->sample_period);
-
-	/*
-	 * we currently do not support PERF_FORMAT_GROUP on inherited counters
-	 */
-	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
-		goto done;
-
-	switch (attr->type) {
-	case PERF_TYPE_RAW:
-	case PERF_TYPE_HARDWARE:
-	case PERF_TYPE_HW_CACHE:
-		pmu = hw_perf_counter_init(counter);
-		break;
-
-	case PERF_TYPE_SOFTWARE:
-		pmu = sw_perf_counter_init(counter);
-		break;
-
-	case PERF_TYPE_TRACEPOINT:
-		pmu = tp_perf_counter_init(counter);
-		break;
-
-	default:
-		break;
-	}
-done:
-	err = 0;
-	if (!pmu)
-		err = -EINVAL;
-	else if (IS_ERR(pmu))
-		err = PTR_ERR(pmu);
-
-	if (err) {
-		if (counter->ns)
-			put_pid_ns(counter->ns);
-		kfree(counter);
-		return ERR_PTR(err);
-	}
-
-	counter->pmu = pmu;
-
-	if (!counter->parent) {
-		atomic_inc(&nr_counters);
-		if (counter->attr.mmap)
-			atomic_inc(&nr_mmap_counters);
-		if (counter->attr.comm)
-			atomic_inc(&nr_comm_counters);
-		if (counter->attr.task)
-			atomic_inc(&nr_task_counters);
-	}
-
-	return counter;
-}
-
-static int perf_copy_attr(struct perf_counter_attr __user *uattr,
-			  struct perf_counter_attr *attr)
-{
-	u32 size;
-	int ret;
-
-	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
-		return -EFAULT;
-
-	/*
-	 * zero the full structure, so that a short copy will be nice.
-	 */
-	memset(attr, 0, sizeof(*attr));
-
-	ret = get_user(size, &uattr->size);
-	if (ret)
-		return ret;
-
-	if (size > PAGE_SIZE)	/* silly large */
-		goto err_size;
-
-	if (!size)		/* abi compat */
-		size = PERF_ATTR_SIZE_VER0;
-
-	if (size < PERF_ATTR_SIZE_VER0)
-		goto err_size;
-
-	/*
-	 * If we're handed a bigger struct than we know of,
-	 * ensure all the unknown bits are 0 - i.e. new
-	 * user-space does not rely on any kernel feature
-	 * extensions we dont know about yet.
-	 */
-	if (size > sizeof(*attr)) {
-		unsigned char __user *addr;
-		unsigned char __user *end;
-		unsigned char val;
-
-		addr = (void __user *)uattr + sizeof(*attr);
-		end  = (void __user *)uattr + size;
-
-		for (; addr < end; addr++) {
-			ret = get_user(val, addr);
-			if (ret)
-				return ret;
-			if (val)
-				goto err_size;
-		}
-		size = sizeof(*attr);
-	}
-
-	ret = copy_from_user(attr, uattr, size);
-	if (ret)
-		return -EFAULT;
-
-	/*
-	 * If the type exists, the corresponding creation will verify
-	 * the attr->config.
-	 */
-	if (attr->type >= PERF_TYPE_MAX)
-		return -EINVAL;
-
-	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
-		return -EINVAL;
-
-	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
-		return -EINVAL;
-
-	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
-		return -EINVAL;
-
-out:
-	return ret;
-
-err_size:
-	put_user(sizeof(*attr), &uattr->size);
-	ret = -E2BIG;
-	goto out;
-}
-
-int perf_counter_set_output(struct perf_counter *counter, int output_fd)
-{
-	struct perf_counter *output_counter = NULL;
-	struct file *output_file = NULL;
-	struct perf_counter *old_output;
-	int fput_needed = 0;
-	int ret = -EINVAL;
-
-	if (!output_fd)
-		goto set;
-
-	output_file = fget_light(output_fd, &fput_needed);
-	if (!output_file)
-		return -EBADF;
-
-	if (output_file->f_op != &perf_fops)
-		goto out;
-
-	output_counter = output_file->private_data;
-
-	/* Don't chain output fds */
-	if (output_counter->output)
-		goto out;
-
-	/* Don't set an output fd when we already have an output channel */
-	if (counter->data)
-		goto out;
-
-	atomic_long_inc(&output_file->f_count);
-
-set:
-	mutex_lock(&counter->mmap_mutex);
-	old_output = counter->output;
-	rcu_assign_pointer(counter->output, output_counter);
-	mutex_unlock(&counter->mmap_mutex);
-
-	if (old_output) {
-		/*
-		 * we need to make sure no existing perf_output_*()
-		 * is still referencing this counter.
-		 */
-		synchronize_rcu();
-		fput(old_output->filp);
-	}
-
-	ret = 0;
-out:
-	fput_light(output_file, fput_needed);
-	return ret;
-}
-
-/**
- * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
- *
- * @attr_uptr:	event type attributes for monitoring/sampling
- * @pid:		target pid
- * @cpu:		target cpu
- * @group_fd:		group leader counter fd
- */
-SYSCALL_DEFINE5(perf_counter_open,
-		struct perf_counter_attr __user *, attr_uptr,
-		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
-{
-	struct perf_counter *counter, *group_leader;
-	struct perf_counter_attr attr;
-	struct perf_counter_context *ctx;
-	struct file *counter_file = NULL;
-	struct file *group_file = NULL;
-	int fput_needed = 0;
-	int fput_needed2 = 0;
-	int err;
-
-	/* for future expandability... */
-	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
-		return -EINVAL;
-
-	err = perf_copy_attr(attr_uptr, &attr);
-	if (err)
-		return err;
-
-	if (!attr.exclude_kernel) {
-		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-	}
-
-	if (attr.freq) {
-		if (attr.sample_freq > sysctl_perf_counter_sample_rate)
-			return -EINVAL;
-	}
-
-	/*
-	 * Get the target context (task or percpu):
-	 */
-	ctx = find_get_context(pid, cpu);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	/*
-	 * Look up the group leader (we will attach this counter to it):
-	 */
-	group_leader = NULL;
-	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
-		err = -EINVAL;
-		group_file = fget_light(group_fd, &fput_needed);
-		if (!group_file)
-			goto err_put_context;
-		if (group_file->f_op != &perf_fops)
-			goto err_put_context;
-
-		group_leader = group_file->private_data;
-		/*
-		 * Do not allow a recursive hierarchy (this new sibling
-		 * becoming part of another group-sibling):
-		 */
-		if (group_leader->group_leader != group_leader)
-			goto err_put_context;
-		/*
-		 * Do not allow to attach to a group in a different
-		 * task or CPU context:
-		 */
-		if (group_leader->ctx != ctx)
-			goto err_put_context;
-		/*
-		 * Only a group leader can be exclusive or pinned
-		 */
-		if (attr.exclusive || attr.pinned)
-			goto err_put_context;
-	}
-
-	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
-				     NULL, GFP_KERNEL);
-	err = PTR_ERR(counter);
-	if (IS_ERR(counter))
-		goto err_put_context;
-
-	err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-	if (err < 0)
-		goto err_free_put_context;
-
-	counter_file = fget_light(err, &fput_needed2);
-	if (!counter_file)
-		goto err_free_put_context;
-
-	if (flags & PERF_FLAG_FD_OUTPUT) {
-		err = perf_counter_set_output(counter, group_fd);
-		if (err)
-			goto err_fput_free_put_context;
-	}
-
-	counter->filp = counter_file;
-	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	perf_install_in_context(ctx, counter, cpu);
-	++ctx->generation;
-	mutex_unlock(&ctx->mutex);
-
-	counter->owner = current;
-	get_task_struct(current);
-	mutex_lock(&current->perf_counter_mutex);
-	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
-	mutex_unlock(&current->perf_counter_mutex);
-
-err_fput_free_put_context:
-	fput_light(counter_file, fput_needed2);
-
-err_free_put_context:
-	if (err < 0)
-		kfree(counter);
-
-err_put_context:
-	if (err < 0)
-		put_ctx(ctx);
-
-	fput_light(group_file, fput_needed);
-
-	return err;
-}
-
-/*
- * inherit a counter from parent task to child task:
- */
-static struct perf_counter *
-inherit_counter(struct perf_counter *parent_counter,
-	      struct task_struct *parent,
-	      struct perf_counter_context *parent_ctx,
-	      struct task_struct *child,
-	      struct perf_counter *group_leader,
-	      struct perf_counter_context *child_ctx)
-{
-	struct perf_counter *child_counter;
-
-	/*
-	 * Instead of creating recursive hierarchies of counters,
-	 * we link inherited counters back to the original parent,
-	 * which has a filp for sure, which we use as the reference
-	 * count:
-	 */
-	if (parent_counter->parent)
-		parent_counter = parent_counter->parent;
-
-	child_counter = perf_counter_alloc(&parent_counter->attr,
-					   parent_counter->cpu, child_ctx,
-					   group_leader, parent_counter,
-					   GFP_KERNEL);
-	if (IS_ERR(child_counter))
-		return child_counter;
-	get_ctx(child_ctx);
-
-	/*
-	 * Make the child state follow the state of the parent counter,
-	 * not its attr.disabled bit.  We hold the parent's mutex,
-	 * so we won't race with perf_counter_{en, dis}able_family.
-	 */
-	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-	else
-		child_counter->state = PERF_COUNTER_STATE_OFF;
-
-	if (parent_counter->attr.freq)
-		child_counter->hw.sample_period = parent_counter->hw.sample_period;
-
-	/*
-	 * Link it up in the child's context:
-	 */
-	add_counter_to_ctx(child_counter, child_ctx);
-
-	/*
-	 * Get a reference to the parent filp - we will fput it
-	 * when the child counter exits. This is safe to do because
-	 * we are in the parent and we know that the filp still
-	 * exists and has a nonzero count:
-	 */
-	atomic_long_inc(&parent_counter->filp->f_count);
-
-	/*
-	 * Link this into the parent counter's child list
-	 */
-	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
-	mutex_lock(&parent_counter->child_mutex);
-	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
-	mutex_unlock(&parent_counter->child_mutex);
-
-	return child_counter;
-}
-
-static int inherit_group(struct perf_counter *parent_counter,
-	      struct task_struct *parent,
-	      struct perf_counter_context *parent_ctx,
-	      struct task_struct *child,
-	      struct perf_counter_context *child_ctx)
-{
-	struct perf_counter *leader;
-	struct perf_counter *sub;
-	struct perf_counter *child_ctr;
-
-	leader = inherit_counter(parent_counter, parent, parent_ctx,
-				 child, NULL, child_ctx);
-	if (IS_ERR(leader))
-		return PTR_ERR(leader);
-	list_for_each_entry(sub, &parent_counter->sibling_list, group_entry) {
-		child_ctr = inherit_counter(sub, parent, parent_ctx,
-					    child, leader, child_ctx);
-		if (IS_ERR(child_ctr))
-			return PTR_ERR(child_ctr);
-	}
-	return 0;
-}
-
-static void sync_child_counter(struct perf_counter *child_counter,
-			       struct task_struct *child)
-{
-	struct perf_counter *parent_counter = child_counter->parent;
-	u64 child_val;
-
-	if (child_counter->attr.inherit_stat)
-		perf_counter_read_event(child_counter, child);
-
-	child_val = atomic64_read(&child_counter->count);
-
-	/*
-	 * Add back the child's count to the parent's count:
-	 */
-	atomic64_add(child_val, &parent_counter->count);
-	atomic64_add(child_counter->total_time_enabled,
-		     &parent_counter->child_total_time_enabled);
-	atomic64_add(child_counter->total_time_running,
-		     &parent_counter->child_total_time_running);
-
-	/*
-	 * Remove this counter from the parent's list
-	 */
-	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
-	mutex_lock(&parent_counter->child_mutex);
-	list_del_init(&child_counter->child_list);
-	mutex_unlock(&parent_counter->child_mutex);
-
-	/*
-	 * Release the parent counter, if this was the last
-	 * reference to it.
-	 */
-	fput(parent_counter->filp);
-}
-
-static void
-__perf_counter_exit_task(struct perf_counter *child_counter,
-			 struct perf_counter_context *child_ctx,
-			 struct task_struct *child)
-{
-	struct perf_counter *parent_counter;
-
-	update_counter_times(child_counter);
-	perf_counter_remove_from_context(child_counter);
-
-	parent_counter = child_counter->parent;
-	/*
-	 * It can happen that parent exits first, and has counters
-	 * that are still around due to the child reference. These
-	 * counters need to be zapped - but otherwise linger.
-	 */
-	if (parent_counter) {
-		sync_child_counter(child_counter, child);
-		free_counter(child_counter);
-	}
-}
-
-/*
- * When a child task exits, feed back counter values to parent counters.
- */
-void perf_counter_exit_task(struct task_struct *child)
-{
-	struct perf_counter *child_counter, *tmp;
-	struct perf_counter_context *child_ctx;
-	unsigned long flags;
-
-	if (likely(!child->perf_counter_ctxp)) {
-		perf_counter_task(child, NULL, 0);
-		return;
-	}
-
-	local_irq_save(flags);
-	/*
-	 * We can't reschedule here because interrupts are disabled,
-	 * and either child is current or it is a task that can't be
-	 * scheduled, so we are now safe from rescheduling changing
-	 * our context.
-	 */
-	child_ctx = child->perf_counter_ctxp;
-	__perf_counter_task_sched_out(child_ctx);
-
-	/*
-	 * Take the context lock here so that if find_get_context is
-	 * reading child->perf_counter_ctxp, we wait until it has
-	 * incremented the context's refcount before we do put_ctx below.
-	 */
-	spin_lock(&child_ctx->lock);
-	child->perf_counter_ctxp = NULL;
-	/*
-	 * If this context is a clone; unclone it so it can't get
-	 * swapped to another process while we're removing all
-	 * the counters from it.
-	 */
-	unclone_ctx(child_ctx);
-	spin_unlock_irqrestore(&child_ctx->lock, flags);
-
-	/*
-	 * Report the task dead after unscheduling the counters so that we
-	 * won't get any samples after PERF_EVENT_EXIT. We can however still
-	 * get a few PERF_EVENT_READ events.
-	 */
-	perf_counter_task(child, child_ctx, 0);
-
-	/*
-	 * We can recurse on the same lock type through:
-	 *
-	 *   __perf_counter_exit_task()
-	 *     sync_child_counter()
-	 *       fput(parent_counter->filp)
-	 *         perf_release()
-	 *           mutex_lock(&ctx->mutex)
-	 *
-	 * But since its the parent context it won't be the same instance.
-	 */
-	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
-
-again:
-	list_for_each_entry_safe(child_counter, tmp, &child_ctx->group_list,
-				 group_entry)
-		__perf_counter_exit_task(child_counter, child_ctx, child);
-
-	/*
-	 * If the last counter was a group counter, it will have appended all
-	 * its siblings to the list, but we obtained 'tmp' before that which
-	 * will still point to the list head terminating the iteration.
-	 */
-	if (!list_empty(&child_ctx->group_list))
-		goto again;
-
-	mutex_unlock(&child_ctx->mutex);
-
-	put_ctx(child_ctx);
-}
-
-/*
- * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
- */
-void perf_counter_free_task(struct task_struct *task)
-{
-	struct perf_counter_context *ctx = task->perf_counter_ctxp;
-	struct perf_counter *counter, *tmp;
-
-	if (!ctx)
-		return;
-
-	mutex_lock(&ctx->mutex);
-again:
-	list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) {
-		struct perf_counter *parent = counter->parent;
-
-		if (WARN_ON_ONCE(!parent))
-			continue;
-
-		mutex_lock(&parent->child_mutex);
-		list_del_init(&counter->child_list);
-		mutex_unlock(&parent->child_mutex);
-
-		fput(parent->filp);
-
-		list_del_counter(counter, ctx);
-		free_counter(counter);
-	}
-
-	if (!list_empty(&ctx->group_list))
-		goto again;
-
-	mutex_unlock(&ctx->mutex);
-
-	put_ctx(ctx);
-}
-
-/*
- * Initialize the perf_counter context in task_struct
- */
-int perf_counter_init_task(struct task_struct *child)
-{
-	struct perf_counter_context *child_ctx, *parent_ctx;
-	struct perf_counter_context *cloned_ctx;
-	struct perf_counter *counter;
-	struct task_struct *parent = current;
-	int inherited_all = 1;
-	int ret = 0;
-
-	child->perf_counter_ctxp = NULL;
-
-	mutex_init(&child->perf_counter_mutex);
-	INIT_LIST_HEAD(&child->perf_counter_list);
-
-	if (likely(!parent->perf_counter_ctxp))
-		return 0;
-
-	/*
-	 * This is executed from the parent task context, so inherit
-	 * counters that have been marked for cloning.
-	 * First allocate and initialize a context for the child.
-	 */
-
-	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-	if (!child_ctx)
-		return -ENOMEM;
-
-	__perf_counter_init_context(child_ctx, child);
-	child->perf_counter_ctxp = child_ctx;
-	get_task_struct(child);
-
-	/*
-	 * If the parent's context is a clone, pin it so it won't get
-	 * swapped under us.
-	 */
-	parent_ctx = perf_pin_task_context(parent);
-
-	/*
-	 * No need to check if parent_ctx != NULL here; since we saw
-	 * it non-NULL earlier, the only reason for it to become NULL
-	 * is if we exit, and since we're currently in the middle of
-	 * a fork we can't be exiting at the same time.
-	 */
-
-	/*
-	 * Lock the parent list. No need to lock the child - not PID
-	 * hashed yet and not running, so nobody can access it.
-	 */
-	mutex_lock(&parent_ctx->mutex);
-
-	/*
-	 * We dont have to disable NMIs - we are only looking at
-	 * the list, not manipulating it:
-	 */
-	list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
-		if (counter != counter->group_leader)
-			continue;
-
-		if (!counter->attr.inherit) {
-			inherited_all = 0;
-			continue;
-		}
-
-		ret = inherit_group(counter, parent, parent_ctx,
-					     child, child_ctx);
-		if (ret) {
-			inherited_all = 0;
-			break;
-		}
-	}
-
-	if (inherited_all) {
-		/*
-		 * Mark the child context as a clone of the parent
-		 * context, or of whatever the parent is a clone of.
-		 * Note that if the parent is a clone, it could get
-		 * uncloned at any point, but that doesn't matter
-		 * because the list of counters and the generation
-		 * count can't have changed since we took the mutex.
-		 */
-		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
-		if (cloned_ctx) {
-			child_ctx->parent_ctx = cloned_ctx;
-			child_ctx->parent_gen = parent_ctx->parent_gen;
-		} else {
-			child_ctx->parent_ctx = parent_ctx;
-			child_ctx->parent_gen = parent_ctx->generation;
-		}
-		get_ctx(child_ctx->parent_ctx);
-	}
-
-	mutex_unlock(&parent_ctx->mutex);
-
-	perf_unpin_context(parent_ctx);
-
-	return ret;
-}
-
-static void __cpuinit perf_counter_init_cpu(int cpu)
-{
-	struct perf_cpu_context *cpuctx;
-
-	cpuctx = &per_cpu(perf_cpu_context, cpu);
-	__perf_counter_init_context(&cpuctx->ctx, NULL);
-
-	spin_lock(&perf_resource_lock);
-	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
-	spin_unlock(&perf_resource_lock);
-
-	hw_perf_counter_setup(cpu);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static void __perf_counter_exit_cpu(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter_context *ctx = &cpuctx->ctx;
-	struct perf_counter *counter, *tmp;
-
-	list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry)
-		__perf_counter_remove_from_context(counter);
-}
-static void perf_counter_exit_cpu(int cpu)
-{
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &cpuctx->ctx;
-
-	mutex_lock(&ctx->mutex);
-	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
-	mutex_unlock(&ctx->mutex);
-}
-#else
-static inline void perf_counter_exit_cpu(int cpu) { }
-#endif
-
-static int __cpuinit
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (long)hcpu;
-
-	switch (action) {
-
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		perf_counter_init_cpu(cpu);
-		break;
-
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		hw_perf_counter_setup_online(cpu);
-		break;
-
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		perf_counter_exit_cpu(cpu);
-		break;
-
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-	.notifier_call		= perf_cpu_notify,
-	.priority		= 20,
-};
-
-void __init perf_counter_init(void)
-{
-	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
-	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-			(void *)(long)smp_processor_id());
-	register_cpu_notifier(&perf_cpu_nb);
-}
-
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
-{
-	return sprintf(buf, "%d\n", perf_reserved_percpu);
-}
-
-static ssize_t
-perf_set_reserve_percpu(struct sysdev_class *class,
-			const char *buf,
-			size_t count)
-{
-	struct perf_cpu_context *cpuctx;
-	unsigned long val;
-	int err, cpu, mpt;
-
-	err = strict_strtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val > perf_max_counters)
-		return -EINVAL;
-
-	spin_lock(&perf_resource_lock);
-	perf_reserved_percpu = val;
-	for_each_online_cpu(cpu) {
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		spin_lock_irq(&cpuctx->ctx.lock);
-		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
-			  perf_max_counters - perf_reserved_percpu);
-		cpuctx->max_pertask = mpt;
-		spin_unlock_irq(&cpuctx->ctx.lock);
-	}
-	spin_unlock(&perf_resource_lock);
-
-	return count;
-}
-
-static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
-{
-	return sprintf(buf, "%d\n", perf_overcommit);
-}
-
-static ssize_t
-perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	err = strict_strtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val > 1)
-		return -EINVAL;
-
-	spin_lock(&perf_resource_lock);
-	perf_overcommit = val;
-	spin_unlock(&perf_resource_lock);
-
-	return count;
-}
-
-static SYSDEV_CLASS_ATTR(
-				reserve_percpu,
-				0644,
-				perf_show_reserve_percpu,
-				perf_set_reserve_percpu
-			);
-
-static SYSDEV_CLASS_ATTR(
-				overcommit,
-				0644,
-				perf_show_overcommit,
-				perf_set_overcommit
-			);
-
-static struct attribute *perfclass_attrs[] = {
-	&attr_reserve_percpu.attr,
-	&attr_overcommit.attr,
-	NULL
-};
-
-static struct attribute_group perfclass_attr_group = {
-	.attrs			= perfclass_attrs,
-	.name			= "perf_counters",
-};
-
-static int __init perf_counter_sysfs_init(void)
-{
-	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
-				  &perfclass_attr_group);
-}
-device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 000000000000..6e8b99a04e1e
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5000 @@
+/*
+ * Performance event core code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ *  For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_event.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU events:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_events __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_events __read_mostly;
+static atomic_t nr_mmap_events __read_mostly;
+static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_task_events __read_mostly;
+
+/*
+ * perf event paranoia level:
+ *  -1 - not paranoid at all
+ *   0 - disallow raw tracepoint access for unpriv
+ *   1 - disallow cpu events for unpriv
+ *   2 - disallow kernel profiling for unpriv
+ */
+int sysctl_perf_event_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+	return sysctl_perf_event_paranoid > -1;
+}
+
+static inline bool perf_paranoid_cpu(void)
+{
+	return sysctl_perf_event_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+	return sysctl_perf_event_paranoid > 1;
+}
+
+int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf event sample rate
+ */
+int sysctl_perf_event_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_event_id;
+
+/*
+ * Lock for (sysadmin-configurable) event reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+	return NULL;
+}
+
+void __weak hw_perf_disable(void)		{ barrier(); }
+void __weak hw_perf_enable(void)		{ barrier(); }
+
+void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
+void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_event *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx, int cpu)
+{
+	return 0;
+}
+
+void __weak perf_event_print_debug(void)	{ }
+
+static DEFINE_PER_CPU(int, perf_disable_count);
+
+void __perf_disable(void)
+{
+	__get_cpu_var(perf_disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+	return !--__get_cpu_var(perf_disable_count);
+}
+
+void perf_disable(void)
+{
+	__perf_disable();
+	hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+	if (__perf_enable())
+		hw_perf_enable();
+}
+
+static void get_ctx(struct perf_event_context *ctx)
+{
+	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+	struct perf_event_context *ctx;
+
+	ctx = container_of(head, struct perf_event_context, rcu_head);
+	kfree(ctx);
+}
+
+static void put_ctx(struct perf_event_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->refcount)) {
+		if (ctx->parent_ctx)
+			put_ctx(ctx->parent_ctx);
+		if (ctx->task)
+			put_task_struct(ctx->task);
+		call_rcu(&ctx->rcu_head, free_ctx);
+	}
+}
+
+static void unclone_ctx(struct perf_event_context *ctx)
+{
+	if (ctx->parent_ctx) {
+		put_ctx(ctx->parent_ctx);
+		ctx->parent_ctx = NULL;
+	}
+}
+
+/*
+ * If we inherit events we want to return the parent event id
+ * to userspace.
+ */
+static u64 primary_event_id(struct perf_event *event)
+{
+	u64 id = event->id;
+
+	if (event->parent)
+		id = event->parent->id;
+
+	return id;
+}
+
+/*
+ * Get the perf_event_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_event_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+	struct perf_event_context *ctx;
+
+	rcu_read_lock();
+ retry:
+	ctx = rcu_dereference(task->perf_event_ctxp);
+	if (ctx) {
+		/*
+		 * If this context is a clone of another, it might
+		 * get swapped for another underneath us by
+		 * perf_event_task_sched_out, though the
+		 * rcu_read_lock() protects us from any context
+		 * getting freed.  Lock the context and check if it
+		 * got swapped before we could get the lock, and retry
+		 * if so.  If we locked the right context, then it
+		 * can't get swapped on us any more.
+		 */
+		spin_lock_irqsave(&ctx->lock, *flags);
+		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+			spin_unlock_irqrestore(&ctx->lock, *flags);
+			goto retry;
+		}
+
+		if (!atomic_inc_not_zero(&ctx->refcount)) {
+			spin_unlock_irqrestore(&ctx->lock, *flags);
+			ctx = NULL;
+		}
+	}
+	rcu_read_unlock();
+	return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task.  This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+	unsigned long flags;
+
+	ctx = perf_lock_task_context(task, &flags);
+	if (ctx) {
+		++ctx->pin_count;
+		spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+	return ctx;
+}
+
+static void perf_unpin_context(struct perf_event_context *ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	--ctx->pin_count;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	put_ctx(ctx);
+}
+
+/*
+ * Add a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct perf_event *group_leader = event->group_leader;
+
+	/*
+	 * Depending on whether it is a standalone or sibling event,
+	 * add it straight to the context's event list, or to the group
+	 * leader's sibling list:
+	 */
+	if (group_leader == event)
+		list_add_tail(&event->group_entry, &ctx->group_list);
+	else {
+		list_add_tail(&event->group_entry, &group_leader->sibling_list);
+		group_leader->nr_siblings++;
+	}
+
+	list_add_rcu(&event->event_entry, &ctx->event_list);
+	ctx->nr_events++;
+	if (event->attr.inherit_stat)
+		ctx->nr_stat++;
+}
+
+/*
+ * Remove a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct perf_event *sibling, *tmp;
+
+	if (list_empty(&event->group_entry))
+		return;
+	ctx->nr_events--;
+	if (event->attr.inherit_stat)
+		ctx->nr_stat--;
+
+	list_del_init(&event->group_entry);
+	list_del_rcu(&event->event_entry);
+
+	if (event->group_leader != event)
+		event->group_leader->nr_siblings--;
+
+	/*
+	 * If this was a group event with sibling events then
+	 * upgrade the siblings to singleton events by adding them
+	 * to the context list directly:
+	 */
+	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+
+		list_move_tail(&sibling->group_entry, &ctx->group_list);
+		sibling->group_leader = sibling;
+	}
+}
+
+static void
+event_sched_out(struct perf_event *event,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_event_context *ctx)
+{
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return;
+
+	event->state = PERF_EVENT_STATE_INACTIVE;
+	if (event->pending_disable) {
+		event->pending_disable = 0;
+		event->state = PERF_EVENT_STATE_OFF;
+	}
+	event->tstamp_stopped = ctx->time;
+	event->pmu->disable(event);
+	event->oncpu = -1;
+
+	if (!is_software_event(event))
+		cpuctx->active_oncpu--;
+	ctx->nr_active--;
+	if (event->attr.exclusive || !cpuctx->active_oncpu)
+		cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_event *group_event,
+		struct perf_cpu_context *cpuctx,
+		struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+
+	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+		return;
+
+	event_sched_out(group_event, cpuctx, ctx);
+
+	/*
+	 * Schedule out siblings (if any):
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry)
+		event_sched_out(event, cpuctx, ctx);
+
+	if (group_event->attr.exclusive)
+		cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance event
+ *
+ * We disable the event on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_event_remove_from_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * events on a global level.
+	 */
+	perf_disable();
+
+	event_sched_out(event, cpuctx, ctx);
+
+	list_del_event(event, ctx);
+
+	if (!ctx->task) {
+		/*
+		 * Allow more per task events with respect to the
+		 * reservation:
+		 */
+		cpuctx->max_pertask =
+			min(perf_max_events - ctx->nr_events,
+			    perf_max_events - perf_reserved_percpu);
+	}
+
+	perf_enable();
+	spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the event from a task's (or a CPU's) list of events.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU events are removed with a smp call. For task events we only
+ * call when the task is on a CPU.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid.  This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_event_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_event_remove_from_context(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Per cpu events are removed via an smp call and
+		 * the removal is always sucessful.
+		 */
+		smp_call_function_single(event->cpu,
+					 __perf_event_remove_from_context,
+					 event, 1);
+		return;
+	}
+
+retry:
+	task_oncpu_function_call(task, __perf_event_remove_from_context,
+				 event);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the context is active we need to retry the smp call.
+	 */
+	if (ctx->nr_active && !list_empty(&event->group_entry)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents that this context is scheduled in so we
+	 * can remove the event safely, if the call above did not
+	 * succeed.
+	 */
+	if (!list_empty(&event->group_entry)) {
+		list_del_event(event, ctx);
+	}
+	spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+	return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+	u64 now = perf_clock();
+
+	ctx->time += now - ctx->timestamp;
+	ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	u64 run_end;
+
+	if (event->state < PERF_EVENT_STATE_INACTIVE ||
+	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+		return;
+
+	event->total_time_enabled = ctx->time - event->tstamp_enabled;
+
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		run_end = event->tstamp_stopped;
+	else
+		run_end = ctx->time;
+
+	event->total_time_running = run_end - event->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+	struct perf_event *event;
+
+	update_event_times(leader);
+	list_for_each_entry(event, &leader->sibling_list, group_entry)
+		update_event_times(event);
+}
+
+/*
+ * Cross CPU call to disable a performance event
+ */
+static void __perf_event_disable(void *info)
+{
+	struct perf_event *event = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = event->ctx;
+
+	/*
+	 * If this is a per-task event, need to check whether this
+	 * event's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * If the event is on, turn it off.
+	 * If it is in error state, leave it in error state.
+	 */
+	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+		update_context_time(ctx);
+		update_group_times(event);
+		if (event == event->group_leader)
+			group_sched_out(event, cpuctx, ctx);
+		else
+			event_sched_out(event, cpuctx, ctx);
+		event->state = PERF_EVENT_STATE_OFF;
+	}
+
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid.  This condition is satisifed when called through
+ * perf_event_for_each_child or perf_event_for_each because they
+ * hold the top-level event's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_event.
+ * When called from perf_pending_event it's OK because event->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_event_task_sched_out for this context.
+ */
+static void perf_event_disable(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Disable the event on the cpu that it's on
+		 */
+		smp_call_function_single(event->cpu, __perf_event_disable,
+					 event, 1);
+		return;
+	}
+
+ retry:
+	task_oncpu_function_call(task, __perf_event_disable, event);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the event is still active, we need to retry the cross-call.
+	 */
+	if (event->state == PERF_EVENT_STATE_ACTIVE) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		update_group_times(event);
+		event->state = PERF_EVENT_STATE_OFF;
+	}
+
+	spin_unlock_irq(&ctx->lock);
+}
+
+static int
+event_sched_in(struct perf_event *event,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_event_context *ctx,
+		 int cpu)
+{
+	if (event->state <= PERF_EVENT_STATE_OFF)
+		return 0;
+
+	event->state = PERF_EVENT_STATE_ACTIVE;
+	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+	/*
+	 * The new state must be visible before we turn it on in the hardware:
+	 */
+	smp_wmb();
+
+	if (event->pmu->enable(event)) {
+		event->state = PERF_EVENT_STATE_INACTIVE;
+		event->oncpu = -1;
+		return -EAGAIN;
+	}
+
+	event->tstamp_running += ctx->time - event->tstamp_stopped;
+
+	if (!is_software_event(event))
+		cpuctx->active_oncpu++;
+	ctx->nr_active++;
+
+	if (event->attr.exclusive)
+		cpuctx->exclusive = 1;
+
+	return 0;
+}
+
+static int
+group_sched_in(struct perf_event *group_event,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx,
+	       int cpu)
+{
+	struct perf_event *event, *partial_group;
+	int ret;
+
+	if (group_event->state == PERF_EVENT_STATE_OFF)
+		return 0;
+
+	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+	if (ret)
+		return ret < 0 ? ret : 0;
+
+	if (event_sched_in(group_event, cpuctx, ctx, cpu))
+		return -EAGAIN;
+
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+		if (event_sched_in(event, cpuctx, ctx, cpu)) {
+			partial_group = event;
+			goto group_error;
+		}
+	}
+
+	return 0;
+
+group_error:
+	/*
+	 * Groups can be scheduled in as one unit only, so undo any
+	 * partial group before returning:
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+		if (event == partial_group)
+			break;
+		event_sched_out(event, cpuctx, ctx);
+	}
+	event_sched_out(group_event, cpuctx, ctx);
+
+	return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software events,
+ * 0 if the group contains any hardware events.
+ */
+static int is_software_only_group(struct perf_event *leader)
+{
+	struct perf_event *event;
+
+	if (!is_software_event(leader))
+		return 0;
+
+	list_for_each_entry(event, &leader->sibling_list, group_entry)
+		if (!is_software_event(event))
+			return 0;
+
+	return 1;
+}
+
+/*
+ * Work out whether we can put this event group on the CPU now.
+ */
+static int group_can_go_on(struct perf_event *event,
+			   struct perf_cpu_context *cpuctx,
+			   int can_add_hw)
+{
+	/*
+	 * Groups consisting entirely of software events can always go on.
+	 */
+	if (is_software_only_group(event))
+		return 1;
+	/*
+	 * If an exclusive group is already on, no other hardware
+	 * events can go on.
+	 */
+	if (cpuctx->exclusive)
+		return 0;
+	/*
+	 * If this group is exclusive and there are already
+	 * events on the CPU, it can't go on.
+	 */
+	if (event->attr.exclusive && cpuctx->active_oncpu)
+		return 0;
+	/*
+	 * Otherwise, try to add it if all previous groups were able
+	 * to go on.
+	 */
+	return can_add_hw;
+}
+
+static void add_event_to_ctx(struct perf_event *event,
+			       struct perf_event_context *ctx)
+{
+	list_add_event(event, ctx);
+	event->tstamp_enabled = ctx->time;
+	event->tstamp_running = ctx->time;
+	event->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance event
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_event *leader = event->group_leader;
+	int cpu = smp_processor_id();
+	int err;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 * Or possibly this is the right context but it isn't
+	 * on this cpu because it had no events.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	update_context_time(ctx);
+
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * events on a global level. NOP for non NMI based events.
+	 */
+	perf_disable();
+
+	add_event_to_ctx(event, ctx);
+
+	/*
+	 * Don't put the event on if it is disabled or if
+	 * it is in a group and the group isn't on.
+	 */
+	if (event->state != PERF_EVENT_STATE_INACTIVE ||
+	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
+		goto unlock;
+
+	/*
+	 * An exclusive event can't go on if there are already active
+	 * hardware events, and no hardware event can go on if there
+	 * is already an exclusive event on.
+	 */
+	if (!group_can_go_on(event, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = event_sched_in(event, cpuctx, ctx, cpu);
+
+	if (err) {
+		/*
+		 * This event couldn't go on.  If it is in a group
+		 * then we have to pull the whole group off.
+		 * If the event group is pinned then put it in error state.
+		 */
+		if (leader != event)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->attr.pinned) {
+			update_group_times(leader);
+			leader->state = PERF_EVENT_STATE_ERROR;
+		}
+	}
+
+	if (!err && !ctx->task && cpuctx->max_pertask)
+		cpuctx->max_pertask--;
+
+ unlock:
+	perf_enable();
+
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance event to a context
+ *
+ * First we add the event to the list with the hardware enable bit
+ * in event->hw_config cleared.
+ *
+ * If the event is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_event_context *ctx,
+			struct perf_event *event,
+			int cpu)
+{
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Per cpu events are installed via an smp call and
+		 * the install is always sucessful.
+		 */
+		smp_call_function_single(cpu, __perf_install_in_context,
+					 event, 1);
+		return;
+	}
+
+retry:
+	task_oncpu_function_call(task, __perf_install_in_context,
+				 event);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * we need to retry the smp call.
+	 */
+	if (ctx->is_active && list_empty(&event->group_entry)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents that this context is scheduled in so we
+	 * can add the event safely, if it the call above did not
+	 * succeed.
+	 */
+	if (list_empty(&event->group_entry))
+		add_event_to_ctx(event, ctx);
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Put a event into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_event_mark_enabled(struct perf_event *event,
+					struct perf_event_context *ctx)
+{
+	struct perf_event *sub;
+
+	event->state = PERF_EVENT_STATE_INACTIVE;
+	event->tstamp_enabled = ctx->time - event->total_time_enabled;
+	list_for_each_entry(sub, &event->sibling_list, group_entry)
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+			sub->tstamp_enabled =
+				ctx->time - sub->total_time_enabled;
+}
+
+/*
+ * Cross CPU call to enable a performance event
+ */
+static void __perf_event_enable(void *info)
+{
+	struct perf_event *event = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_event *leader = event->group_leader;
+	int err;
+
+	/*
+	 * If this is a per-task event, need to check whether this
+	 * event's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	update_context_time(ctx);
+
+	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+		goto unlock;
+	__perf_event_mark_enabled(event, ctx);
+
+	/*
+	 * If the event is in a group and isn't the group leader,
+	 * then don't put it on unless the group is on.
+	 */
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+		goto unlock;
+
+	if (!group_can_go_on(event, cpuctx, 1)) {
+		err = -EEXIST;
+	} else {
+		perf_disable();
+		if (event == leader)
+			err = group_sched_in(event, cpuctx, ctx,
+					     smp_processor_id());
+		else
+			err = event_sched_in(event, cpuctx, ctx,
+					       smp_processor_id());
+		perf_enable();
+	}
+
+	if (err) {
+		/*
+		 * If this event can't go on and it's part of a
+		 * group, then the whole group has to come off.
+		 */
+		if (leader != event)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->attr.pinned) {
+			update_group_times(leader);
+			leader->state = PERF_EVENT_STATE_ERROR;
+		}
+	}
+
+ unlock:
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_event_for_each_child or perf_event_for_each as described
+ * for perf_event_disable.
+ */
+static void perf_event_enable(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Enable the event on the cpu that it's on
+		 */
+		smp_call_function_single(event->cpu, __perf_event_enable,
+					 event, 1);
+		return;
+	}
+
+	spin_lock_irq(&ctx->lock);
+	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+		goto out;
+
+	/*
+	 * If the event is in error state, clear that first.
+	 * That way, if we see the event in error state below, we
+	 * know that it has gone back into error state, as distinct
+	 * from the task having been scheduled away before the
+	 * cross-call arrived.
+	 */
+	if (event->state == PERF_EVENT_STATE_ERROR)
+		event->state = PERF_EVENT_STATE_OFF;
+
+ retry:
+	spin_unlock_irq(&ctx->lock);
+	task_oncpu_function_call(task, __perf_event_enable, event);
+
+	spin_lock_irq(&ctx->lock);
+
+	/*
+	 * If the context is active and the event is still off,
+	 * we need to retry the cross-call.
+	 */
+	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+		goto retry;
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (event->state == PERF_EVENT_STATE_OFF)
+		__perf_event_mark_enabled(event, ctx);
+
+ out:
+	spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_event_refresh(struct perf_event *event, int refresh)
+{
+	/*
+	 * not supported on inherited events
+	 */
+	if (event->attr.inherit)
+		return -EINVAL;
+
+	atomic_add(refresh, &event->event_limit);
+	perf_event_enable(event);
+
+	return 0;
+}
+
+void __perf_event_sched_out(struct perf_event_context *ctx,
+			      struct perf_cpu_context *cpuctx)
+{
+	struct perf_event *event;
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 0;
+	if (likely(!ctx->nr_events))
+		goto out;
+	update_context_time(ctx);
+
+	perf_disable();
+	if (ctx->nr_active) {
+		list_for_each_entry(event, &ctx->group_list, group_entry) {
+			if (event != event->group_leader)
+				event_sched_out(event, cpuctx, ctx);
+			else
+				group_sched_out(event, cpuctx, ctx);
+		}
+	}
+	perf_enable();
+ out:
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled events.
+ * If the number of enabled events is the same, then the set
+ * of enabled events should be the same, because these are both
+ * inherited contexts, therefore we can't access individual events
+ * in them directly with an fd; we can only enable/disable all
+ * events via prctl, or enable/disable all events in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_event_context *ctx1,
+			 struct perf_event_context *ctx2)
+{
+	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& !ctx1->pin_count && !ctx2->pin_count;
+}
+
+static void __perf_event_read(void *event);
+
+static void __perf_event_sync_stat(struct perf_event *event,
+				     struct perf_event *next_event)
+{
+	u64 value;
+
+	if (!event->attr.inherit_stat)
+		return;
+
+	/*
+	 * Update the event value, we cannot use perf_event_read()
+	 * because we're in the middle of a context switch and have IRQs
+	 * disabled, which upsets smp_call_function_single(), however
+	 * we know the event must be on the current CPU, therefore we
+	 * don't need to use it.
+	 */
+	switch (event->state) {
+	case PERF_EVENT_STATE_ACTIVE:
+		__perf_event_read(event);
+		break;
+
+	case PERF_EVENT_STATE_INACTIVE:
+		update_event_times(event);
+		break;
+
+	default:
+		break;
+	}
+
+	/*
+	 * In order to keep per-task stats reliable we need to flip the event
+	 * values when we flip the contexts.
+	 */
+	value = atomic64_read(&next_event->count);
+	value = atomic64_xchg(&event->count, value);
+	atomic64_set(&next_event->count, value);
+
+	swap(event->total_time_enabled, next_event->total_time_enabled);
+	swap(event->total_time_running, next_event->total_time_running);
+
+	/*
+	 * Since we swizzled the values, update the user visible data too.
+	 */
+	perf_event_update_userpage(event);
+	perf_event_update_userpage(next_event);
+}
+
+#define list_next_entry(pos, member) \
+	list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_event_sync_stat(struct perf_event_context *ctx,
+				   struct perf_event_context *next_ctx)
+{
+	struct perf_event *event, *next_event;
+
+	if (!ctx->nr_stat)
+		return;
+
+	event = list_first_entry(&ctx->event_list,
+				   struct perf_event, event_entry);
+
+	next_event = list_first_entry(&next_ctx->event_list,
+					struct perf_event, event_entry);
+
+	while (&event->event_entry != &ctx->event_list &&
+	       &next_event->event_entry != &next_ctx->event_list) {
+
+		__perf_event_sync_stat(event, next_event);
+
+		event = list_next_entry(event, event_entry);
+		next_event = list_next_entry(next_event, event_entry);
+	}
+}
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+				 struct task_struct *next, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *next_ctx;
+	struct perf_event_context *parent;
+	struct pt_regs *regs;
+	int do_switch = 1;
+
+	regs = task_pt_regs(task);
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+	if (likely(!ctx || !cpuctx->task_ctx))
+		return;
+
+	update_context_time(ctx);
+
+	rcu_read_lock();
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_ctx = next->perf_event_ctxp;
+	if (parent && next_ctx &&
+	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+		/*
+		 * Looks like the two contexts are clones, so we might be
+		 * able to optimize the context switch.  We lock both
+		 * contexts and check that they are clones under the
+		 * lock (including re-checking that neither has been
+		 * uncloned in the meantime).  It doesn't matter which
+		 * order we take the locks because no other cpu could
+		 * be trying to lock both of these tasks.
+		 */
+		spin_lock(&ctx->lock);
+		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+		if (context_equiv(ctx, next_ctx)) {
+			/*
+			 * XXX do we need a memory barrier of sorts
+			 * wrt to rcu_dereference() of perf_event_ctxp
+			 */
+			task->perf_event_ctxp = next_ctx;
+			next->perf_event_ctxp = ctx;
+			ctx->task = next;
+			next_ctx->task = task;
+			do_switch = 0;
+
+			perf_event_sync_stat(ctx, next_ctx);
+		}
+		spin_unlock(&next_ctx->lock);
+		spin_unlock(&ctx->lock);
+	}
+	rcu_read_unlock();
+
+	if (do_switch) {
+		__perf_event_sched_out(ctx, cpuctx);
+		cpuctx->task_ctx = NULL;
+	}
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	if (!cpuctx->task_ctx)
+		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
+	__perf_event_sched_out(ctx, cpuctx);
+	cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+{
+	__perf_event_sched_out(&cpuctx->ctx, cpuctx);
+}
+
+static void
+__perf_event_sched_in(struct perf_event_context *ctx,
+			struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct perf_event *event;
+	int can_add_hw = 1;
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	if (likely(!ctx->nr_events))
+		goto out;
+
+	ctx->timestamp = perf_clock();
+
+	perf_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	list_for_each_entry(event, &ctx->group_list, group_entry) {
+		if (event->state <= PERF_EVENT_STATE_OFF ||
+		    !event->attr.pinned)
+			continue;
+		if (event->cpu != -1 && event->cpu != cpu)
+			continue;
+
+		if (event != event->group_leader)
+			event_sched_in(event, cpuctx, ctx, cpu);
+		else {
+			if (group_can_go_on(event, cpuctx, 1))
+				group_sched_in(event, cpuctx, ctx, cpu);
+		}
+
+		/*
+		 * If this pinned group hasn't been scheduled,
+		 * put it in error state.
+		 */
+		if (event->state == PERF_EVENT_STATE_INACTIVE) {
+			update_group_times(event);
+			event->state = PERF_EVENT_STATE_ERROR;
+		}
+	}
+
+	list_for_each_entry(event, &ctx->group_list, group_entry) {
+		/*
+		 * Ignore events in OFF or ERROR state, and
+		 * ignore pinned events since we did them already.
+		 */
+		if (event->state <= PERF_EVENT_STATE_OFF ||
+		    event->attr.pinned)
+			continue;
+
+		/*
+		 * Listen to the 'cpu' scheduling filter constraint
+		 * of events:
+		 */
+		if (event->cpu != -1 && event->cpu != cpu)
+			continue;
+
+		if (event != event->group_leader) {
+			if (event_sched_in(event, cpuctx, ctx, cpu))
+				can_add_hw = 0;
+		} else {
+			if (group_can_go_on(event, cpuctx, can_add_hw)) {
+				if (group_sched_in(event, cpuctx, ctx, cpu))
+					can_add_hw = 0;
+			}
+		}
+	}
+	perf_enable();
+ out:
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void perf_event_task_sched_in(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_event_context *ctx = task->perf_event_ctxp;
+
+	if (likely(!ctx))
+		return;
+	if (cpuctx->task_ctx == ctx)
+		return;
+	__perf_event_sched_in(ctx, cpuctx, cpu);
+	cpuctx->task_ctx = ctx;
+}
+
+static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct perf_event_context *ctx = &cpuctx->ctx;
+
+	__perf_event_sched_in(ctx, cpuctx, cpu);
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
+static void perf_adjust_period(struct perf_event *event, u64 events)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 period, sample_period;
+	s64 delta;
+
+	events *= hwc->sample_period;
+	period = div64_u64(events, event->attr.sample_freq);
+
+	delta = (s64)(period - hwc->sample_period);
+	delta = (delta + 7) / 8; /* low pass filter */
+
+	sample_period = hwc->sample_period + delta;
+
+	if (!sample_period)
+		sample_period = 1;
+
+	hwc->sample_period = sample_period;
+}
+
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	u64 interrupts, freq;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(event, &ctx->group_list, group_entry) {
+		if (event->state != PERF_EVENT_STATE_ACTIVE)
+			continue;
+
+		hwc = &event->hw;
+
+		interrupts = hwc->interrupts;
+		hwc->interrupts = 0;
+
+		/*
+		 * unthrottle events on the tick
+		 */
+		if (interrupts == MAX_INTERRUPTS) {
+			perf_log_throttle(event, 1);
+			event->pmu->unthrottle(event);
+			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
+		}
+
+		if (!event->attr.freq || !event->attr.sample_freq)
+			continue;
+
+		/*
+		 * if the specified freq < HZ then we need to skip ticks
+		 */
+		if (event->attr.sample_freq < HZ) {
+			freq = event->attr.sample_freq;
+
+			hwc->freq_count += freq;
+			hwc->freq_interrupts += interrupts;
+
+			if (hwc->freq_count < HZ)
+				continue;
+
+			interrupts = hwc->freq_interrupts;
+			hwc->freq_interrupts = 0;
+			hwc->freq_count -= HZ;
+		} else
+			freq = HZ;
+
+		perf_adjust_period(event, freq * interrupts);
+
+		/*
+		 * In order to avoid being stalled by an (accidental) huge
+		 * sample period, force reset the sample period if we didn't
+		 * get any events in this freq period.
+		 */
+		if (!interrupts) {
+			perf_disable();
+			event->pmu->disable(event);
+			atomic64_set(&hwc->period_left, 0);
+			event->pmu->enable(event);
+			perf_enable();
+		}
+	}
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's events:
+ */
+static void rotate_ctx(struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+
+	if (!ctx->nr_events)
+		return;
+
+	spin_lock(&ctx->lock);
+	/*
+	 * Rotate the first entry last (works just fine for group events too):
+	 */
+	perf_disable();
+	list_for_each_entry(event, &ctx->group_list, group_entry) {
+		list_move_tail(&event->group_entry, &ctx->group_list);
+		break;
+	}
+	perf_enable();
+
+	spin_unlock(&ctx->lock);
+}
+
+void perf_event_task_tick(struct task_struct *curr, int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+
+	if (!atomic_read(&nr_events))
+		return;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	ctx = curr->perf_event_ctxp;
+
+	perf_ctx_adjust_freq(&cpuctx->ctx);
+	if (ctx)
+		perf_ctx_adjust_freq(ctx);
+
+	perf_event_cpu_sched_out(cpuctx);
+	if (ctx)
+		__perf_event_task_sched_out(ctx);
+
+	rotate_ctx(&cpuctx->ctx);
+	if (ctx)
+		rotate_ctx(ctx);
+
+	perf_event_cpu_sched_in(cpuctx, cpu);
+	if (ctx)
+		perf_event_task_sched_in(curr, cpu);
+}
+
+/*
+ * Enable all of a task's events that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_event_enable_on_exec(struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+	struct perf_event *event;
+	unsigned long flags;
+	int enabled = 0;
+
+	local_irq_save(flags);
+	ctx = task->perf_event_ctxp;
+	if (!ctx || !ctx->nr_events)
+		goto out;
+
+	__perf_event_task_sched_out(ctx);
+
+	spin_lock(&ctx->lock);
+
+	list_for_each_entry(event, &ctx->group_list, group_entry) {
+		if (!event->attr.enable_on_exec)
+			continue;
+		event->attr.enable_on_exec = 0;
+		if (event->state >= PERF_EVENT_STATE_INACTIVE)
+			continue;
+		__perf_event_mark_enabled(event, ctx);
+		enabled = 1;
+	}
+
+	/*
+	 * Unclone this context if we enabled any event.
+	 */
+	if (enabled)
+		unclone_ctx(ctx);
+
+	spin_unlock(&ctx->lock);
+
+	perf_event_task_sched_in(task, smp_processor_id());
+ out:
+	local_irq_restore(flags);
+}
+
+/*
+ * Cross CPU call to read the hardware event
+ */
+static void __perf_event_read(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
+	unsigned long flags;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu.  If not it has been
+	 * scheduled out before the smp call arrived.  In that case
+	 * event->count would have been updated to a recent sample
+	 * when the event was scheduled out.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	local_irq_save(flags);
+	if (ctx->is_active)
+		update_context_time(ctx);
+	event->pmu->read(event);
+	update_event_times(event);
+	local_irq_restore(flags);
+}
+
+static u64 perf_event_read(struct perf_event *event)
+{
+	/*
+	 * If event is enabled and currently active on a CPU, update the
+	 * value in the event structure:
+	 */
+	if (event->state == PERF_EVENT_STATE_ACTIVE) {
+		smp_call_function_single(event->oncpu,
+					 __perf_event_read, event, 1);
+	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		update_event_times(event);
+	}
+
+	return atomic64_read(&event->count);
+}
+
+/*
+ * Initialize the perf_event context in a task_struct:
+ */
+static void
+__perf_event_init_context(struct perf_event_context *ctx,
+			    struct task_struct *task)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->lock);
+	mutex_init(&ctx->mutex);
+	INIT_LIST_HEAD(&ctx->group_list);
+	INIT_LIST_HEAD(&ctx->event_list);
+	atomic_set(&ctx->refcount, 1);
+	ctx->task = task;
+}
+
+static struct perf_event_context *find_get_context(pid_t pid, int cpu)
+{
+	struct perf_event_context *ctx;
+	struct perf_cpu_context *cpuctx;
+	struct task_struct *task;
+	unsigned long flags;
+	int err;
+
+	/*
+	 * If cpu is not a wildcard then this is a percpu event:
+	 */
+	if (cpu != -1) {
+		/* Must be root to operate on a CPU event: */
+		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EACCES);
+
+		if (cpu < 0 || cpu > num_possible_cpus())
+			return ERR_PTR(-EINVAL);
+
+		/*
+		 * We could be clever and allow to attach a event to an
+		 * offline CPU and activate it when the CPU comes up, but
+		 * that's for later.
+		 */
+		if (!cpu_isset(cpu, cpu_online_map))
+			return ERR_PTR(-ENODEV);
+
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
+		get_ctx(ctx);
+
+		return ctx;
+	}
+
+	rcu_read_lock();
+	if (!pid)
+		task = current;
+	else
+		task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	/*
+	 * Can't attach events to a dying task.
+	 */
+	err = -ESRCH;
+	if (task->flags & PF_EXITING)
+		goto errout;
+
+	/* Reuse ptrace permission checks for now. */
+	err = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto errout;
+
+ retry:
+	ctx = perf_lock_task_context(task, &flags);
+	if (ctx) {
+		unclone_ctx(ctx);
+		spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+
+	if (!ctx) {
+		ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+		err = -ENOMEM;
+		if (!ctx)
+			goto errout;
+		__perf_event_init_context(ctx, task);
+		get_ctx(ctx);
+		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
+			/*
+			 * We raced with some other task; use
+			 * the context they set.
+			 */
+			kfree(ctx);
+			goto retry;
+		}
+		get_task_struct(task);
+	}
+
+	put_task_struct(task);
+	return ctx;
+
+ errout:
+	put_task_struct(task);
+	return ERR_PTR(err);
+}
+
+static void free_event_rcu(struct rcu_head *head)
+{
+	struct perf_event *event;
+
+	event = container_of(head, struct perf_event, rcu_head);
+	if (event->ns)
+		put_pid_ns(event->ns);
+	kfree(event);
+}
+
+static void perf_pending_sync(struct perf_event *event);
+
+static void free_event(struct perf_event *event)
+{
+	perf_pending_sync(event);
+
+	if (!event->parent) {
+		atomic_dec(&nr_events);
+		if (event->attr.mmap)
+			atomic_dec(&nr_mmap_events);
+		if (event->attr.comm)
+			atomic_dec(&nr_comm_events);
+		if (event->attr.task)
+			atomic_dec(&nr_task_events);
+	}
+
+	if (event->output) {
+		fput(event->output->filp);
+		event->output = NULL;
+	}
+
+	if (event->destroy)
+		event->destroy(event);
+
+	put_ctx(event->ctx);
+	call_rcu(&event->rcu_head, free_event_rcu);
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct perf_event *event = file->private_data;
+	struct perf_event_context *ctx = event->ctx;
+
+	file->private_data = NULL;
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_event_remove_from_context(event);
+	mutex_unlock(&ctx->mutex);
+
+	mutex_lock(&event->owner->perf_event_mutex);
+	list_del_init(&event->owner_entry);
+	mutex_unlock(&event->owner->perf_event_mutex);
+	put_task_struct(event->owner);
+
+	free_event(event);
+
+	return 0;
+}
+
+static int perf_event_read_size(struct perf_event *event)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += event->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+
+	return size;
+}
+
+static u64 perf_event_read_value(struct perf_event *event)
+{
+	struct perf_event *child;
+	u64 total = 0;
+
+	total += perf_event_read(event);
+	list_for_each_entry(child, &event->child_list, child_list)
+		total += perf_event_read(child);
+
+	return total;
+}
+
+static int perf_event_read_entry(struct perf_event *event,
+				   u64 read_format, char __user *buf)
+{
+	int n = 0, count = 0;
+	u64 values[2];
+
+	values[n++] = perf_event_read_value(event);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(event);
+
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
+}
+
+static int perf_event_read_group(struct perf_event *event,
+				   u64 read_format, char __user *buf)
+{
+	struct perf_event *leader = event->group_leader, *sub;
+	int n = 0, size = 0, err = -EFAULT;
+	u64 values[3];
+
+	values[n++] = 1 + leader->nr_siblings;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = leader->total_time_enabled +
+			atomic64_read(&leader->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = leader->total_time_running +
+			atomic64_read(&leader->child_total_time_running);
+	}
+
+	size = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, size))
+		return -EFAULT;
+
+	err = perf_event_read_entry(leader, read_format, buf + size);
+	if (err < 0)
+		return err;
+
+	size += err;
+
+	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+		err = perf_event_read_entry(sub, read_format,
+				buf + size);
+		if (err < 0)
+			return err;
+
+		size += err;
+	}
+
+	return size;
+}
+
+static int perf_event_read_one(struct perf_event *event,
+				 u64 read_format, char __user *buf)
+{
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = perf_event_read_value(event);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = event->total_time_enabled +
+			atomic64_read(&event->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = event->total_time_running +
+			atomic64_read(&event->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(event);
+
+	if (copy_to_user(buf, values, n * sizeof(u64)))
+		return -EFAULT;
+
+	return n * sizeof(u64);
+}
+
+/*
+ * Read the performance event - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+{
+	u64 read_format = event->attr.read_format;
+	int ret;
+
+	/*
+	 * Return end-of-file for a read on a event that is in
+	 * error state (i.e. because it was pinned but it couldn't be
+	 * scheduled on to the CPU at some point).
+	 */
+	if (event->state == PERF_EVENT_STATE_ERROR)
+		return 0;
+
+	if (count < perf_event_read_size(event))
+		return -ENOSPC;
+
+	WARN_ON_ONCE(event->ctx->parent_ctx);
+	mutex_lock(&event->child_mutex);
+	if (read_format & PERF_FORMAT_GROUP)
+		ret = perf_event_read_group(event, read_format, buf);
+	else
+		ret = perf_event_read_one(event, read_format, buf);
+	mutex_unlock(&event->child_mutex);
+
+	return ret;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct perf_event *event = file->private_data;
+
+	return perf_read_hw(event, buf, count);
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+	struct perf_event *event = file->private_data;
+	struct perf_mmap_data *data;
+	unsigned int events = POLL_HUP;
+
+	rcu_read_lock();
+	data = rcu_dereference(event->data);
+	if (data)
+		events = atomic_xchg(&data->poll, 0);
+	rcu_read_unlock();
+
+	poll_wait(file, &event->waitq, wait);
+
+	return events;
+}
+
+static void perf_event_reset(struct perf_event *event)
+{
+	(void)perf_event_read(event);
+	atomic64_set(&event->count, 0);
+	perf_event_update_userpage(event);
+}
+
+/*
+ * Holding the top-level event's child_mutex means that any
+ * descendant process that has inherited this event will block
+ * in sync_child_event if it goes to exit, thus satisfying the
+ * task existence requirements of perf_event_enable/disable.
+ */
+static void perf_event_for_each_child(struct perf_event *event,
+					void (*func)(struct perf_event *))
+{
+	struct perf_event *child;
+
+	WARN_ON_ONCE(event->ctx->parent_ctx);
+	mutex_lock(&event->child_mutex);
+	func(event);
+	list_for_each_entry(child, &event->child_list, child_list)
+		func(child);
+	mutex_unlock(&event->child_mutex);
+}
+
+static void perf_event_for_each(struct perf_event *event,
+				  void (*func)(struct perf_event *))
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_event *sibling;
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	event = event->group_leader;
+
+	perf_event_for_each_child(event, func);
+	func(event);
+	list_for_each_entry(sibling, &event->sibling_list, group_entry)
+		perf_event_for_each_child(event, func);
+	mutex_unlock(&ctx->mutex);
+}
+
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+	struct perf_event_context *ctx = event->ctx;
+	unsigned long size;
+	int ret = 0;
+	u64 value;
+
+	if (!event->attr.sample_period)
+		return -EINVAL;
+
+	size = copy_from_user(&value, arg, sizeof(value));
+	if (size != sizeof(value))
+		return -EFAULT;
+
+	if (!value)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->lock);
+	if (event->attr.freq) {
+		if (value > sysctl_perf_event_sample_rate) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		event->attr.sample_freq = value;
+	} else {
+		event->attr.sample_period = value;
+		event->hw.sample_period = value;
+	}
+unlock:
+	spin_unlock_irq(&ctx->lock);
+
+	return ret;
+}
+
+int perf_event_set_output(struct perf_event *event, int output_fd);
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct perf_event *event = file->private_data;
+	void (*func)(struct perf_event *);
+	u32 flags = arg;
+
+	switch (cmd) {
+	case PERF_EVENT_IOC_ENABLE:
+		func = perf_event_enable;
+		break;
+	case PERF_EVENT_IOC_DISABLE:
+		func = perf_event_disable;
+		break;
+	case PERF_EVENT_IOC_RESET:
+		func = perf_event_reset;
+		break;
+
+	case PERF_EVENT_IOC_REFRESH:
+		return perf_event_refresh(event, arg);
+
+	case PERF_EVENT_IOC_PERIOD:
+		return perf_event_period(event, (u64 __user *)arg);
+
+	case PERF_EVENT_IOC_SET_OUTPUT:
+		return perf_event_set_output(event, arg);
+
+	default:
+		return -ENOTTY;
+	}
+
+	if (flags & PERF_IOC_FLAG_GROUP)
+		perf_event_for_each(event, func);
+	else
+		perf_event_for_each_child(event, func);
+
+	return 0;
+}
+
+int perf_event_task_enable(void)
+{
+	struct perf_event *event;
+
+	mutex_lock(&current->perf_event_mutex);
+	list_for_each_entry(event, &current->perf_event_list, owner_entry)
+		perf_event_for_each_child(event, perf_event_enable);
+	mutex_unlock(&current->perf_event_mutex);
+
+	return 0;
+}
+
+int perf_event_task_disable(void)
+{
+	struct perf_event *event;
+
+	mutex_lock(&current->perf_event_mutex);
+	list_for_each_entry(event, &current->perf_event_list, owner_entry)
+		perf_event_for_each_child(event, perf_event_disable);
+	mutex_unlock(&current->perf_event_mutex);
+
+	return 0;
+}
+
+#ifndef PERF_EVENT_INDEX_OFFSET
+# define PERF_EVENT_INDEX_OFFSET 0
+#endif
+
+static int perf_event_index(struct perf_event *event)
+{
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return 0;
+
+	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_event_update_userpage(struct perf_event *event)
+{
+	struct perf_event_mmap_page *userpg;
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(event->data);
+	if (!data)
+		goto unlock;
+
+	userpg = data->user_page;
+
+	/*
+	 * Disable preemption so as to not let the corresponding user-space
+	 * spin too long if we get preempted.
+	 */
+	preempt_disable();
+	++userpg->lock;
+	barrier();
+	userpg->index = perf_event_index(event);
+	userpg->offset = atomic64_read(&event->count);
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		userpg->offset -= atomic64_read(&event->hw.prev_count);
+
+	userpg->time_enabled = event->total_time_enabled +
+			atomic64_read(&event->child_total_time_enabled);
+
+	userpg->time_running = event->total_time_running +
+			atomic64_read(&event->child_total_time_running);
+
+	barrier();
+	++userpg->lock;
+	preempt_enable();
+unlock:
+	rcu_read_unlock();
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct perf_event *event = vma->vm_file->private_data;
+	struct perf_mmap_data *data;
+	int ret = VM_FAULT_SIGBUS;
+
+	if (vmf->flags & FAULT_FLAG_MKWRITE) {
+		if (vmf->pgoff == 0)
+			ret = 0;
+		return ret;
+	}
+
+	rcu_read_lock();
+	data = rcu_dereference(event->data);
+	if (!data)
+		goto unlock;
+
+	if (vmf->pgoff == 0) {
+		vmf->page = virt_to_page(data->user_page);
+	} else {
+		int nr = vmf->pgoff - 1;
+
+		if ((unsigned)nr > data->nr_pages)
+			goto unlock;
+
+		if (vmf->flags & FAULT_FLAG_WRITE)
+			goto unlock;
+
+		vmf->page = virt_to_page(data->data_pages[nr]);
+	}
+
+	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index   = vmf->pgoff;
+
+	ret = 0;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+{
+	struct perf_mmap_data *data;
+	unsigned long size;
+	int i;
+
+	WARN_ON(atomic_read(&event->mmap_count));
+
+	size = sizeof(struct perf_mmap_data);
+	size += nr_pages * sizeof(void *);
+
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto fail;
+
+	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!data->user_page)
+		goto fail_user_page;
+
+	for (i = 0; i < nr_pages; i++) {
+		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!data->data_pages[i])
+			goto fail_data_pages;
+	}
+
+	data->nr_pages = nr_pages;
+	atomic_set(&data->lock, -1);
+
+	if (event->attr.watermark) {
+		data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+				      event->attr.wakeup_watermark);
+	}
+	if (!data->watermark)
+		data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
+	rcu_assign_pointer(event->data, data);
+
+	return 0;
+
+fail_data_pages:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)data->data_pages[i]);
+
+	free_page((unsigned long)data->user_page);
+
+fail_user_page:
+	kfree(data);
+
+fail:
+	return -ENOMEM;
+}
+
+static void perf_mmap_free_page(unsigned long addr)
+{
+	struct page *page = virt_to_page((void *)addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+	struct perf_mmap_data *data;
+	int i;
+
+	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+
+	perf_mmap_free_page((unsigned long)data->user_page);
+	for (i = 0; i < data->nr_pages; i++)
+		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
+	kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_event *event)
+{
+	struct perf_mmap_data *data = event->data;
+
+	WARN_ON(atomic_read(&event->mmap_count));
+
+	rcu_assign_pointer(event->data, NULL);
+	call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+	struct perf_event *event = vma->vm_file->private_data;
+
+	atomic_inc(&event->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+	struct perf_event *event = vma->vm_file->private_data;
+
+	WARN_ON_ONCE(event->ctx->parent_ctx);
+	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
+		struct user_struct *user = current_user();
+
+		atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
+		vma->vm_mm->locked_vm -= event->data->nr_locked;
+		perf_mmap_data_free(event);
+		mutex_unlock(&event->mmap_mutex);
+	}
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+	.open		= perf_mmap_open,
+	.close		= perf_mmap_close,
+	.fault		= perf_mmap_fault,
+	.page_mkwrite	= perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct perf_event *event = file->private_data;
+	unsigned long user_locked, user_lock_limit;
+	struct user_struct *user = current_user();
+	unsigned long locked, lock_limit;
+	unsigned long vma_size;
+	unsigned long nr_pages;
+	long user_extra, extra;
+	int ret = 0;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	vma_size = vma->vm_end - vma->vm_start;
+	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	/*
+	 * If we have data pages ensure they're a power-of-two number, so we
+	 * can do bitmasks instead of modulo.
+	 */
+	if (nr_pages != 0 && !is_power_of_2(nr_pages))
+		return -EINVAL;
+
+	if (vma_size != PAGE_SIZE * (1 + nr_pages))
+		return -EINVAL;
+
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	WARN_ON_ONCE(event->ctx->parent_ctx);
+	mutex_lock(&event->mmap_mutex);
+	if (event->output) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	if (atomic_inc_not_zero(&event->mmap_count)) {
+		if (nr_pages != event->data->nr_pages)
+			ret = -EINVAL;
+		goto unlock;
+	}
+
+	user_extra = nr_pages + 1;
+	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	user_lock_limit *= num_online_cpus();
+
+	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+	extra = 0;
+	if (user_locked > user_lock_limit)
+		extra = user_locked - user_lock_limit;
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+	locked = vma->vm_mm->locked_vm + extra;
+
+	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+		!capable(CAP_IPC_LOCK)) {
+		ret = -EPERM;
+		goto unlock;
+	}
+
+	WARN_ON(event->data);
+	ret = perf_mmap_data_alloc(event, nr_pages);
+	if (ret)
+		goto unlock;
+
+	atomic_set(&event->mmap_count, 1);
+	atomic_long_add(user_extra, &user->locked_vm);
+	vma->vm_mm->locked_vm += extra;
+	event->data->nr_locked = extra;
+	if (vma->vm_flags & VM_WRITE)
+		event->data->writable = 1;
+
+unlock:
+	mutex_unlock(&event->mmap_mutex);
+
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &perf_mmap_vmops;
+
+	return ret;
+}
+
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct perf_event *event = filp->private_data;
+	int retval;
+
+	mutex_lock(&inode->i_mutex);
+	retval = fasync_helper(fd, filp, on, &event->fasync);
+	mutex_unlock(&inode->i_mutex);
+
+	if (retval < 0)
+		return retval;
+
+	return 0;
+}
+
+static const struct file_operations perf_fops = {
+	.release		= perf_release,
+	.read			= perf_read,
+	.poll			= perf_poll,
+	.unlocked_ioctl		= perf_ioctl,
+	.compat_ioctl		= perf_ioctl,
+	.mmap			= perf_mmap,
+	.fasync			= perf_fasync,
+};
+
+/*
+ * Perf event wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_event_wakeup(struct perf_event *event)
+{
+	wake_up_all(&event->waitq);
+
+	if (event->pending_kill) {
+		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+		event->pending_kill = 0;
+	}
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+static void perf_pending_event(struct perf_pending_entry *entry)
+{
+	struct perf_event *event = container_of(entry,
+			struct perf_event, pending);
+
+	if (event->pending_disable) {
+		event->pending_disable = 0;
+		__perf_event_disable(event);
+	}
+
+	if (event->pending_wakeup) {
+		event->pending_wakeup = 0;
+		perf_event_wakeup(event);
+	}
+}
+
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
+	PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_pending_entry *entry,
+			       void (*func)(struct perf_pending_entry *))
+{
+	struct perf_pending_entry **head;
+
+	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
+		return;
+
+	entry->func = func;
+
+	head = &get_cpu_var(perf_pending_head);
+
+	do {
+		entry->next = *head;
+	} while (cmpxchg(head, entry->next, entry) != entry->next);
+
+	set_perf_event_pending();
+
+	put_cpu_var(perf_pending_head);
+}
+
+static int __perf_pending_run(void)
+{
+	struct perf_pending_entry *list;
+	int nr = 0;
+
+	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
+	while (list != PENDING_TAIL) {
+		void (*func)(struct perf_pending_entry *);
+		struct perf_pending_entry *entry = list;
+
+		list = list->next;
+
+		func = entry->func;
+		entry->next = NULL;
+		/*
+		 * Ensure we observe the unqueue before we issue the wakeup,
+		 * so that we won't be waiting forever.
+		 * -- see perf_not_pending().
+		 */
+		smp_wmb();
+
+		func(entry);
+		nr++;
+	}
+
+	return nr;
+}
+
+static inline int perf_not_pending(struct perf_event *event)
+{
+	/*
+	 * If we flush on whatever cpu we run, there is a chance we don't
+	 * need to wait.
+	 */
+	get_cpu();
+	__perf_pending_run();
+	put_cpu();
+
+	/*
+	 * Ensure we see the proper queue state before going to sleep
+	 * so that we do not miss the wakeup. -- see perf_pending_handle()
+	 */
+	smp_rmb();
+	return event->pending.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_event *event)
+{
+	wait_event(event->waitq, perf_not_pending(event));
+}
+
+void perf_event_do_pending(void)
+{
+	__perf_pending_run();
+}
+
+/*
+ * Callchain support -- arch specific
+ */
+
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	return NULL;
+}
+
+/*
+ * Output
+ */
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+			      unsigned long offset, unsigned long head)
+{
+	unsigned long mask;
+
+	if (!data->writable)
+		return true;
+
+	mask = (data->nr_pages << PAGE_SHIFT) - 1;
+
+	offset = (offset - tail) & mask;
+	head   = (head   - tail) & mask;
+
+	if ((int)(head - offset) < 0)
+		return false;
+
+	return true;
+}
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+	atomic_set(&handle->data->poll, POLL_IN);
+
+	if (handle->nmi) {
+		handle->event->pending_wakeup = 1;
+		perf_pending_queue(&handle->event->pending,
+				   perf_pending_event);
+	} else
+		perf_event_wakeup(handle->event);
+}
+
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event_id doesn't publish a head when a former
+ * event_id isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event_id completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int cpu;
+
+	handle->locked = 0;
+
+	local_irq_save(handle->flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi() && atomic_read(&data->lock) == cpu)
+		return;
+
+	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+		cpu_relax();
+
+	handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	unsigned long head;
+	int cpu;
+
+	data->done_head = data->head;
+
+	if (!handle->locked)
+		goto out;
+
+again:
+	/*
+	 * The xchg implies a full barrier that ensures all writes are done
+	 * before we publish the new head, matched by a rmb() in userspace when
+	 * reading this position.
+	 */
+	while ((head = atomic_long_xchg(&data->done_head, 0)))
+		data->user_page->data_head = head;
+
+	/*
+	 * NMI can happen here, which means we can miss a done_head update.
+	 */
+
+	cpu = atomic_xchg(&data->lock, -1);
+	WARN_ON_ONCE(cpu != smp_processor_id());
+
+	/*
+	 * Therefore we have to validate we did not indeed do so.
+	 */
+	if (unlikely(atomic_long_read(&data->done_head))) {
+		/*
+		 * Since we had it locked, we can lock it again.
+		 */
+		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+			cpu_relax();
+
+		goto again;
+	}
+
+	if (atomic_xchg(&data->wakeup, 0))
+		perf_output_wakeup(handle);
+out:
+	local_irq_restore(handle->flags);
+}
+
+void perf_output_copy(struct perf_output_handle *handle,
+		      const void *buf, unsigned int len)
+{
+	unsigned int pages_mask;
+	unsigned int offset;
+	unsigned int size;
+	void **pages;
+
+	offset		= handle->offset;
+	pages_mask	= handle->data->nr_pages - 1;
+	pages		= handle->data->data_pages;
+
+	do {
+		unsigned int page_offset;
+		int nr;
+
+		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
+		page_offset = offset & (PAGE_SIZE - 1);
+		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+		memcpy(pages[nr] + page_offset, buf, size);
+
+		len	    -= size;
+		buf	    += size;
+		offset	    += size;
+	} while (len);
+
+	handle->offset = offset;
+
+	/*
+	 * Check we didn't copy past our reservation window, taking the
+	 * possible unsigned int wrap into account.
+	 */
+	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event, unsigned int size,
+		      int nmi, int sample)
+{
+	struct perf_event *output_event;
+	struct perf_mmap_data *data;
+	unsigned long tail, offset, head;
+	int have_lost;
+	struct {
+		struct perf_event_header header;
+		u64			 id;
+		u64			 lost;
+	} lost_event;
+
+	rcu_read_lock();
+	/*
+	 * For inherited events we send all the output towards the parent.
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	output_event = rcu_dereference(event->output);
+	if (output_event)
+		event = output_event;
+
+	data = rcu_dereference(event->data);
+	if (!data)
+		goto out;
+
+	handle->data	= data;
+	handle->event	= event;
+	handle->nmi	= nmi;
+	handle->sample	= sample;
+
+	if (!data->nr_pages)
+		goto fail;
+
+	have_lost = atomic_read(&data->lost);
+	if (have_lost)
+		size += sizeof(lost_event);
+
+	perf_output_lock(handle);
+
+	do {
+		/*
+		 * Userspace could choose to issue a mb() before updating the
+		 * tail pointer. So that all reads will be completed before the
+		 * write is issued.
+		 */
+		tail = ACCESS_ONCE(data->user_page->data_tail);
+		smp_rmb();
+		offset = head = atomic_long_read(&data->head);
+		head += size;
+		if (unlikely(!perf_output_space(data, tail, offset, head)))
+			goto fail;
+	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+	handle->offset	= offset;
+	handle->head	= head;
+
+	if (head - tail > data->watermark)
+		atomic_set(&data->wakeup, 1);
+
+	if (have_lost) {
+		lost_event.header.type = PERF_RECORD_LOST;
+		lost_event.header.misc = 0;
+		lost_event.header.size = sizeof(lost_event);
+		lost_event.id          = event->id;
+		lost_event.lost        = atomic_xchg(&data->lost, 0);
+
+		perf_output_put(handle, lost_event);
+	}
+
+	return 0;
+
+fail:
+	atomic_inc(&data->lost);
+	perf_output_unlock(handle);
+out:
+	rcu_read_unlock();
+
+	return -ENOSPC;
+}
+
+void perf_output_end(struct perf_output_handle *handle)
+{
+	struct perf_event *event = handle->event;
+	struct perf_mmap_data *data = handle->data;
+
+	int wakeup_events = event->attr.wakeup_events;
+
+	if (handle->sample && wakeup_events) {
+		int events = atomic_inc_return(&data->events);
+		if (events >= wakeup_events) {
+			atomic_sub(wakeup_events, &data->events);
+			atomic_set(&data->wakeup, 1);
+		}
+	}
+
+	perf_output_unlock(handle);
+	rcu_read_unlock();
+}
+
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+	/*
+	 * only top level events have the pid namespace they were created in
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+	/*
+	 * only top level events have the pid namespace they were created in
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	return task_pid_nr_ns(p, event->ns);
+}
+
+static void perf_output_read_one(struct perf_output_handle *handle,
+				 struct perf_event *event)
+{
+	u64 read_format = event->attr.read_format;
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = atomic64_read(&event->count);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = event->total_time_enabled +
+			atomic64_read(&event->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = event->total_time_running +
+			atomic64_read(&event->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(event);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+			    struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader, *sub;
+	u64 read_format = event->attr.read_format;
+	u64 values[5];
+	int n = 0;
+
+	values[n++] = 1 + leader->nr_siblings;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = leader->total_time_enabled;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = leader->total_time_running;
+
+	if (leader != event)
+		leader->pmu->read(leader);
+
+	values[n++] = atomic64_read(&leader->count);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(leader);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+
+	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+		n = 0;
+
+		if (sub != event)
+			sub->pmu->read(sub);
+
+		values[n++] = atomic64_read(&sub->count);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_event_id(sub);
+
+		perf_output_copy(handle, values, n * sizeof(u64));
+	}
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+			     struct perf_event *event)
+{
+	if (event->attr.read_format & PERF_FORMAT_GROUP)
+		perf_output_read_group(handle, event);
+	else
+		perf_output_read_one(handle, event);
+}
+
+void perf_output_sample(struct perf_output_handle *handle,
+			struct perf_event_header *header,
+			struct perf_sample_data *data,
+			struct perf_event *event)
+{
+	u64 sample_type = data->type;
+
+	perf_output_put(handle, *header);
+
+	if (sample_type & PERF_SAMPLE_IP)
+		perf_output_put(handle, data->ip);
+
+	if (sample_type & PERF_SAMPLE_TID)
+		perf_output_put(handle, data->tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		perf_output_put(handle, data->time);
+
+	if (sample_type & PERF_SAMPLE_ADDR)
+		perf_output_put(handle, data->addr);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(handle, data->id);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		perf_output_put(handle, data->stream_id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		perf_output_put(handle, data->cpu_entry);
+
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		perf_output_put(handle, data->period);
+
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(handle, event);
+
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+		if (data->callchain) {
+			int size = 1;
+
+			if (data->callchain)
+				size += data->callchain->nr;
+
+			size *= sizeof(u64);
+
+			perf_output_copy(handle, data->callchain, size);
+		} else {
+			u64 nr = 0;
+			perf_output_put(handle, nr);
+		}
+	}
+
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(handle, data->raw->size);
+			perf_output_copy(handle, data->raw->data,
+					 data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(handle, raw);
+		}
+	}
+}
+
+void perf_prepare_sample(struct perf_event_header *header,
+			 struct perf_sample_data *data,
+			 struct perf_event *event,
+			 struct pt_regs *regs)
+{
+	u64 sample_type = event->attr.sample_type;
+
+	data->type = sample_type;
+
+	header->type = PERF_RECORD_SAMPLE;
+	header->size = sizeof(*header);
+
+	header->misc = 0;
+	header->misc |= perf_misc_flags(regs);
+
+	if (sample_type & PERF_SAMPLE_IP) {
+		data->ip = perf_instruction_pointer(regs);
+
+		header->size += sizeof(data->ip);
+	}
+
+	if (sample_type & PERF_SAMPLE_TID) {
+		/* namespace issues */
+		data->tid_entry.pid = perf_event_pid(event, current);
+		data->tid_entry.tid = perf_event_tid(event, current);
+
+		header->size += sizeof(data->tid_entry);
+	}
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		data->time = perf_clock();
+
+		header->size += sizeof(data->time);
+	}
+
+	if (sample_type & PERF_SAMPLE_ADDR)
+		header->size += sizeof(data->addr);
+
+	if (sample_type & PERF_SAMPLE_ID) {
+		data->id = primary_event_id(event);
+
+		header->size += sizeof(data->id);
+	}
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID) {
+		data->stream_id = event->id;
+
+		header->size += sizeof(data->stream_id);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		data->cpu_entry.cpu		= raw_smp_processor_id();
+		data->cpu_entry.reserved	= 0;
+
+		header->size += sizeof(data->cpu_entry);
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		header->size += sizeof(data->period);
+
+	if (sample_type & PERF_SAMPLE_READ)
+		header->size += perf_event_read_size(event);
+
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+		int size = 1;
+
+		data->callchain = perf_callchain(regs);
+
+		if (data->callchain)
+			size += data->callchain->nr;
+
+		header->size += size * sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_RAW) {
+		int size = sizeof(u32);
+
+		if (data->raw)
+			size += data->raw->size;
+		else
+			size += sizeof(u32);
+
+		WARN_ON_ONCE(size & (sizeof(u64)-1));
+		header->size += size;
+	}
+}
+
+static void perf_event_output(struct perf_event *event, int nmi,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+
+	perf_prepare_sample(&header, data, event, regs);
+
+	if (perf_output_begin(&handle, event, header.size, nmi, 1))
+		return;
+
+	perf_output_sample(&handle, &header, data, event);
+
+	perf_output_end(&handle);
+}
+
+/*
+ * read event_id
+ */
+
+struct perf_read_event {
+	struct perf_event_header	header;
+
+	u32				pid;
+	u32				tid;
+};
+
+static void
+perf_event_read_event(struct perf_event *event,
+			struct task_struct *task)
+{
+	struct perf_output_handle handle;
+	struct perf_read_event read_event = {
+		.header = {
+			.type = PERF_RECORD_READ,
+			.misc = 0,
+			.size = sizeof(read_event) + perf_event_read_size(event),
+		},
+		.pid = perf_event_pid(event, task),
+		.tid = perf_event_tid(event, task),
+	};
+	int ret;
+
+	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, read_event);
+	perf_output_read(&handle, event);
+
+	perf_output_end(&handle);
+}
+
+/*
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
+ */
+
+struct perf_task_event {
+	struct task_struct		*task;
+	struct perf_event_context	*task_ctx;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				ppid;
+		u32				tid;
+		u32				ptid;
+		u64				time;
+	} event_id;
+};
+
+static void perf_event_task_output(struct perf_event *event,
+				     struct perf_task_event *task_event)
+{
+	struct perf_output_handle handle;
+	int size;
+	struct task_struct *task = task_event->task;
+	int ret;
+
+	size  = task_event->event_id.header.size;
+	ret = perf_output_begin(&handle, event, size, 0, 0);
+
+	if (ret)
+		return;
+
+	task_event->event_id.pid = perf_event_pid(event, task);
+	task_event->event_id.ppid = perf_event_pid(event, current);
+
+	task_event->event_id.tid = perf_event_tid(event, task);
+	task_event->event_id.ptid = perf_event_tid(event, current);
+
+	task_event->event_id.time = perf_clock();
+
+	perf_output_put(&handle, task_event->event_id);
+
+	perf_output_end(&handle);
+}
+
+static int perf_event_task_match(struct perf_event *event)
+{
+	if (event->attr.comm || event->attr.mmap || event->attr.task)
+		return 1;
+
+	return 0;
+}
+
+static void perf_event_task_ctx(struct perf_event_context *ctx,
+				  struct perf_task_event *task_event)
+{
+	struct perf_event *event;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+		if (perf_event_task_match(event))
+			perf_event_task_output(event, task_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_event_task_event(struct perf_task_event *task_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx = task_event->task_ctx;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_event_task_ctx(&cpuctx->ctx, task_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	if (!ctx)
+		ctx = rcu_dereference(task_event->task->perf_event_ctxp);
+	if (ctx)
+		perf_event_task_ctx(ctx, task_event);
+	rcu_read_unlock();
+}
+
+static void perf_event_task(struct task_struct *task,
+			      struct perf_event_context *task_ctx,
+			      int new)
+{
+	struct perf_task_event task_event;
+
+	if (!atomic_read(&nr_comm_events) &&
+	    !atomic_read(&nr_mmap_events) &&
+	    !atomic_read(&nr_task_events))
+		return;
+
+	task_event = (struct perf_task_event){
+		.task	  = task,
+		.task_ctx = task_ctx,
+		.event_id    = {
+			.header = {
+				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
+				.misc = 0,
+				.size = sizeof(task_event.event_id),
+			},
+			/* .pid  */
+			/* .ppid */
+			/* .tid  */
+			/* .ptid */
+		},
+	};
+
+	perf_event_task_event(&task_event);
+}
+
+void perf_event_fork(struct task_struct *task)
+{
+	perf_event_task(task, NULL, 1);
+}
+
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+	struct task_struct	*task;
+	char			*comm;
+	int			comm_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+	} event_id;
+};
+
+static void perf_event_comm_output(struct perf_event *event,
+				     struct perf_comm_event *comm_event)
+{
+	struct perf_output_handle handle;
+	int size = comm_event->event_id.header.size;
+	int ret = perf_output_begin(&handle, event, size, 0, 0);
+
+	if (ret)
+		return;
+
+	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
+	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
+
+	perf_output_put(&handle, comm_event->event_id);
+	perf_output_copy(&handle, comm_event->comm,
+				   comm_event->comm_size);
+	perf_output_end(&handle);
+}
+
+static int perf_event_comm_match(struct perf_event *event)
+{
+	if (event->attr.comm)
+		return 1;
+
+	return 0;
+}
+
+static void perf_event_comm_ctx(struct perf_event_context *ctx,
+				  struct perf_comm_event *comm_event)
+{
+	struct perf_event *event;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+		if (perf_event_comm_match(event))
+			perf_event_comm_output(event, comm_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_event_comm_event(struct perf_comm_event *comm_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	unsigned int size;
+	char comm[TASK_COMM_LEN];
+
+	memset(comm, 0, sizeof(comm));
+	strncpy(comm, comm_event->task->comm, sizeof(comm));
+	size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+	comm_event->comm = comm;
+	comm_event->comm_size = size;
+
+	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_event_ctxp);
+	if (ctx)
+		perf_event_comm_ctx(ctx, comm_event);
+	rcu_read_unlock();
+}
+
+void perf_event_comm(struct task_struct *task)
+{
+	struct perf_comm_event comm_event;
+
+	if (task->perf_event_ctxp)
+		perf_event_enable_on_exec(task);
+
+	if (!atomic_read(&nr_comm_events))
+		return;
+
+	comm_event = (struct perf_comm_event){
+		.task	= task,
+		/* .comm      */
+		/* .comm_size */
+		.event_id  = {
+			.header = {
+				.type = PERF_RECORD_COMM,
+				.misc = 0,
+				/* .size */
+			},
+			/* .pid */
+			/* .tid */
+		},
+	};
+
+	perf_event_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+	struct vm_area_struct	*vma;
+
+	const char		*file_name;
+	int			file_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+		u64				start;
+		u64				len;
+		u64				pgoff;
+	} event_id;
+};
+
+static void perf_event_mmap_output(struct perf_event *event,
+				     struct perf_mmap_event *mmap_event)
+{
+	struct perf_output_handle handle;
+	int size = mmap_event->event_id.header.size;
+	int ret = perf_output_begin(&handle, event, size, 0, 0);
+
+	if (ret)
+		return;
+
+	mmap_event->event_id.pid = perf_event_pid(event, current);
+	mmap_event->event_id.tid = perf_event_tid(event, current);
+
+	perf_output_put(&handle, mmap_event->event_id);
+	perf_output_copy(&handle, mmap_event->file_name,
+				   mmap_event->file_size);
+	perf_output_end(&handle);
+}
+
+static int perf_event_mmap_match(struct perf_event *event,
+				   struct perf_mmap_event *mmap_event)
+{
+	if (event->attr.mmap)
+		return 1;
+
+	return 0;
+}
+
+static void perf_event_mmap_ctx(struct perf_event_context *ctx,
+				  struct perf_mmap_event *mmap_event)
+{
+	struct perf_event *event;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+		if (perf_event_mmap_match(event, mmap_event))
+			perf_event_mmap_output(event, mmap_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	struct vm_area_struct *vma = mmap_event->vma;
+	struct file *file = vma->vm_file;
+	unsigned int size;
+	char tmp[16];
+	char *buf = NULL;
+	const char *name;
+
+	memset(tmp, 0, sizeof(tmp));
+
+	if (file) {
+		/*
+		 * d_path works from the end of the buffer backwards, so we
+		 * need to add enough zero bytes after the string to handle
+		 * the 64bit alignment we do later.
+		 */
+		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
+		if (!buf) {
+			name = strncpy(tmp, "//enomem", sizeof(tmp));
+			goto got_name;
+		}
+		name = d_path(&file->f_path, buf, PATH_MAX);
+		if (IS_ERR(name)) {
+			name = strncpy(tmp, "//toolong", sizeof(tmp));
+			goto got_name;
+		}
+	} else {
+		if (arch_vma_name(mmap_event->vma)) {
+			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+				       sizeof(tmp));
+			goto got_name;
+		}
+
+		if (!vma->vm_mm) {
+			name = strncpy(tmp, "[vdso]", sizeof(tmp));
+			goto got_name;
+		}
+
+		name = strncpy(tmp, "//anon", sizeof(tmp));
+		goto got_name;
+	}
+
+got_name:
+	size = ALIGN(strlen(name)+1, sizeof(u64));
+
+	mmap_event->file_name = name;
+	mmap_event->file_size = size;
+
+	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_event_ctxp);
+	if (ctx)
+		perf_event_mmap_ctx(ctx, mmap_event);
+	rcu_read_unlock();
+
+	kfree(buf);
+}
+
+void __perf_event_mmap(struct vm_area_struct *vma)
+{
+	struct perf_mmap_event mmap_event;
+
+	if (!atomic_read(&nr_mmap_events))
+		return;
+
+	mmap_event = (struct perf_mmap_event){
+		.vma	= vma,
+		/* .file_name */
+		/* .file_size */
+		.event_id  = {
+			.header = {
+				.type = PERF_RECORD_MMAP,
+				.misc = 0,
+				/* .size */
+			},
+			/* .pid */
+			/* .tid */
+			.start  = vma->vm_start,
+			.len    = vma->vm_end - vma->vm_start,
+			.pgoff  = vma->vm_pgoff,
+		},
+	};
+
+	perf_event_mmap_event(&mmap_event);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_event *event, int enable)
+{
+	struct perf_output_handle handle;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				time;
+		u64				id;
+		u64				stream_id;
+	} throttle_event = {
+		.header = {
+			.type = PERF_RECORD_THROTTLE,
+			.misc = 0,
+			.size = sizeof(throttle_event),
+		},
+		.time		= perf_clock(),
+		.id		= primary_event_id(event),
+		.stream_id	= event->id,
+	};
+
+	if (enable)
+		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
+
+	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, throttle_event);
+	perf_output_end(&handle);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event, int nmi,
+				   int throttle, struct perf_sample_data *data,
+				   struct pt_regs *regs)
+{
+	int events = atomic_read(&event->event_limit);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = 0;
+
+	throttle = (throttle && event->pmu->unthrottle != NULL);
+
+	if (!throttle) {
+		hwc->interrupts++;
+	} else {
+		if (hwc->interrupts != MAX_INTERRUPTS) {
+			hwc->interrupts++;
+			if (HZ * hwc->interrupts >
+					(u64)sysctl_perf_event_sample_rate) {
+				hwc->interrupts = MAX_INTERRUPTS;
+				perf_log_throttle(event, 0);
+				ret = 1;
+			}
+		} else {
+			/*
+			 * Keep re-disabling events even though on the previous
+			 * pass we disabled it - just in case we raced with a
+			 * sched-in and the event got enabled again:
+			 */
+			ret = 1;
+		}
+	}
+
+	if (event->attr.freq) {
+		u64 now = perf_clock();
+		s64 delta = now - hwc->freq_stamp;
+
+		hwc->freq_stamp = now;
+
+		if (delta > 0 && delta < TICK_NSEC)
+			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+	}
+
+	/*
+	 * XXX event_limit might not quite work as expected on inherited
+	 * events
+	 */
+
+	event->pending_kill = POLL_IN;
+	if (events && atomic_dec_and_test(&event->event_limit)) {
+		ret = 1;
+		event->pending_kill = POLL_HUP;
+		if (nmi) {
+			event->pending_disable = 1;
+			perf_pending_queue(&event->pending,
+					   perf_pending_event);
+		} else
+			perf_event_disable(event);
+	}
+
+	perf_event_output(event, nmi, data, regs);
+	return ret;
+}
+
+int perf_event_overflow(struct perf_event *event, int nmi,
+			  struct perf_sample_data *data,
+			  struct pt_regs *regs)
+{
+	return __perf_event_overflow(event, nmi, 1, data, regs);
+}
+
+/*
+ * Generic software event infrastructure
+ */
+
+/*
+ * We directly increment event->count and keep a second value in
+ * event->hw.period_left to count intervals. This period event
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swevent_set_period(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 period = hwc->last_period;
+	u64 nr, offset;
+	s64 old, val;
+
+	hwc->last_period = hwc->sample_period;
+
+again:
+	old = val = atomic64_read(&hwc->period_left);
+	if (val < 0)
+		return 0;
+
+	nr = div64_u64(period + val, period);
+	offset = nr * period;
+	val -= offset;
+	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+		goto again;
+
+	return nr;
+}
+
+static void perf_swevent_overflow(struct perf_event *event,
+				    int nmi, struct perf_sample_data *data,
+				    struct pt_regs *regs)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int throttle = 0;
+	u64 overflow;
+
+	data->period = event->hw.last_period;
+	overflow = perf_swevent_set_period(event);
+
+	if (hwc->interrupts == MAX_INTERRUPTS)
+		return;
+
+	for (; overflow; overflow--) {
+		if (__perf_event_overflow(event, nmi, throttle,
+					    data, regs)) {
+			/*
+			 * We inhibit the overflow from happening when
+			 * hwc->interrupts == MAX_INTERRUPTS.
+			 */
+			break;
+		}
+		throttle = 1;
+	}
+}
+
+static void perf_swevent_unthrottle(struct perf_event *event)
+{
+	/*
+	 * Nothing to do, we already reset hwc->interrupts.
+	 */
+}
+
+static void perf_swevent_add(struct perf_event *event, u64 nr,
+			       int nmi, struct perf_sample_data *data,
+			       struct pt_regs *regs)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	atomic64_add(nr, &event->count);
+
+	if (!hwc->sample_period)
+		return;
+
+	if (!regs)
+		return;
+
+	if (!atomic64_add_negative(nr, &hwc->period_left))
+		perf_swevent_overflow(event, nmi, data, regs);
+}
+
+static int perf_swevent_is_counting(struct perf_event *event)
+{
+	/*
+	 * The event is active, we're good!
+	 */
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		return 1;
+
+	/*
+	 * The event is off/error, not counting.
+	 */
+	if (event->state != PERF_EVENT_STATE_INACTIVE)
+		return 0;
+
+	/*
+	 * The event is inactive, if the context is active
+	 * we're part of a group that didn't make it on the 'pmu',
+	 * not counting.
+	 */
+	if (event->ctx->is_active)
+		return 0;
+
+	/*
+	 * We're inactive and the context is too, this means the
+	 * task is scheduled out, we're counting events that happen
+	 * to us, like migration events.
+	 */
+	return 1;
+}
+
+static int perf_swevent_match(struct perf_event *event,
+				enum perf_type_id type,
+				u32 event_id, struct pt_regs *regs)
+{
+	if (!perf_swevent_is_counting(event))
+		return 0;
+
+	if (event->attr.type != type)
+		return 0;
+	if (event->attr.config != event_id)
+		return 0;
+
+	if (regs) {
+		if (event->attr.exclude_user && user_mode(regs))
+			return 0;
+
+		if (event->attr.exclude_kernel && !user_mode(regs))
+			return 0;
+	}
+
+	return 1;
+}
+
+static void perf_swevent_ctx_event(struct perf_event_context *ctx,
+				     enum perf_type_id type,
+				     u32 event_id, u64 nr, int nmi,
+				     struct perf_sample_data *data,
+				     struct pt_regs *regs)
+{
+	struct perf_event *event;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+		if (perf_swevent_match(event, type, event_id, regs))
+			perf_swevent_add(event, nr, nmi, data, regs);
+	}
+	rcu_read_unlock();
+}
+
+static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+{
+	if (in_nmi())
+		return &cpuctx->recursion[3];
+
+	if (in_irq())
+		return &cpuctx->recursion[2];
+
+	if (in_softirq())
+		return &cpuctx->recursion[1];
+
+	return &cpuctx->recursion[0];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
+{
+	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	int *recursion = perf_swevent_recursion_context(cpuctx);
+	struct perf_event_context *ctx;
+
+	if (*recursion)
+		goto out;
+
+	(*recursion)++;
+	barrier();
+
+	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
+				 nr, nmi, data, regs);
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_event_ctxp);
+	if (ctx)
+		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
+	rcu_read_unlock();
+
+	barrier();
+	(*recursion)--;
+
+out:
+	put_cpu_var(perf_cpu_context);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, int nmi,
+			    struct pt_regs *regs, u64 addr)
+{
+	struct perf_sample_data data = {
+		.addr = addr,
+	};
+
+	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
+				&data, regs);
+}
+
+static void perf_swevent_read(struct perf_event *event)
+{
+}
+
+static int perf_swevent_enable(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->sample_period) {
+		hwc->last_period = hwc->sample_period;
+		perf_swevent_set_period(event);
+	}
+	return 0;
+}
+
+static void perf_swevent_disable(struct perf_event *event)
+{
+}
+
+static const struct pmu perf_ops_generic = {
+	.enable		= perf_swevent_enable,
+	.disable	= perf_swevent_disable,
+	.read		= perf_swevent_read,
+	.unthrottle	= perf_swevent_unthrottle,
+};
+
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct pt_regs *regs;
+	struct perf_event *event;
+	u64 period;
+
+	event	= container_of(hrtimer, struct perf_event, hw.hrtimer);
+	event->pmu->read(event);
+
+	data.addr = 0;
+	regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((event->attr.exclude_kernel || !regs) &&
+			!event->attr.exclude_user)
+		regs = task_pt_regs(current);
+
+	if (regs) {
+		if (perf_event_overflow(event, 0, &data, regs))
+			ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, event->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return ret;
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_perf_event_update(struct perf_event *event)
+{
+	int cpu = raw_smp_processor_id();
+	s64 prev;
+	u64 now;
+
+	now = cpu_clock(cpu);
+	prev = atomic64_read(&event->hw.prev_count);
+	atomic64_set(&event->hw.prev_count, now);
+	atomic64_add(now - prev, &event->count);
+}
+
+static int cpu_clock_perf_event_enable(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int cpu = raw_smp_processor_id();
+
+	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+
+	return 0;
+}
+
+static void cpu_clock_perf_event_disable(struct perf_event *event)
+{
+	if (event->hw.sample_period)
+		hrtimer_cancel(&event->hw.hrtimer);
+	cpu_clock_perf_event_update(event);
+}
+
+static void cpu_clock_perf_event_read(struct perf_event *event)
+{
+	cpu_clock_perf_event_update(event);
+}
+
+static const struct pmu perf_ops_cpu_clock = {
+	.enable		= cpu_clock_perf_event_enable,
+	.disable	= cpu_clock_perf_event_disable,
+	.read		= cpu_clock_perf_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_perf_event_update(struct perf_event *event, u64 now)
+{
+	u64 prev;
+	s64 delta;
+
+	prev = atomic64_xchg(&event->hw.prev_count, now);
+	delta = now - prev;
+	atomic64_add(delta, &event->count);
+}
+
+static int task_clock_perf_event_enable(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 now;
+
+	now = event->ctx->time;
+
+	atomic64_set(&hwc->prev_count, now);
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+
+	return 0;
+}
+
+static void task_clock_perf_event_disable(struct perf_event *event)
+{
+	if (event->hw.sample_period)
+		hrtimer_cancel(&event->hw.hrtimer);
+	task_clock_perf_event_update(event, event->ctx->time);
+
+}
+
+static void task_clock_perf_event_read(struct perf_event *event)
+{
+	u64 time;
+
+	if (!in_nmi()) {
+		update_context_time(event->ctx);
+		time = event->ctx->time;
+	} else {
+		u64 now = perf_clock();
+		u64 delta = now - event->ctx->timestamp;
+		time = event->ctx->time + delta;
+	}
+
+	task_clock_perf_event_update(event, time);
+}
+
+static const struct pmu perf_ops_task_clock = {
+	.enable		= task_clock_perf_event_enable,
+	.disable	= task_clock_perf_event_disable,
+	.read		= task_clock_perf_event_read,
+};
+
+#ifdef CONFIG_EVENT_PROFILE
+void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
+			  int entry_size)
+{
+	struct perf_raw_record raw = {
+		.size = entry_size,
+		.data = record,
+	};
+
+	struct perf_sample_data data = {
+		.addr = addr,
+		.raw = &raw,
+	};
+
+	struct pt_regs *regs = get_irq_regs();
+
+	if (!regs)
+		regs = task_pt_regs(current);
+
+	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+				&data, regs);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+	ftrace_profile_disable(event->attr.config);
+}
+
+static const struct pmu *tp_perf_event_init(struct perf_event *event)
+{
+	/*
+	 * Raw tracepoint data is a severe data leak, only allow root to
+	 * have these.
+	 */
+	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
+			perf_paranoid_tracepoint_raw() &&
+			!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (ftrace_profile_enable(event->attr.config))
+		return NULL;
+
+	event->destroy = tp_perf_event_destroy;
+
+	return &perf_ops_generic;
+}
+#else
+static const struct pmu *tp_perf_event_init(struct perf_event *event)
+{
+	return NULL;
+}
+#endif
+
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+	u64 event_id = event->attr.config;
+
+	WARN_ON(event->parent);
+
+	atomic_dec(&perf_swevent_enabled[event_id]);
+}
+
+static const struct pmu *sw_perf_event_init(struct perf_event *event)
+{
+	const struct pmu *pmu = NULL;
+	u64 event_id = event->attr.config;
+
+	/*
+	 * Software events (currently) can't in general distinguish
+	 * between user, kernel and hypervisor events.
+	 * However, context switches and cpu migrations are considered
+	 * to be kernel events, and page faults are never hypervisor
+	 * events.
+	 */
+	switch (event_id) {
+	case PERF_COUNT_SW_CPU_CLOCK:
+		pmu = &perf_ops_cpu_clock;
+
+		break;
+	case PERF_COUNT_SW_TASK_CLOCK:
+		/*
+		 * If the user instantiates this as a per-cpu event,
+		 * use the cpu_clock event instead.
+		 */
+		if (event->ctx->task)
+			pmu = &perf_ops_task_clock;
+		else
+			pmu = &perf_ops_cpu_clock;
+
+		break;
+	case PERF_COUNT_SW_PAGE_FAULTS:
+	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+	case PERF_COUNT_SW_CONTEXT_SWITCHES:
+	case PERF_COUNT_SW_CPU_MIGRATIONS:
+		if (!event->parent) {
+			atomic_inc(&perf_swevent_enabled[event_id]);
+			event->destroy = sw_perf_event_destroy;
+		}
+		pmu = &perf_ops_generic;
+		break;
+	}
+
+	return pmu;
+}
+
+/*
+ * Allocate and initialize a event structure
+ */
+static struct perf_event *
+perf_event_alloc(struct perf_event_attr *attr,
+		   int cpu,
+		   struct perf_event_context *ctx,
+		   struct perf_event *group_leader,
+		   struct perf_event *parent_event,
+		   gfp_t gfpflags)
+{
+	const struct pmu *pmu;
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	long err;
+
+	event = kzalloc(sizeof(*event), gfpflags);
+	if (!event)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Single events are their own group leaders, with an
+	 * empty sibling list:
+	 */
+	if (!group_leader)
+		group_leader = event;
+
+	mutex_init(&event->child_mutex);
+	INIT_LIST_HEAD(&event->child_list);
+
+	INIT_LIST_HEAD(&event->group_entry);
+	INIT_LIST_HEAD(&event->event_entry);
+	INIT_LIST_HEAD(&event->sibling_list);
+	init_waitqueue_head(&event->waitq);
+
+	mutex_init(&event->mmap_mutex);
+
+	event->cpu		= cpu;
+	event->attr		= *attr;
+	event->group_leader	= group_leader;
+	event->pmu		= NULL;
+	event->ctx		= ctx;
+	event->oncpu		= -1;
+
+	event->parent		= parent_event;
+
+	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
+	event->id		= atomic64_inc_return(&perf_event_id);
+
+	event->state		= PERF_EVENT_STATE_INACTIVE;
+
+	if (attr->disabled)
+		event->state = PERF_EVENT_STATE_OFF;
+
+	pmu = NULL;
+
+	hwc = &event->hw;
+	hwc->sample_period = attr->sample_period;
+	if (attr->freq && attr->sample_freq)
+		hwc->sample_period = 1;
+	hwc->last_period = hwc->sample_period;
+
+	atomic64_set(&hwc->period_left, hwc->sample_period);
+
+	/*
+	 * we currently do not support PERF_FORMAT_GROUP on inherited events
+	 */
+	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+		goto done;
+
+	switch (attr->type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		pmu = hw_perf_event_init(event);
+		break;
+
+	case PERF_TYPE_SOFTWARE:
+		pmu = sw_perf_event_init(event);
+		break;
+
+	case PERF_TYPE_TRACEPOINT:
+		pmu = tp_perf_event_init(event);
+		break;
+
+	default:
+		break;
+	}
+done:
+	err = 0;
+	if (!pmu)
+		err = -EINVAL;
+	else if (IS_ERR(pmu))
+		err = PTR_ERR(pmu);
+
+	if (err) {
+		if (event->ns)
+			put_pid_ns(event->ns);
+		kfree(event);
+		return ERR_PTR(err);
+	}
+
+	event->pmu = pmu;
+
+	if (!event->parent) {
+		atomic_inc(&nr_events);
+		if (event->attr.mmap)
+			atomic_inc(&nr_mmap_events);
+		if (event->attr.comm)
+			atomic_inc(&nr_comm_events);
+		if (event->attr.task)
+			atomic_inc(&nr_task_events);
+	}
+
+	return event;
+}
+
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+			  struct perf_event_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = PERF_ATTR_SIZE_VER0;
+
+	if (size < PERF_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we dont know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * If the type exists, the corresponding creation will verify
+	 * the attr->config.
+	 */
+	if (attr->type >= PERF_TYPE_MAX)
+		return -EINVAL;
+
+	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+		return -EINVAL;
+
+	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+		return -EINVAL;
+
+	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+		return -EINVAL;
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
+int perf_event_set_output(struct perf_event *event, int output_fd)
+{
+	struct perf_event *output_event = NULL;
+	struct file *output_file = NULL;
+	struct perf_event *old_output;
+	int fput_needed = 0;
+	int ret = -EINVAL;
+
+	if (!output_fd)
+		goto set;
+
+	output_file = fget_light(output_fd, &fput_needed);
+	if (!output_file)
+		return -EBADF;
+
+	if (output_file->f_op != &perf_fops)
+		goto out;
+
+	output_event = output_file->private_data;
+
+	/* Don't chain output fds */
+	if (output_event->output)
+		goto out;
+
+	/* Don't set an output fd when we already have an output channel */
+	if (event->data)
+		goto out;
+
+	atomic_long_inc(&output_file->f_count);
+
+set:
+	mutex_lock(&event->mmap_mutex);
+	old_output = event->output;
+	rcu_assign_pointer(event->output, output_event);
+	mutex_unlock(&event->mmap_mutex);
+
+	if (old_output) {
+		/*
+		 * we need to make sure no existing perf_output_*()
+		 * is still referencing this event.
+		 */
+		synchronize_rcu();
+		fput(old_output->filp);
+	}
+
+	ret = 0;
+out:
+	fput_light(output_file, fput_needed);
+	return ret;
+}
+
+/**
+ * sys_perf_event_open - open a performance event, associate it to a task/cpu
+ *
+ * @attr_uptr:	event_id type attributes for monitoring/sampling
+ * @pid:		target pid
+ * @cpu:		target cpu
+ * @group_fd:		group leader event fd
+ */
+SYSCALL_DEFINE5(perf_event_open,
+		struct perf_event_attr __user *, attr_uptr,
+		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+	struct perf_event *event, *group_leader;
+	struct perf_event_attr attr;
+	struct perf_event_context *ctx;
+	struct file *event_file = NULL;
+	struct file *group_file = NULL;
+	int fput_needed = 0;
+	int fput_needed2 = 0;
+	int err;
+
+	/* for future expandability... */
+	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+		return -EINVAL;
+
+	err = perf_copy_attr(attr_uptr, &attr);
+	if (err)
+		return err;
+
+	if (!attr.exclude_kernel) {
+		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
+
+	if (attr.freq) {
+		if (attr.sample_freq > sysctl_perf_event_sample_rate)
+			return -EINVAL;
+	}
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/*
+	 * Look up the group leader (we will attach this event to it):
+	 */
+	group_leader = NULL;
+	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+		err = -EINVAL;
+		group_file = fget_light(group_fd, &fput_needed);
+		if (!group_file)
+			goto err_put_context;
+		if (group_file->f_op != &perf_fops)
+			goto err_put_context;
+
+		group_leader = group_file->private_data;
+		/*
+		 * Do not allow a recursive hierarchy (this new sibling
+		 * becoming part of another group-sibling):
+		 */
+		if (group_leader->group_leader != group_leader)
+			goto err_put_context;
+		/*
+		 * Do not allow to attach to a group in a different
+		 * task or CPU context:
+		 */
+		if (group_leader->ctx != ctx)
+			goto err_put_context;
+		/*
+		 * Only a group leader can be exclusive or pinned
+		 */
+		if (attr.exclusive || attr.pinned)
+			goto err_put_context;
+	}
+
+	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
+				     NULL, GFP_KERNEL);
+	err = PTR_ERR(event);
+	if (IS_ERR(event))
+		goto err_put_context;
+
+	err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
+	if (err < 0)
+		goto err_free_put_context;
+
+	event_file = fget_light(err, &fput_needed2);
+	if (!event_file)
+		goto err_free_put_context;
+
+	if (flags & PERF_FLAG_FD_OUTPUT) {
+		err = perf_event_set_output(event, group_fd);
+		if (err)
+			goto err_fput_free_put_context;
+	}
+
+	event->filp = event_file;
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_install_in_context(ctx, event, cpu);
+	++ctx->generation;
+	mutex_unlock(&ctx->mutex);
+
+	event->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_event_mutex);
+	list_add_tail(&event->owner_entry, &current->perf_event_list);
+	mutex_unlock(&current->perf_event_mutex);
+
+err_fput_free_put_context:
+	fput_light(event_file, fput_needed2);
+
+err_free_put_context:
+	if (err < 0)
+		kfree(event);
+
+err_put_context:
+	if (err < 0)
+		put_ctx(ctx);
+
+	fput_light(group_file, fput_needed);
+
+	return err;
+}
+
+/*
+ * inherit a event from parent task to child task:
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+	      struct task_struct *parent,
+	      struct perf_event_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_event *group_leader,
+	      struct perf_event_context *child_ctx)
+{
+	struct perf_event *child_event;
+
+	/*
+	 * Instead of creating recursive hierarchies of events,
+	 * we link inherited events back to the original parent,
+	 * which has a filp for sure, which we use as the reference
+	 * count:
+	 */
+	if (parent_event->parent)
+		parent_event = parent_event->parent;
+
+	child_event = perf_event_alloc(&parent_event->attr,
+					   parent_event->cpu, child_ctx,
+					   group_leader, parent_event,
+					   GFP_KERNEL);
+	if (IS_ERR(child_event))
+		return child_event;
+	get_ctx(child_ctx);
+
+	/*
+	 * Make the child state follow the state of the parent event,
+	 * not its attr.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_event_{en, dis}able_family.
+	 */
+	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+		child_event->state = PERF_EVENT_STATE_INACTIVE;
+	else
+		child_event->state = PERF_EVENT_STATE_OFF;
+
+	if (parent_event->attr.freq)
+		child_event->hw.sample_period = parent_event->hw.sample_period;
+
+	/*
+	 * Link it up in the child's context:
+	 */
+	add_event_to_ctx(child_event, child_ctx);
+
+	/*
+	 * Get a reference to the parent filp - we will fput it
+	 * when the child event exits. This is safe to do because
+	 * we are in the parent and we know that the filp still
+	 * exists and has a nonzero count:
+	 */
+	atomic_long_inc(&parent_event->filp->f_count);
+
+	/*
+	 * Link this into the parent event's child list
+	 */
+	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+	mutex_lock(&parent_event->child_mutex);
+	list_add_tail(&child_event->child_list, &parent_event->child_list);
+	mutex_unlock(&parent_event->child_mutex);
+
+	return child_event;
+}
+
+static int inherit_group(struct perf_event *parent_event,
+	      struct task_struct *parent,
+	      struct perf_event_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_event_context *child_ctx)
+{
+	struct perf_event *leader;
+	struct perf_event *sub;
+	struct perf_event *child_ctr;
+
+	leader = inherit_event(parent_event, parent, parent_ctx,
+				 child, NULL, child_ctx);
+	if (IS_ERR(leader))
+		return PTR_ERR(leader);
+	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+		child_ctr = inherit_event(sub, parent, parent_ctx,
+					    child, leader, child_ctx);
+		if (IS_ERR(child_ctr))
+			return PTR_ERR(child_ctr);
+	}
+	return 0;
+}
+
+static void sync_child_event(struct perf_event *child_event,
+			       struct task_struct *child)
+{
+	struct perf_event *parent_event = child_event->parent;
+	u64 child_val;
+
+	if (child_event->attr.inherit_stat)
+		perf_event_read_event(child_event, child);
+
+	child_val = atomic64_read(&child_event->count);
+
+	/*
+	 * Add back the child's count to the parent's count:
+	 */
+	atomic64_add(child_val, &parent_event->count);
+	atomic64_add(child_event->total_time_enabled,
+		     &parent_event->child_total_time_enabled);
+	atomic64_add(child_event->total_time_running,
+		     &parent_event->child_total_time_running);
+
+	/*
+	 * Remove this event from the parent's list
+	 */
+	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+	mutex_lock(&parent_event->child_mutex);
+	list_del_init(&child_event->child_list);
+	mutex_unlock(&parent_event->child_mutex);
+
+	/*
+	 * Release the parent event, if this was the last
+	 * reference to it.
+	 */
+	fput(parent_event->filp);
+}
+
+static void
+__perf_event_exit_task(struct perf_event *child_event,
+			 struct perf_event_context *child_ctx,
+			 struct task_struct *child)
+{
+	struct perf_event *parent_event;
+
+	update_event_times(child_event);
+	perf_event_remove_from_context(child_event);
+
+	parent_event = child_event->parent;
+	/*
+	 * It can happen that parent exits first, and has events
+	 * that are still around due to the child reference. These
+	 * events need to be zapped - but otherwise linger.
+	 */
+	if (parent_event) {
+		sync_child_event(child_event, child);
+		free_event(child_event);
+	}
+}
+
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+	struct perf_event *child_event, *tmp;
+	struct perf_event_context *child_ctx;
+	unsigned long flags;
+
+	if (likely(!child->perf_event_ctxp)) {
+		perf_event_task(child, NULL, 0);
+		return;
+	}
+
+	local_irq_save(flags);
+	/*
+	 * We can't reschedule here because interrupts are disabled,
+	 * and either child is current or it is a task that can't be
+	 * scheduled, so we are now safe from rescheduling changing
+	 * our context.
+	 */
+	child_ctx = child->perf_event_ctxp;
+	__perf_event_task_sched_out(child_ctx);
+
+	/*
+	 * Take the context lock here so that if find_get_context is
+	 * reading child->perf_event_ctxp, we wait until it has
+	 * incremented the context's refcount before we do put_ctx below.
+	 */
+	spin_lock(&child_ctx->lock);
+	child->perf_event_ctxp = NULL;
+	/*
+	 * If this context is a clone; unclone it so it can't get
+	 * swapped to another process while we're removing all
+	 * the events from it.
+	 */
+	unclone_ctx(child_ctx);
+	spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+	/*
+	 * Report the task dead after unscheduling the events so that we
+	 * won't get any samples after PERF_RECORD_EXIT. We can however still
+	 * get a few PERF_RECORD_READ events.
+	 */
+	perf_event_task(child, child_ctx, 0);
+
+	/*
+	 * We can recurse on the same lock type through:
+	 *
+	 *   __perf_event_exit_task()
+	 *     sync_child_event()
+	 *       fput(parent_event->filp)
+	 *         perf_release()
+	 *           mutex_lock(&ctx->mutex)
+	 *
+	 * But since its the parent context it won't be the same instance.
+	 */
+	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+
+again:
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+				 group_entry)
+		__perf_event_exit_task(child_event, child_ctx, child);
+
+	/*
+	 * If the last event was a group event, it will have appended all
+	 * its siblings to the list, but we obtained 'tmp' before that which
+	 * will still point to the list head terminating the iteration.
+	 */
+	if (!list_empty(&child_ctx->group_list))
+		goto again;
+
+	mutex_unlock(&child_ctx->mutex);
+
+	put_ctx(child_ctx);
+}
+
+/*
+ * free an unexposed, unused context as created by inheritance by
+ * init_task below, used by fork() in case of fail.
+ */
+void perf_event_free_task(struct task_struct *task)
+{
+	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event *event, *tmp;
+
+	if (!ctx)
+		return;
+
+	mutex_lock(&ctx->mutex);
+again:
+	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
+		struct perf_event *parent = event->parent;
+
+		if (WARN_ON_ONCE(!parent))
+			continue;
+
+		mutex_lock(&parent->child_mutex);
+		list_del_init(&event->child_list);
+		mutex_unlock(&parent->child_mutex);
+
+		fput(parent->filp);
+
+		list_del_event(event, ctx);
+		free_event(event);
+	}
+
+	if (!list_empty(&ctx->group_list))
+		goto again;
+
+	mutex_unlock(&ctx->mutex);
+
+	put_ctx(ctx);
+}
+
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+	struct perf_event_context *child_ctx, *parent_ctx;
+	struct perf_event_context *cloned_ctx;
+	struct perf_event *event;
+	struct task_struct *parent = current;
+	int inherited_all = 1;
+	int ret = 0;
+
+	child->perf_event_ctxp = NULL;
+
+	mutex_init(&child->perf_event_mutex);
+	INIT_LIST_HEAD(&child->perf_event_list);
+
+	if (likely(!parent->perf_event_ctxp))
+		return 0;
+
+	/*
+	 * This is executed from the parent task context, so inherit
+	 * events that have been marked for cloning.
+	 * First allocate and initialize a context for the child.
+	 */
+
+	child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+	if (!child_ctx)
+		return -ENOMEM;
+
+	__perf_event_init_context(child_ctx, child);
+	child->perf_event_ctxp = child_ctx;
+	get_task_struct(child);
+
+	/*
+	 * If the parent's context is a clone, pin it so it won't get
+	 * swapped under us.
+	 */
+	parent_ctx = perf_pin_task_context(parent);
+
+	/*
+	 * No need to check if parent_ctx != NULL here; since we saw
+	 * it non-NULL earlier, the only reason for it to become NULL
+	 * is if we exit, and since we're currently in the middle of
+	 * a fork we can't be exiting at the same time.
+	 */
+
+	/*
+	 * Lock the parent list. No need to lock the child - not PID
+	 * hashed yet and not running, so nobody can access it.
+	 */
+	mutex_lock(&parent_ctx->mutex);
+
+	/*
+	 * We dont have to disable NMIs - we are only looking at
+	 * the list, not manipulating it:
+	 */
+	list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) {
+		if (event != event->group_leader)
+			continue;
+
+		if (!event->attr.inherit) {
+			inherited_all = 0;
+			continue;
+		}
+
+		ret = inherit_group(event, parent, parent_ctx,
+					     child, child_ctx);
+		if (ret) {
+			inherited_all = 0;
+			break;
+		}
+	}
+
+	if (inherited_all) {
+		/*
+		 * Mark the child context as a clone of the parent
+		 * context, or of whatever the parent is a clone of.
+		 * Note that if the parent is a clone, it could get
+		 * uncloned at any point, but that doesn't matter
+		 * because the list of events and the generation
+		 * count can't have changed since we took the mutex.
+		 */
+		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+		if (cloned_ctx) {
+			child_ctx->parent_ctx = cloned_ctx;
+			child_ctx->parent_gen = parent_ctx->parent_gen;
+		} else {
+			child_ctx->parent_ctx = parent_ctx;
+			child_ctx->parent_gen = parent_ctx->generation;
+		}
+		get_ctx(child_ctx->parent_ctx);
+	}
+
+	mutex_unlock(&parent_ctx->mutex);
+
+	perf_unpin_context(parent_ctx);
+
+	return ret;
+}
+
+static void __cpuinit perf_event_init_cpu(int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	__perf_event_init_context(&cpuctx->ctx, NULL);
+
+	spin_lock(&perf_resource_lock);
+	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
+	spin_unlock(&perf_resource_lock);
+
+	hw_perf_event_setup(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __perf_event_exit_cpu(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = &cpuctx->ctx;
+	struct perf_event *event, *tmp;
+
+	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+		__perf_event_remove_from_context(event);
+}
+static void perf_event_exit_cpu(int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_event_context *ctx = &cpuctx->ctx;
+
+	mutex_lock(&ctx->mutex);
+	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
+	mutex_unlock(&ctx->mutex);
+}
+#else
+static inline void perf_event_exit_cpu(int cpu) { }
+#endif
+
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action) {
+
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		perf_event_init_cpu(cpu);
+		break;
+
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		hw_perf_event_setup_online(cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		perf_event_exit_cpu(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+	.notifier_call		= perf_cpu_notify,
+	.priority		= 20,
+};
+
+void __init perf_event_init(void)
+{
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+			(void *)(long)smp_processor_id());
+	register_cpu_notifier(&perf_cpu_nb);
+}
+
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+			const char *buf,
+			size_t count)
+{
+	struct perf_cpu_context *cpuctx;
+	unsigned long val;
+	int err, cpu, mpt;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > perf_max_events)
+		return -EINVAL;
+
+	spin_lock(&perf_resource_lock);
+	perf_reserved_percpu = val;
+	for_each_online_cpu(cpu) {
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		spin_lock_irq(&cpuctx->ctx.lock);
+		mpt = min(perf_max_events - cpuctx->ctx.nr_events,
+			  perf_max_events - perf_reserved_percpu);
+		cpuctx->max_pertask = mpt;
+		spin_unlock_irq(&cpuctx->ctx.lock);
+	}
+	spin_unlock(&perf_resource_lock);
+
+	return count;
+}
+
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > 1)
+		return -EINVAL;
+
+	spin_lock(&perf_resource_lock);
+	perf_overcommit = val;
+	spin_unlock(&perf_resource_lock);
+
+	return count;
+}
+
+static SYSDEV_CLASS_ATTR(
+				reserve_percpu,
+				0644,
+				perf_show_reserve_percpu,
+				perf_set_reserve_percpu
+			);
+
+static SYSDEV_CLASS_ATTR(
+				overcommit,
+				0644,
+				perf_show_overcommit,
+				perf_set_overcommit
+			);
+
+static struct attribute *perfclass_attrs[] = {
+	&attr_reserve_percpu.attr,
+	&attr_overcommit.attr,
+	NULL
+};
+
+static struct attribute_group perfclass_attr_group = {
+	.attrs			= perfclass_attrs,
+	.name			= "perf_events",
+};
+
+static int __init perf_event_sysfs_init(void)
+{
+	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+				  &perfclass_attr_group);
+}
+device_initcall(perf_event_sysfs_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index faf4d463bbff..291c8d213d13 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -2059,7 +2059,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
 #endif
-		perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
 				     1, 1, NULL, 0);
 	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2724,7 +2724,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
-	perf_counter_task_sched_in(current, cpu_of(rq));
+	perf_event_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
@@ -5199,7 +5199,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
-	perf_counter_task_tick(curr, cpu);
+	perf_event_task_tick(curr, cpu);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
@@ -5415,7 +5415,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
-		perf_counter_task_sched_out(prev, next, cpu);
+		perf_event_task_sched_out(prev, next, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -7692,7 +7692,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 /*
  * Register at high priority so that task migration (migrate_all_tasks)
  * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_counter subsystem, though.
+ * the notifier in the perf_event subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
@@ -9549,7 +9549,7 @@ void __init sched_init(void)
 	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
-	perf_counter_init();
+	perf_event_init();
 
 	scheduler_running = 1;
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..ea5c3bcac881 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1511,11 +1511,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_TSC:
 			error = SET_TSC_CTL(arg2);
 			break;
-		case PR_TASK_PERF_COUNTERS_DISABLE:
-			error = perf_counter_task_disable();
+		case PR_TASK_PERF_EVENTS_DISABLE:
+			error = perf_event_task_disable();
 			break;
-		case PR_TASK_PERF_COUNTERS_ENABLE:
-			error = perf_counter_task_enable();
+		case PR_TASK_PERF_EVENTS_ENABLE:
+			error = perf_event_task_enable();
 			break;
 		case PR_GET_TIMERSLACK:
 			error = current->timer_slack_ns;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6b07b5..515bc230ac2a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -177,4 +177,4 @@ cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
 
 /* performance counters: */
-cond_syscall(sys_perf_counter_open);
+cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1a631ba684a4..6ba49c7cb128 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,7 +50,7 @@
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
 #include <linux/slow-work.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -964,28 +964,28 @@ static struct ctl_table kern_table[] = {
 		.child		= slow_work_sysctls,
 	},
 #endif
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "perf_counter_paranoid",
-		.data		= &sysctl_perf_counter_paranoid,
-		.maxlen		= sizeof(sysctl_perf_counter_paranoid),
+		.procname	= "perf_event_paranoid",
+		.data		= &sysctl_perf_event_paranoid,
+		.maxlen		= sizeof(sysctl_perf_event_paranoid),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "perf_counter_mlock_kb",
-		.data		= &sysctl_perf_counter_mlock,
-		.maxlen		= sizeof(sysctl_perf_counter_mlock),
+		.procname	= "perf_event_mlock_kb",
+		.data		= &sysctl_perf_event_mlock,
+		.maxlen		= sizeof(sysctl_perf_event_mlock),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "perf_counter_max_sample_rate",
-		.data		= &sysctl_perf_counter_sample_rate,
-		.maxlen		= sizeof(sysctl_perf_counter_sample_rate),
+		.procname	= "perf_event_max_sample_rate",
+		.data		= &sysctl_perf_event_sample_rate,
+		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
diff --git a/kernel/timer.c b/kernel/timer.c
index bbb51074680e..811e5c391456 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/sched.h>
 
 #include <asm/uaccess.h>
@@ -1187,7 +1187,7 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 
-	perf_counter_do_pending();
+	perf_event_do_pending();
 
 	hrtimer_run_pending();
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..233f3483ac83 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,7 @@
 #include <trace/events/syscalls.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
@@ -414,7 +414,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 		rec->nr = syscall_nr;
 		syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 				       (unsigned long *)&rec->args);
-		perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+		perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 	} while(0);
 }
 
@@ -476,7 +476,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	rec.nr = syscall_nr;
 	rec.ret = syscall_get_return_value(current, regs);
 
-	perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+	perf_tp_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
 }
 
 int reg_prof_syscall_exit(char *name)
diff --git a/mm/mmap.c b/mm/mmap.c
index 26892e346d8f..376492ed08f4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,7 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1220,7 +1220,7 @@ munmap_back:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
-	perf_counter_mmap(vma);
+	perf_event_mmap(vma);
 
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2308,7 +2308,7 @@ int install_special_mapping(struct mm_struct *mm,
 
 	mm->total_vm += len >> PAGE_SHIFT;
 
-	perf_counter_mmap(vma);
+	perf_event_mmap(vma);
 
 	return 0;
 }
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d80311baeb2d..8bc969d8112d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,7 @@
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
 		if (error)
 			goto out;
-		perf_counter_mmap(vma);
+		perf_event_mmap(vma);
 		nstart = tmp;
 
 		if (nstart < prev->vm_end)
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 0aba8b6e9c54..b5f1953b6144 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -318,7 +318,7 @@ export PERL_PATH
 
 LIB_FILE=libperf.a
 
-LIB_H += ../../include/linux/perf_counter.h
+LIB_H += ../../include/linux/perf_event.h
 LIB_H += ../../include/linux/rbtree.h
 LIB_H += ../../include/linux/list.h
 LIB_H += util/include/linux/list.h
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 043d85b7e254..1ec741615814 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -505,7 +505,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		return -1;
 	}
 
-	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+	if (event->header.misc & PERF_RECORD_MISC_KERNEL) {
 		show = SHOW_KERNEL;
 		level = 'k';
 
@@ -513,7 +513,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 
 		dump_printf(" ...... dso: %s\n", dso->name);
 
-	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+	} else if (event->header.misc & PERF_RECORD_MISC_USER) {
 
 		show = SHOW_USER;
 		level = '.';
@@ -565,7 +565,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 
 	thread = threads__findnew(event->mmap.pid, &threads, &last_match);
 
-	dump_printf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+	dump_printf("%p [%p]: PERF_RECORD_MMAP %d: [%p(%p) @ %p]: %s\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->mmap.pid,
@@ -575,7 +575,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 		event->mmap.filename);
 
 	if (thread == NULL || map == NULL) {
-		dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
 		return 0;
 	}
 
@@ -591,14 +591,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	struct thread *thread;
 
 	thread = threads__findnew(event->comm.pid, &threads, &last_match);
-	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+	dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->comm.comm, event->comm.pid);
 
 	if (thread == NULL ||
 	    thread__set_comm(thread, event->comm.comm)) {
-		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
 		return -1;
 	}
 	total_comm++;
@@ -614,7 +614,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
 
 	thread = threads__findnew(event->fork.pid, &threads, &last_match);
 	parent = threads__findnew(event->fork.ppid, &threads, &last_match);
-	dump_printf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+	dump_printf("%p [%p]: PERF_RECORD_FORK: %d:%d\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->fork.pid, event->fork.ppid);
@@ -627,7 +627,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
 		return 0;
 
 	if (!thread || !parent || thread__fork(thread, parent)) {
-		dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
 		return -1;
 	}
 	total_fork++;
@@ -639,23 +639,23 @@ static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	switch (event->header.type) {
-	case PERF_EVENT_SAMPLE:
+	case PERF_RECORD_SAMPLE:
 		return process_sample_event(event, offset, head);
 
-	case PERF_EVENT_MMAP:
+	case PERF_RECORD_MMAP:
 		return process_mmap_event(event, offset, head);
 
-	case PERF_EVENT_COMM:
+	case PERF_RECORD_COMM:
 		return process_comm_event(event, offset, head);
 
-	case PERF_EVENT_FORK:
+	case PERF_RECORD_FORK:
 		return process_fork_event(event, offset, head);
 	/*
 	 * We dont process them right now but they are fine:
 	 */
 
-	case PERF_EVENT_THROTTLE:
-	case PERF_EVENT_UNTHROTTLE:
+	case PERF_RECORD_THROTTLE:
+	case PERF_RECORD_UNTHROTTLE:
 		return 0;
 
 	default:
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2459e5a22ed8..a5a050af8e7d 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -77,7 +77,7 @@ static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
 static unsigned long mmap_read_head(struct mmap_data *md)
 {
-	struct perf_counter_mmap_page *pc = md->base;
+	struct perf_event_mmap_page *pc = md->base;
 	long head;
 
 	head = pc->data_head;
@@ -88,7 +88,7 @@ static unsigned long mmap_read_head(struct mmap_data *md)
 
 static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
 {
-	struct perf_counter_mmap_page *pc = md->base;
+	struct perf_event_mmap_page *pc = md->base;
 
 	/*
 	 * ensure all reads are done before we write the tail out.
@@ -233,7 +233,7 @@ static pid_t pid_synthesize_comm_event(pid_t pid, int full)
 		}
 	}
 
-	comm_ev.header.type = PERF_EVENT_COMM;
+	comm_ev.header.type = PERF_RECORD_COMM;
 	size = ALIGN(size, sizeof(u64));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
@@ -288,7 +288,7 @@ static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid)
 	while (1) {
 		char bf[BUFSIZ], *pbf = bf;
 		struct mmap_event mmap_ev = {
-			.header = { .type = PERF_EVENT_MMAP },
+			.header = { .type = PERF_RECORD_MMAP },
 		};
 		int n;
 		size_t size;
@@ -355,7 +355,7 @@ static void synthesize_all(void)
 
 static int group_fd;
 
-static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr)
+static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
 {
 	struct perf_header_attr *h_attr;
 
@@ -371,7 +371,7 @@ static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int
 
 static void create_counter(int counter, int cpu, pid_t pid)
 {
-	struct perf_counter_attr *attr = attrs + counter;
+	struct perf_event_attr *attr = attrs + counter;
 	struct perf_header_attr *h_attr;
 	int track = !counter; /* only the first counter needs these */
 	struct {
@@ -417,7 +417,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr->disabled		= 1;
 
 try_again:
-	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
+	fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0);
 
 	if (fd[nr_cpu][counter] < 0) {
 		int err = errno;
@@ -444,7 +444,7 @@ try_again:
 		printf("\n");
 		error("perfcounter syscall returned with %d (%s)\n",
 			fd[nr_cpu][counter], strerror(err));
-		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 		exit(-1);
 	}
 
@@ -478,7 +478,7 @@ try_again:
 	if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
 		int ret;
 
-		ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd);
+		ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd);
 		assert(ret != -1);
 	} else {
 		event_array[nr_poll].fd = fd[nr_cpu][counter];
@@ -496,7 +496,7 @@ try_again:
 		}
 	}
 
-	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
+	ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE);
 }
 
 static void open_counters(int cpu, pid_t pid)
@@ -642,7 +642,7 @@ static int __cmd_record(int argc, const char **argv)
 		if (done) {
 			for (i = 0; i < nr_cpu; i++) {
 				for (counter = 0; counter < nr_counters; counter++)
-					ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE);
+					ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE);
 			}
 		}
 	}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index cdf9a8d27bb9..19669c20088e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1121,7 +1121,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		more_data += sizeof(u64);
 	}
 
-	dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+	dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
@@ -1158,9 +1158,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 	if (comm_list && !strlist__has_entry(comm_list, thread->comm))
 		return 0;
 
-	cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+	cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
-	if (cpumode == PERF_EVENT_MISC_KERNEL) {
+	if (cpumode == PERF_RECORD_MISC_KERNEL) {
 		show = SHOW_KERNEL;
 		level = 'k';
 
@@ -1168,7 +1168,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 
 		dump_printf(" ...... dso: %s\n", dso->name);
 
-	} else if (cpumode == PERF_EVENT_MISC_USER) {
+	} else if (cpumode == PERF_RECORD_MISC_USER) {
 
 		show = SHOW_USER;
 		level = '.';
@@ -1210,7 +1210,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 
 	thread = threads__findnew(event->mmap.pid, &threads, &last_match);
 
-	dump_printf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
+	dump_printf("%p [%p]: PERF_RECORD_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->mmap.pid,
@@ -1221,7 +1221,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 		event->mmap.filename);
 
 	if (thread == NULL || map == NULL) {
-		dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
 		return 0;
 	}
 
@@ -1238,14 +1238,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 
 	thread = threads__findnew(event->comm.pid, &threads, &last_match);
 
-	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+	dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->comm.comm, event->comm.pid);
 
 	if (thread == NULL ||
 	    thread__set_comm_adjust(thread, event->comm.comm)) {
-		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
 		return -1;
 	}
 	total_comm++;
@@ -1262,10 +1262,10 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
 	thread = threads__findnew(event->fork.pid, &threads, &last_match);
 	parent = threads__findnew(event->fork.ppid, &threads, &last_match);
 
-	dump_printf("%p [%p]: PERF_EVENT_%s: (%d:%d):(%d:%d)\n",
+	dump_printf("%p [%p]: PERF_RECORD_%s: (%d:%d):(%d:%d)\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
-		event->header.type == PERF_EVENT_FORK ? "FORK" : "EXIT",
+		event->header.type == PERF_RECORD_FORK ? "FORK" : "EXIT",
 		event->fork.pid, event->fork.tid,
 		event->fork.ppid, event->fork.ptid);
 
@@ -1276,11 +1276,11 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
 	if (thread == parent)
 		return 0;
 
-	if (event->header.type == PERF_EVENT_EXIT)
+	if (event->header.type == PERF_RECORD_EXIT)
 		return 0;
 
 	if (!thread || !parent || thread__fork(thread, parent)) {
-		dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
 		return -1;
 	}
 	total_fork++;
@@ -1291,7 +1291,7 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
 static int
 process_lost_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	dump_printf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n",
+	dump_printf("%p [%p]: PERF_RECORD_LOST: id:%Ld: lost:%Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->lost.id,
@@ -1305,7 +1305,7 @@ process_lost_event(event_t *event, unsigned long offset, unsigned long head)
 static int
 process_read_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	struct perf_counter_attr *attr;
+	struct perf_event_attr *attr;
 
 	attr = perf_header__find_attr(event->read.id, header);
 
@@ -1319,7 +1319,7 @@ process_read_event(event_t *event, unsigned long offset, unsigned long head)
 					   event->read.value);
 	}
 
-	dump_printf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n",
+	dump_printf("%p [%p]: PERF_RECORD_READ: %d %d %s %Lu\n",
 			(void *)(offset + head),
 			(void *)(long)(event->header.size),
 			event->read.pid,
@@ -1337,31 +1337,31 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	trace_event(event);
 
 	switch (event->header.type) {
-	case PERF_EVENT_SAMPLE:
+	case PERF_RECORD_SAMPLE:
 		return process_sample_event(event, offset, head);
 
-	case PERF_EVENT_MMAP:
+	case PERF_RECORD_MMAP:
 		return process_mmap_event(event, offset, head);
 
-	case PERF_EVENT_COMM:
+	case PERF_RECORD_COMM:
 		return process_comm_event(event, offset, head);
 
-	case PERF_EVENT_FORK:
-	case PERF_EVENT_EXIT:
+	case PERF_RECORD_FORK:
+	case PERF_RECORD_EXIT:
 		return process_task_event(event, offset, head);
 
-	case PERF_EVENT_LOST:
+	case PERF_RECORD_LOST:
 		return process_lost_event(event, offset, head);
 
-	case PERF_EVENT_READ:
+	case PERF_RECORD_READ:
 		return process_read_event(event, offset, head);
 
 	/*
 	 * We dont process them right now but they are fine:
 	 */
 
-	case PERF_EVENT_THROTTLE:
-	case PERF_EVENT_UNTHROTTLE:
+	case PERF_RECORD_THROTTLE:
+	case PERF_RECORD_UNTHROTTLE:
 		return 0;
 
 	default:
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 275d79c6627a..ea9c15c0cdfe 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1573,7 +1573,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		more_data += sizeof(u64);
 	}
 
-	dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+	dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
@@ -1589,9 +1589,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		return -1;
 	}
 
-	cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+	cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
-	if (cpumode == PERF_EVENT_MISC_KERNEL) {
+	if (cpumode == PERF_RECORD_MISC_KERNEL) {
 		show = SHOW_KERNEL;
 		level = 'k';
 
@@ -1599,7 +1599,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 
 		dump_printf(" ...... dso: %s\n", dso->name);
 
-	} else if (cpumode == PERF_EVENT_MISC_USER) {
+	} else if (cpumode == PERF_RECORD_MISC_USER) {
 
 		show = SHOW_USER;
 		level = '.';
@@ -1626,23 +1626,23 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 
 	nr_events++;
 	switch (event->header.type) {
-	case PERF_EVENT_MMAP:
+	case PERF_RECORD_MMAP:
 		return 0;
-	case PERF_EVENT_LOST:
+	case PERF_RECORD_LOST:
 		nr_lost_chunks++;
 		nr_lost_events += event->lost.lost;
 		return 0;
 
-	case PERF_EVENT_COMM:
+	case PERF_RECORD_COMM:
 		return process_comm_event(event, offset, head);
 
-	case PERF_EVENT_EXIT ... PERF_EVENT_READ:
+	case PERF_RECORD_EXIT ... PERF_RECORD_READ:
 		return 0;
 
-	case PERF_EVENT_SAMPLE:
+	case PERF_RECORD_SAMPLE:
 		return process_sample_event(event, offset, head);
 
-	case PERF_EVENT_MAX:
+	case PERF_RECORD_MAX:
 	default:
 		return -1;
 	}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 61b828236c11..16af2d82e858 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -48,7 +48,7 @@
 #include <sys/prctl.h>
 #include <math.h>
 
-static struct perf_counter_attr default_attrs[] = {
+static struct perf_event_attr default_attrs[] = {
 
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK	},
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
@@ -130,11 +130,11 @@ struct stats			runtime_cycles_stats;
 	 attrs[counter].config == PERF_COUNT_##c)
 
 #define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n"
+"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
 
 static void create_perf_stat_counter(int counter, int pid)
 {
-	struct perf_counter_attr *attr = attrs + counter;
+	struct perf_event_attr *attr = attrs + counter;
 
 	if (scale)
 		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -144,7 +144,7 @@ static void create_perf_stat_counter(int counter, int pid)
 		unsigned int cpu;
 
 		for (cpu = 0; cpu < nr_cpus; cpu++) {
-			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
+			fd[cpu][counter] = sys_perf_event_open(attr, -1, cpu, -1, 0);
 			if (fd[cpu][counter] < 0 && verbose)
 				fprintf(stderr, ERR_PERF_OPEN, counter,
 					fd[cpu][counter], strerror(errno));
@@ -154,7 +154,7 @@ static void create_perf_stat_counter(int counter, int pid)
 		attr->disabled	     = 1;
 		attr->enable_on_exec = 1;
 
-		fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0);
+		fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0);
 		if (fd[0][counter] < 0 && verbose)
 			fprintf(stderr, ERR_PERF_OPEN, counter,
 				fd[0][counter], strerror(errno));
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 600406396274..4405681b3134 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -937,21 +937,21 @@ process_event(event_t *event)
 
 	switch (event->header.type) {
 
-	case PERF_EVENT_COMM:
+	case PERF_RECORD_COMM:
 		return process_comm_event(event);
-	case PERF_EVENT_FORK:
+	case PERF_RECORD_FORK:
 		return process_fork_event(event);
-	case PERF_EVENT_EXIT:
+	case PERF_RECORD_EXIT:
 		return process_exit_event(event);
-	case PERF_EVENT_SAMPLE:
+	case PERF_RECORD_SAMPLE:
 		return queue_sample_event(event);
 
 	/*
 	 * We dont process them right now but they are fine:
 	 */
-	case PERF_EVENT_MMAP:
-	case PERF_EVENT_THROTTLE:
-	case PERF_EVENT_UNTHROTTLE:
+	case PERF_RECORD_MMAP:
+	case PERF_RECORD_THROTTLE:
+	case PERF_RECORD_UNTHROTTLE:
 		return 0;
 
 	default:
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 4002ccb36750..1ca88896eee4 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -901,7 +901,7 @@ struct mmap_data {
 
 static unsigned int mmap_read_head(struct mmap_data *md)
 {
-	struct perf_counter_mmap_page *pc = md->base;
+	struct perf_event_mmap_page *pc = md->base;
 	int head;
 
 	head = pc->data_head;
@@ -977,9 +977,9 @@ static void mmap_read_counter(struct mmap_data *md)
 
 		old += size;
 
-		if (event->header.type == PERF_EVENT_SAMPLE) {
+		if (event->header.type == PERF_RECORD_SAMPLE) {
 			int user =
-	(event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER;
+	(event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_USER;
 			process_event(event->ip.ip, md->counter, user);
 		}
 	}
@@ -1005,7 +1005,7 @@ int group_fd;
 
 static void start_counter(int i, int counter)
 {
-	struct perf_counter_attr *attr;
+	struct perf_event_attr *attr;
 	int cpu;
 
 	cpu = profile_cpu;
@@ -1019,7 +1019,7 @@ static void start_counter(int i, int counter)
 	attr->inherit		= (cpu < 0) && inherit;
 
 try_again:
-	fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
+	fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0);
 
 	if (fd[i][counter] < 0) {
 		int err = errno;
@@ -1044,7 +1044,7 @@ try_again:
 		printf("\n");
 		error("perfcounter syscall returned with %d (%s)\n",
 			fd[i][counter], strerror(err));
-		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 		exit(-1);
 	}
 	assert(fd[i][counter] >= 0);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 914ab366e369..e9d256e2f47d 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -35,14 +35,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 
 	thread = threads__findnew(event->comm.pid, &threads, &last_match);
 
-	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+	dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->comm.comm, event->comm.pid);
 
 	if (thread == NULL ||
 	    thread__set_comm(thread, event->comm.comm)) {
-		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
 		return -1;
 	}
 	total_comm++;
@@ -82,7 +82,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		more_data += sizeof(u64);
 	}
 
-	dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+	dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
@@ -98,9 +98,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		return -1;
 	}
 
-	cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+	cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
-	if (cpumode == PERF_EVENT_MISC_KERNEL) {
+	if (cpumode == PERF_RECORD_MISC_KERNEL) {
 		show = SHOW_KERNEL;
 		level = 'k';
 
@@ -108,7 +108,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 
 		dump_printf(" ...... dso: %s\n", dso->name);
 
-	} else if (cpumode == PERF_EVENT_MISC_USER) {
+	} else if (cpumode == PERF_RECORD_MISC_USER) {
 
 		show = SHOW_USER;
 		level = '.';
@@ -146,19 +146,19 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	trace_event(event);
 
 	switch (event->header.type) {
-	case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+	case PERF_RECORD_MMAP ... PERF_RECORD_LOST:
 		return 0;
 
-	case PERF_EVENT_COMM:
+	case PERF_RECORD_COMM:
 		return process_comm_event(event, offset, head);
 
-	case PERF_EVENT_EXIT ... PERF_EVENT_READ:
+	case PERF_RECORD_EXIT ... PERF_RECORD_READ:
 		return 0;
 
-	case PERF_EVENT_SAMPLE:
+	case PERF_RECORD_SAMPLE:
 		return process_sample_event(event, offset, head);
 
-	case PERF_EVENT_MAX:
+	case PERF_RECORD_MAX:
 	default:
 		return -1;
 	}
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index f71e0d245cba..f1946d107b10 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -18,10 +18,10 @@ underlying hardware counters.
 Performance counters are accessed via special file descriptors.
 There's one file descriptor per virtual counter used.
 
-The special file descriptor is opened via the perf_counter_open()
+The special file descriptor is opened via the perf_event_open()
 system call:
 
-   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+   int sys_perf_event_open(struct perf_event_hw_event *hw_event_uptr,
 			     pid_t pid, int cpu, int group_fd,
 			     unsigned long flags);
 
@@ -32,9 +32,9 @@ can be used to set the blocking mode, etc.
 Multiple counters can be kept open at a time, and the counters
 can be poll()ed.
 
-When creating a new counter fd, 'perf_counter_hw_event' is:
+When creating a new counter fd, 'perf_event_hw_event' is:
 
-struct perf_counter_hw_event {
+struct perf_event_hw_event {
         /*
          * The MSB of the config word signifies if the rest contains cpu
          * specific (raw) counter configuration data, if unset, the next
@@ -93,7 +93,7 @@ specified by 'event_id':
 
 /*
  * Generalized performance counter event types, used by the hw_event.event_id
- * parameter of the sys_perf_counter_open() syscall:
+ * parameter of the sys_perf_event_open() syscall:
  */
 enum hw_event_ids {
 	/*
@@ -159,7 +159,7 @@ in size.
  * reads on the counter should return the indicated quantities,
  * in increasing order of bit value, after the counter value.
  */
-enum perf_counter_read_format {
+enum perf_event_read_format {
         PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
         PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
 };
@@ -178,7 +178,7 @@ interrupt:
  * Bits that can be set in hw_event.record_type to request information
  * in the overflow packets.
  */
-enum perf_counter_record_format {
+enum perf_event_record_format {
         PERF_RECORD_IP          = 1U << 0,
         PERF_RECORD_TID         = 1U << 1,
         PERF_RECORD_TIME        = 1U << 2,
@@ -228,7 +228,7 @@ these events are recorded in the ring-buffer (see below).
 The 'comm' bit allows tracking of process comm data on process creation.
 This too is recorded in the ring-buffer (see below).
 
-The 'pid' parameter to the perf_counter_open() system call allows the
+The 'pid' parameter to the perf_event_open() system call allows the
 counter to be specific to a task:
 
  pid == 0: if the pid parameter is zero, the counter is attached to the
@@ -258,7 +258,7 @@ The 'flags' parameter is currently unused and must be zero.
 
 The 'group_fd' parameter allows counter "groups" to be set up.  A
 counter group has one counter which is the group "leader".  The leader
-is created first, with group_fd = -1 in the perf_counter_open call
+is created first, with group_fd = -1 in the perf_event_open call
 that creates it.  The rest of the group members are created
 subsequently, with group_fd giving the fd of the group leader.
 (A single counter on its own is created with group_fd = -1 and is
@@ -277,13 +277,13 @@ tracking are logged into a ring-buffer. This ring-buffer is created and
 accessed through mmap().
 
 The mmap size should be 1+2^n pages, where the first page is a meta-data page
-(struct perf_counter_mmap_page) that contains various bits of information such
+(struct perf_event_mmap_page) that contains various bits of information such
 as where the ring-buffer head is.
 
 /*
  * Structure of the page that can be mapped via mmap
  */
-struct perf_counter_mmap_page {
+struct perf_event_mmap_page {
         __u32   version;                /* version number of this structure */
         __u32   compat_version;         /* lowest version this is compat with */
 
@@ -317,7 +317,7 @@ struct perf_counter_mmap_page {
          * Control data for the mmap() data buffer.
          *
          * User-space reading this value should issue an rmb(), on SMP capable
-         * platforms, after reading this value -- see perf_counter_wakeup().
+         * platforms, after reading this value -- see perf_event_wakeup().
          */
         __u32   data_head;              /* head in the data section */
 };
@@ -327,9 +327,9 @@ NOTE: the hw-counter userspace bits are arch specific and are currently only
 
 The following 2^n pages are the ring-buffer which contains events of the form:
 
-#define PERF_EVENT_MISC_KERNEL          (1 << 0)
-#define PERF_EVENT_MISC_USER            (1 << 1)
-#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+#define PERF_RECORD_MISC_KERNEL          (1 << 0)
+#define PERF_RECORD_MISC_USER            (1 << 1)
+#define PERF_RECORD_MISC_OVERFLOW        (1 << 2)
 
 struct perf_event_header {
         __u32   type;
@@ -353,8 +353,8 @@ enum perf_event_type {
          *      char                            filename[];
          * };
          */
-        PERF_EVENT_MMAP                 = 1,
-        PERF_EVENT_MUNMAP               = 2,
+        PERF_RECORD_MMAP                 = 1,
+        PERF_RECORD_MUNMAP               = 2,
 
         /*
          * struct {
@@ -364,10 +364,10 @@ enum perf_event_type {
          *      char                            comm[];
          * };
          */
-        PERF_EVENT_COMM                 = 3,
+        PERF_RECORD_COMM                 = 3,
 
         /*
-         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+         * When header.misc & PERF_RECORD_MISC_OVERFLOW the event_type field
          * will be PERF_RECORD_*
          *
          * struct {
@@ -397,7 +397,7 @@ Notification of new events is possible through poll()/select()/epoll() and
 fcntl() managing signals.
 
 Normally a notification is generated for every page filled, however one can
-additionally set perf_counter_hw_event.wakeup_events to generate one every
+additionally set perf_event_hw_event.wakeup_events to generate one every
 so many counter overflow events.
 
 Future work will include a splice() interface to the ring-buffer.
@@ -409,11 +409,11 @@ events but does continue to exist and maintain its count value.
 
 An individual counter or counter group can be enabled with
 
-	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+	ioctl(fd, PERF_EVENT_IOC_ENABLE);
 
 or disabled with
 
-	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+	ioctl(fd, PERF_EVENT_IOC_DISABLE);
 
 Enabling or disabling the leader of a group enables or disables the
 whole group; that is, while the group leader is disabled, none of the
@@ -424,16 +424,16 @@ other counter.
 
 Additionally, non-inherited overflow counters can use
 
-	ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+	ioctl(fd, PERF_EVENT_IOC_REFRESH, nr);
 
 to enable a counter for 'nr' events, after which it gets disabled again.
 
 A process can enable or disable all the counter groups that are
 attached to it, using prctl:
 
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+	prctl(PR_TASK_PERF_EVENTS_ENABLE);
 
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	prctl(PR_TASK_PERF_EVENTS_DISABLE);
 
 This applies to all counters on the current process, whether created
 by this process or by another, and doesn't affect any counters that
@@ -447,11 +447,11 @@ Arch requirements
 If your architecture does not have hardware performance metrics, you can
 still use the generic software counters based on hrtimers for sampling.
 
-So to start with, in order to add HAVE_PERF_COUNTERS to your Kconfig, you
+So to start with, in order to add HAVE_PERF_EVENTS to your Kconfig, you
 will need at least this:
-	- asm/perf_counter.h - a basic stub will suffice at first
+	- asm/perf_event.h - a basic stub will suffice at first
 	- support for atomic64 types (and associated helper functions)
-	- set_perf_counter_pending() implemented
+	- set_perf_event_pending() implemented
 
 If your architecture does have hardware capabilities, you can override the
-weak stub hw_perf_counter_init() to register hardware counters.
+weak stub hw_perf_event_init() to register hardware counters.
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 2abeb20d0bf3..8cc4623afd6f 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -52,15 +52,15 @@
 #include <sys/types.h>
 #include <sys/syscall.h>
 
-#include "../../include/linux/perf_counter.h"
+#include "../../include/linux/perf_event.h"
 #include "util/types.h"
 
 /*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
  * counters in the current task.
  */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
+#define PR_TASK_PERF_EVENTS_DISABLE   31
+#define PR_TASK_PERF_EVENTS_ENABLE    32
 
 #ifndef NSEC_PER_SEC
 # define NSEC_PER_SEC			1000000000ULL
@@ -90,12 +90,12 @@ static inline unsigned long long rdclock(void)
 	_min1 < _min2 ? _min1 : _min2; })
 
 static inline int
-sys_perf_counter_open(struct perf_counter_attr *attr,
+sys_perf_event_open(struct perf_event_attr *attr,
 		      pid_t pid, int cpu, int group_fd,
 		      unsigned long flags)
 {
 	attr->size = sizeof(*attr);
-	return syscall(__NR_perf_counter_open, attr, pid, cpu,
+	return syscall(__NR_perf_event_open, attr, pid, cpu,
 		       group_fd, flags);
 }
 
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 018d414a09d1..2c9c26d6ded0 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -1,5 +1,5 @@
-#ifndef __PERF_EVENT_H
-#define __PERF_EVENT_H
+#ifndef __PERF_RECORD_H
+#define __PERF_RECORD_H
 #include "../perf.h"
 #include "util.h"
 #include <linux/list.h>
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index bb4fca3efcc3..e306857b2c2b 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -9,7 +9,7 @@
 /*
  * Create new perf.data header attribute:
  */
-struct perf_header_attr *perf_header_attr__new(struct perf_counter_attr *attr)
+struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr)
 {
 	struct perf_header_attr *self = malloc(sizeof(*self));
 
@@ -134,7 +134,7 @@ struct perf_file_section {
 };
 
 struct perf_file_attr {
-	struct perf_counter_attr	attr;
+	struct perf_event_attr	attr;
 	struct perf_file_section	ids;
 };
 
@@ -320,7 +320,7 @@ u64 perf_header__sample_type(struct perf_header *header)
 	return type;
 }
 
-struct perf_counter_attr *
+struct perf_event_attr *
 perf_header__find_attr(u64 id, struct perf_header *header)
 {
 	int i;
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 7b0e84a87179..a0761bc7863c 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -1,12 +1,12 @@
 #ifndef _PERF_HEADER_H
 #define _PERF_HEADER_H
 
-#include "../../../include/linux/perf_counter.h"
+#include "../../../include/linux/perf_event.h"
 #include <sys/types.h>
 #include "types.h"
 
 struct perf_header_attr {
-	struct perf_counter_attr attr;
+	struct perf_event_attr attr;
 	int ids, size;
 	u64 *id;
 	off_t id_offset;
@@ -34,11 +34,11 @@ char *perf_header__find_event(u64 id);
 
 
 struct perf_header_attr *
-perf_header_attr__new(struct perf_counter_attr *attr);
+perf_header_attr__new(struct perf_event_attr *attr);
 void perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
 
 u64 perf_header__sample_type(struct perf_header *header);
-struct perf_counter_attr *
+struct perf_event_attr *
 perf_header__find_attr(u64 id, struct perf_header *header);
 
 
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 89172fd0038b..13ab4b842d49 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -10,7 +10,7 @@
 
 int					nr_counters;
 
-struct perf_counter_attr		attrs[MAX_COUNTERS];
+struct perf_event_attr		attrs[MAX_COUNTERS];
 
 struct event_symbol {
 	u8		type;
@@ -48,13 +48,13 @@ static struct event_symbol event_symbols[] = {
   { CSW(CPU_MIGRATIONS),	"cpu-migrations",	"migrations"	},
 };
 
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+#define __PERF_EVENT_FIELD(config, name) \
+	((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT)
 
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+#define PERF_EVENT_RAW(config)	__PERF_EVENT_FIELD(config, RAW)
+#define PERF_EVENT_CONFIG(config)	__PERF_EVENT_FIELD(config, CONFIG)
+#define PERF_EVENT_TYPE(config)	__PERF_EVENT_FIELD(config, TYPE)
+#define PERF_EVENT_ID(config)		__PERF_EVENT_FIELD(config, EVENT)
 
 static const char *hw_event_names[] = {
 	"cycles",
@@ -352,7 +352,7 @@ static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int
 }
 
 static enum event_result
-parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
+parse_generic_hw_event(const char **str, struct perf_event_attr *attr)
 {
 	const char *s = *str;
 	int cache_type = -1, cache_op = -1, cache_result = -1;
@@ -417,7 +417,7 @@ parse_single_tracepoint_event(char *sys_name,
 			      const char *evt_name,
 			      unsigned int evt_length,
 			      char *flags,
-			      struct perf_counter_attr *attr,
+			      struct perf_event_attr *attr,
 			      const char **strp)
 {
 	char evt_path[MAXPATHLEN];
@@ -505,7 +505,7 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags)
 
 
 static enum event_result parse_tracepoint_event(const char **strp,
-				    struct perf_counter_attr *attr)
+				    struct perf_event_attr *attr)
 {
 	const char *evt_name;
 	char *flags;
@@ -563,7 +563,7 @@ static int check_events(const char *str, unsigned int i)
 }
 
 static enum event_result
-parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
+parse_symbolic_event(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
 	unsigned int i;
@@ -582,7 +582,7 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
 }
 
 static enum event_result
-parse_raw_event(const char **strp, struct perf_counter_attr *attr)
+parse_raw_event(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
 	u64 config;
@@ -601,7 +601,7 @@ parse_raw_event(const char **strp, struct perf_counter_attr *attr)
 }
 
 static enum event_result
-parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
+parse_numeric_event(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
 	char *endp;
@@ -623,7 +623,7 @@ parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
 }
 
 static enum event_result
-parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
+parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
 	int eu = 1, ek = 1, eh = 1;
@@ -656,7 +656,7 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
  * Symbolic names are (almost) exactly matched.
  */
 static enum event_result
-parse_event_symbols(const char **str, struct perf_counter_attr *attr)
+parse_event_symbols(const char **str, struct perf_event_attr *attr)
 {
 	enum event_result ret;
 
@@ -711,7 +711,7 @@ static void store_event_type(const char *orgname)
 
 int parse_events(const struct option *opt __used, const char *str, int unset __used)
 {
-	struct perf_counter_attr attr;
+	struct perf_event_attr attr;
 	enum event_result ret;
 
 	if (strchr(str, ':'))
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 60704c15961f..30c608112845 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -16,7 +16,7 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
 
 extern int			nr_counters;
 
-extern struct perf_counter_attr attrs[MAX_COUNTERS];
+extern struct perf_event_attr attrs[MAX_COUNTERS];
 
 extern const char *event_name(int ctr);
 extern const char *__event_name(int type, u64 config);
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 1fd824c1f1c4..af4b0573b37f 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -480,12 +480,12 @@ out:
 }
 
 static struct tracepoint_path *
-get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters)
+get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
 {
 	struct tracepoint_path path, *ppath = &path;
 	int i;
 
-	for (i = 0; i < nb_counters; i++) {
+	for (i = 0; i < nb_events; i++) {
 		if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
 			continue;
 		ppath->next = tracepoint_id_to_path(pattrs[i].config);
@@ -496,7 +496,7 @@ get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters)
 
 	return path.next;
 }
-void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters)
+void read_tracing_data(struct perf_event_attr *pattrs, int nb_events)
 {
 	char buf[BUFSIZ];
 	struct tracepoint_path *tps;
@@ -530,7 +530,7 @@ void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters)
 	page_size = getpagesize();
 	write_or_die(&page_size, 4);
 
-	tps = get_tracepoints_path(pattrs, nb_counters);
+	tps = get_tracepoints_path(pattrs, nb_events);
 
 	read_header_files();
 	read_ftrace_files(tps);
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index d35ebf1e29ff..693f815c9429 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -240,6 +240,6 @@ unsigned long long
 raw_field_value(struct event *event, const char *name, void *data);
 void *raw_field_ptr(struct event *event, const char *name, void *data);
 
-void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);
+void read_tracing_data(struct perf_event_attr *pattrs, int nb_events);
 
 #endif /* _TRACE_EVENTS_H */
-- 
cgit v1.2.3-71-gd317


From 57c0c15b5244320065374ad2c54f4fbec77a6428 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 21 Sep 2009 12:20:38 +0200
Subject: perf: Tidy up after the big rename

 - provide compatibility Kconfig entry for existing PERF_COUNTERS .config's

 - provide courtesy copy of old perf_counter.h, for user-space projects

 - small indentation fixups

 - fix up MAINTAINERS

 - fix small x86 printout fallout

 - fix up small PowerPC comment fallout (use 'counter' as in register)

Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 MAINTAINERS                      |   2 +-
 arch/powerpc/include/asm/paca.h  |   2 +-
 arch/powerpc/kernel/perf_event.c |  12 +-
 arch/x86/kernel/cpu/perf_event.c |  14 +-
 include/linux/perf_counter.h     | 441 +++++++++++++++++++++++++++++++++++++++
 include/linux/perf_event.h       |  98 ++++-----
 init/Kconfig                     |  37 +++-
 kernel/perf_event.c              |   4 +-
 8 files changed, 534 insertions(+), 76 deletions(-)
 create mode 100644 include/linux/perf_counter.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 43761a00e3f1..751a307dc44e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4000,7 +4000,7 @@ S:	Maintained
 F:	include/linux/delayacct.h
 F:	kernel/delayacct.c
 
-PERFORMANCE COUNTER SUBSYSTEM
+PERFORMANCE EVENTS SUBSYSTEM
 M:	Peter Zijlstra <a.p.zijlstra@chello.nl>
 M:	Paul Mackerras <paulus@samba.org>
 M:	Ingo Molnar <mingo@elte.hu>
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 154f405b642f..7d8514ceceae 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -122,7 +122,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
-	u8 perf_event_pending;	/* PM interrupt while soft-disabled */
+	u8 perf_event_pending;		/* PM interrupt while soft-disabled */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index c98321fcb459..197b7d958796 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -41,7 +41,7 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 struct power_pmu *ppmu;
 
 /*
- * Normally, to ignore kernel events we set the FCS (freeze events
+ * Normally, to ignore kernel events we set the FCS (freeze counters
  * in supervisor mode) bit in MMCR0, but if the kernel runs with the
  * hypervisor bit set in the MSR, or if we are running on a processor
  * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
@@ -159,7 +159,7 @@ void perf_event_print_debug(void)
 }
 
 /*
- * Read one performance monitor event (PMC).
+ * Read one performance monitor counter (PMC).
  */
 static unsigned long read_pmc(int idx)
 {
@@ -409,7 +409,7 @@ static void power_pmu_read(struct perf_event *event)
 		val = read_pmc(event->hw.idx);
 	} while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
 
-	/* The events are only 32 bits wide */
+	/* The counters are only 32 bits wide */
 	delta = (val - prev) & 0xfffffffful;
 	atomic64_add(delta, &event->count);
 	atomic64_sub(delta, &event->hw.period_left);
@@ -543,7 +543,7 @@ void hw_perf_disable(void)
 		}
 
 		/*
-		 * Set the 'freeze events' bit.
+		 * Set the 'freeze counters' bit.
 		 * The barrier is to make sure the mtspr has been
 		 * executed and the PMU has frozen the events
 		 * before we return.
@@ -1124,7 +1124,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 }
 
 /*
- * A event has overflowed; update its count and record
+ * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
  * here so there is no possibility of being interrupted.
  */
@@ -1271,7 +1271,7 @@ static void perf_event_interrupt(struct pt_regs *regs)
 
 	/*
 	 * Reset MMCR0 to its normal value.  This will set PMXE and
-	 * clear FC (freeze events) and PMAO (perf mon alert occurred)
+	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
 	 * and thus allow interrupts to occur again.
 	 * XXX might want to use MSR.PM to keep the events frozen until
 	 * we get back out of this interrupt.
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0d03629fb1a5..a3c7adb06b78 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2081,13 +2081,13 @@ void __init init_hw_perf_events(void)
 	perf_events_lapic_init();
 	register_die_notifier(&perf_event_nmi_notifier);
 
-	pr_info("... version:                 %d\n",     x86_pmu.version);
-	pr_info("... bit width:               %d\n",     x86_pmu.event_bits);
-	pr_info("... generic events:        %d\n",     x86_pmu.num_events);
-	pr_info("... value mask:              %016Lx\n", x86_pmu.event_mask);
-	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
-	pr_info("... fixed-purpose events:  %d\n",     x86_pmu.num_events_fixed);
-	pr_info("... event mask:            %016Lx\n", perf_event_mask);
+	pr_info("... version:                %d\n",     x86_pmu.version);
+	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
+	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
+	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
+	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
+	pr_info("... event mask:             %016Lx\n", perf_event_mask);
 }
 
 static inline void x86_pmu_read(struct perf_event *event)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..368bd70f1d2d
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,441 @@
+/*
+ *  NOTE: this file will be removed in a future kernel release, it is
+ *  provided as a courtesy copy of user-space code that relies on the
+ *  old (pre-rename) symbols and constants.
+ *
+ *  Performance events:
+ *
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *    Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_COUNTER_H
+#define _LINUX_PERF_COUNTER_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <asm/byteorder.h>
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * attr.type
+ */
+enum perf_type_id {
+	PERF_TYPE_HARDWARE			= 0,
+	PERF_TYPE_SOFTWARE			= 1,
+	PERF_TYPE_TRACEPOINT			= 2,
+	PERF_TYPE_HW_CACHE			= 3,
+	PERF_TYPE_RAW				= 4,
+
+	PERF_TYPE_MAX,				/* non-ABI */
+};
+
+/*
+ * Generalized performance counter event types, used by the
+ * attr.event_id parameter of the sys_perf_counter_open()
+ * syscall:
+ */
+enum perf_hw_id {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES		= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
+
+	PERF_COUNT_HW_MAX,			/* non-ABI */
+};
+
+/*
+ * Generalized hardware cache counters:
+ *
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum perf_hw_cache_id {
+	PERF_COUNT_HW_CACHE_L1D			= 0,
+	PERF_COUNT_HW_CACHE_L1I			= 1,
+	PERF_COUNT_HW_CACHE_LL			= 2,
+	PERF_COUNT_HW_CACHE_DTLB		= 3,
+	PERF_COUNT_HW_CACHE_ITLB		= 4,
+	PERF_COUNT_HW_CACHE_BPU			= 5,
+
+	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
+};
+
+enum perf_hw_cache_op_id {
+	PERF_COUNT_HW_CACHE_OP_READ		= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
+
+	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
+};
+
+enum perf_hw_cache_op_result_id {
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
+	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
+
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
+};
+
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum perf_sw_ids {
+	PERF_COUNT_SW_CPU_CLOCK			= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+
+	PERF_COUNT_SW_MAX,			/* non-ABI */
+};
+
+/*
+ * Bits that can be set in attr.sample_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_sample_format {
+	PERF_SAMPLE_IP				= 1U << 0,
+	PERF_SAMPLE_TID				= 1U << 1,
+	PERF_SAMPLE_TIME			= 1U << 2,
+	PERF_SAMPLE_ADDR			= 1U << 3,
+	PERF_SAMPLE_READ			= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
+	PERF_SAMPLE_ID				= 1U << 6,
+	PERF_SAMPLE_CPU				= 1U << 7,
+	PERF_SAMPLE_PERIOD			= 1U << 8,
+	PERF_SAMPLE_STREAM_ID			= 1U << 9,
+	PERF_SAMPLE_RAW				= 1U << 10,
+
+	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
+};
+
+/*
+ * The format of the data returned by read() on a perf counter fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ *	{ u64		value;
+ *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		id;           } && PERF_FORMAT_ID
+ *	} && !PERF_FORMAT_GROUP
+ *
+ *	{ u64		nr;
+ *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		value;
+ *	    { u64	id;           } && PERF_FORMAT_ID
+ *	  }		cntr[nr];
+ *	} && PERF_FORMAT_GROUP
+ * };
+ */
+enum perf_counter_read_format {
+	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
+	PERF_FORMAT_ID				= 1U << 2,
+	PERF_FORMAT_GROUP			= 1U << 3,
+
+	PERF_FORMAT_MAX = 1U << 4, 		/* non-ABI */
+};
+
+#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_attr {
+
+	/*
+	 * Major type: hardware/software/tracepoint/etc.
+	 */
+	__u32			type;
+
+	/*
+	 * Size of the attr structure, for fwd/bwd compat.
+	 */
+	__u32			size;
+
+	/*
+	 * Type specific configuration information.
+	 */
+	__u64			config;
+
+	union {
+		__u64		sample_period;
+		__u64		sample_freq;
+	};
+
+	__u64			sample_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+				mmap           :  1, /* include mmap data     */
+				comm	       :  1, /* include comm data     */
+				freq           :  1, /* use freq, not period  */
+				inherit_stat   :  1, /* per task counts       */
+				enable_on_exec :  1, /* next exec enables     */
+				task           :  1, /* trace fork/exit       */
+				watermark      :  1, /* wakeup_watermark      */
+
+				__reserved_1   : 49;
+
+	union {
+		__u32		wakeup_events;	  /* wakeup every n events */
+		__u32		wakeup_watermark; /* bytes before wakeup   */
+	};
+	__u32			__reserved_2;
+
+	__u64			__reserved_3;
+};
+
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2)
+#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64)
+#define PERF_COUNTER_IOC_SET_OUTPUT	_IO ('$', 5)
+
+enum perf_counter_ioc_flags {
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+	__u32	version;		/* version number of this structure */
+	__u32	compat_version;		/* lowest version this is compat with */
+
+	/*
+	 * Bits needed to read the hw counters in user-space.
+	 *
+	 *   u32 seq;
+	 *   s64 count;
+	 *
+	 *   do {
+	 *     seq = pc->lock;
+	 *
+	 *     barrier()
+	 *     if (pc->index) {
+	 *       count = pmc_read(pc->index - 1);
+	 *       count += pc->offset;
+	 *     } else
+	 *       goto regular_read;
+	 *
+	 *     barrier();
+	 *   } while (pc->lock != seq);
+	 *
+	 * NOTE: for obvious reason this only works on self-monitoring
+	 *       processes.
+	 */
+	__u32	lock;			/* seqlock for synchronization */
+	__u32	index;			/* hardware counter identifier */
+	__s64	offset;			/* add to hardware counter value */
+	__u64	time_enabled;		/* time counter active */
+	__u64	time_running;		/* time counter on cpu */
+
+		/*
+		 * Hole for extension of the self monitor capabilities
+		 */
+
+	__u64	__reserved[123];	/* align to 1k */
+
+	/*
+	 * Control data for the mmap() data buffer.
+	 *
+	 * User-space reading the @data_head value should issue an rmb(), on
+	 * SMP capable platforms, after reading this value -- see
+	 * perf_counter_wakeup().
+	 *
+	 * When the mapping is PROT_WRITE the @data_tail value should be
+	 * written by userspace to reflect the last read data. In this case
+	 * the kernel will not over-write unread data.
+	 */
+	__u64   data_head;		/* head in the data section */
+	__u64	data_tail;		/* user-space written tail */
+};
+
+#define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_EVENT_MISC_CPUMODE_UNKNOWN		(0 << 0)
+#define PERF_EVENT_MISC_KERNEL			(1 << 0)
+#define PERF_EVENT_MISC_USER			(2 << 0)
+#define PERF_EVENT_MISC_HYPERVISOR		(3 << 0)
+
+struct perf_event_header {
+	__u32	type;
+	__u16	misc;
+	__u16	size;
+};
+
+enum perf_event_type {
+
+	/*
+	 * The MMAP events record the PROT_EXEC mappings so that we can
+	 * correlate userspace IPs to code. They have the following structure:
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	u64				addr;
+	 *	u64				len;
+	 *	u64				pgoff;
+	 *	char				filename[];
+	 * };
+	 */
+	PERF_EVENT_MMAP			= 1,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				id;
+	 *	u64				lost;
+	 * };
+	 */
+	PERF_EVENT_LOST			= 2,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	char				comm[];
+	 * };
+	 */
+	PERF_EVENT_COMM			= 3,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
+	 *	u32				tid, ptid;
+	 *	u64				time;
+	 * };
+	 */
+	PERF_EVENT_EXIT			= 4,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				time;
+	 *	u64				id;
+	 *	u64				stream_id;
+	 * };
+	 */
+	PERF_EVENT_THROTTLE		= 5,
+	PERF_EVENT_UNTHROTTLE		= 6,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
+	 *	u32				tid, ptid;
+	 *	{ u64				time;     } && PERF_SAMPLE_TIME
+	 * };
+	 */
+	PERF_EVENT_FORK			= 7,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, tid;
+	 *
+	 *	struct read_format		values;
+	 * };
+	 */
+	PERF_EVENT_READ			= 8,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
+	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
+	 *	{ u64			time;     } && PERF_SAMPLE_TIME
+	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
+	 *	{ u64			id;	  } && PERF_SAMPLE_ID
+	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
+	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
+	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
+	 *
+	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
+	 *
+	 *	{ u64			nr,
+	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *
+	 *	#
+	 *	# The RAW record below is opaque data wrt the ABI
+	 *	#
+	 *	# That is, the ABI doesn't make any promises wrt to
+	 *	# the stability of its content, it may vary depending
+	 *	# on event, hardware, kernel version and phase of
+	 *	# the moon.
+	 *	#
+	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 *	#
+	 *
+	 *	{ u32			size;
+	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
+	 * };
+	 */
+	PERF_EVENT_SAMPLE		= 9,
+
+	PERF_EVENT_MAX,			/* non-ABI */
+};
+
+enum perf_callchain_context {
+	PERF_CONTEXT_HV			= (__u64)-32,
+	PERF_CONTEXT_KERNEL		= (__u64)-128,
+	PERF_CONTEXT_USER		= (__u64)-512,
+
+	PERF_CONTEXT_GUEST		= (__u64)-2048,
+	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
+	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,
+
+	PERF_CONTEXT_MAX		= (__u64)-4095,
+};
+
+#define PERF_FLAG_FD_NO_GROUP		(1U << 0)
+#define PERF_FLAG_FD_OUTPUT		(1U << 1)
+
+/*
+ * In case some app still references the old symbols:
+ */
+
+#define __NR_perf_counter_open		__NR_perf_event_open
+
+#define PR_TASK_PERF_COUNTERS_DISABLE	PR_TASK_PERF_EVENTS_DISABLE
+#define PR_TASK_PERF_COUNTERS_ENABLE	PR_TASK_PERF_EVENTS_ENABLE
+
+#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ae9d9ed6df2a..acefaf71e6dd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1,15 +1,15 @@
 /*
- *  Performance events:
+ * Performance events:
  *
  *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
  *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
  *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
  *
- *  Data type definitions, declarations, prototypes.
+ * Data type definitions, declarations, prototypes.
  *
  *    Started by: Thomas Gleixner and Ingo Molnar
  *
- *  For licencing details see kernel-base/COPYING
+ * For licencing details see kernel-base/COPYING
  */
 #ifndef _LINUX_PERF_EVENT_H
 #define _LINUX_PERF_EVENT_H
@@ -131,19 +131,19 @@ enum perf_event_sample_format {
  * as specified by attr.read_format:
  *
  * struct read_format {
- * 	{ u64		value;
- * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
- * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
- * 	  { u64		id;           } && PERF_FORMAT_ID
- * 	} && !PERF_FORMAT_GROUP
+ *	{ u64		value;
+ *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		id;           } && PERF_FORMAT_ID
+ *	} && !PERF_FORMAT_GROUP
  *
- * 	{ u64		nr;
- * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
- * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
- * 	  { u64		value;
- * 	    { u64	id;           } && PERF_FORMAT_ID
- * 	  }		cntr[nr];
- * 	} && PERF_FORMAT_GROUP
+ *	{ u64		nr;
+ *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		value;
+ *	    { u64	id;           } && PERF_FORMAT_ID
+ *	  }		cntr[nr];
+ *	} && PERF_FORMAT_GROUP
  * };
  */
 enum perf_event_read_format {
@@ -152,7 +152,7 @@ enum perf_event_read_format {
 	PERF_FORMAT_ID				= 1U << 2,
 	PERF_FORMAT_GROUP			= 1U << 3,
 
-	PERF_FORMAT_MAX = 1U << 4, 		/* non-ABI */
+	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
 };
 
 #define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
@@ -216,8 +216,8 @@ struct perf_event_attr {
  * Ioctls that can be done on a perf event fd:
  */
 #define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
-#define PERF_EVENT_IOC_DISABLE	_IO ('$', 1)
-#define PERF_EVENT_IOC_REFRESH	_IO ('$', 2)
+#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
+#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
 #define PERF_EVENT_IOC_RESET		_IO ('$', 3)
 #define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, u64)
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
@@ -314,9 +314,9 @@ enum perf_event_type {
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u64				id;
-	 * 	u64				lost;
+	 *	struct perf_event_header	header;
+	 *	u64				id;
+	 *	u64				lost;
 	 * };
 	 */
 	PERF_RECORD_LOST			= 2,
@@ -383,23 +383,23 @@ enum perf_event_type {
 	 *	{ u64			id;	  } && PERF_SAMPLE_ID
 	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
-	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
+	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
 	 *
 	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
 	 *
 	 *	{ u64			nr,
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
 	 *
-	 * 	#
-	 * 	# The RAW record below is opaque data wrt the ABI
-	 * 	#
-	 * 	# That is, the ABI doesn't make any promises wrt to
-	 * 	# the stability of its content, it may vary depending
-	 * 	# on event_id, hardware, kernel version and phase of
-	 * 	# the moon.
-	 * 	#
-	 * 	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
-	 * 	#
+	 *	#
+	 *	# The RAW record below is opaque data wrt the ABI
+	 *	#
+	 *	# That is, the ABI doesn't make any promises wrt to
+	 *	# the stability of its content, it may vary depending
+	 *	# on event, hardware, kernel version and phase of
+	 *	# the moon.
+	 *	#
+	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 *	#
 	 *
 	 *	{ u32			size;
 	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
@@ -503,10 +503,10 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
-	PERF_EVENT_STATE_ERROR	= -2,
+	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
-	PERF_EVENT_STATE_ACTIVE	=  1,
+	PERF_EVENT_STATE_ACTIVE		=  1,
 };
 
 struct file;
@@ -529,7 +529,7 @@ struct perf_mmap_data {
 
 	long				watermark;	/* wakeup watermark  */
 
-	struct perf_event_mmap_page   *user_page;
+	struct perf_event_mmap_page	*user_page;
 	void				*data_pages[0];
 };
 
@@ -694,14 +694,14 @@ struct perf_cpu_context {
 };
 
 struct perf_output_handle {
-	struct perf_event	*event;
-	struct perf_mmap_data	*data;
-	unsigned long		head;
-	unsigned long		offset;
-	int			nmi;
-	int			sample;
-	int			locked;
-	unsigned long		flags;
+	struct perf_event		*event;
+	struct perf_mmap_data		*data;
+	unsigned long			head;
+	unsigned long			offset;
+	int				nmi;
+	int				sample;
+	int				locked;
+	unsigned long			flags;
 };
 
 #ifdef CONFIG_PERF_EVENTS
@@ -829,22 +829,22 @@ static inline void
 perf_event_task_sched_out(struct task_struct *task,
 			    struct task_struct *next, int cpu)		{ }
 static inline void
-perf_event_task_tick(struct task_struct *task, int cpu)		{ }
+perf_event_task_tick(struct task_struct *task, int cpu)			{ }
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
-static inline void perf_event_do_pending(void)			{ }
-static inline void perf_event_print_debug(void)			{ }
+static inline void perf_event_do_pending(void)				{ }
+static inline void perf_event_print_debug(void)				{ }
 static inline void perf_disable(void)					{ }
 static inline void perf_enable(void)					{ }
-static inline int perf_event_task_disable(void)	{ return -EINVAL; }
-static inline int perf_event_task_enable(void)	{ return -EINVAL; }
+static inline int perf_event_task_disable(void)				{ return -EINVAL; }
+static inline int perf_event_task_enable(void)				{ return -EINVAL; }
 
 static inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)			{ }
 
-static inline void perf_event_mmap(struct vm_area_struct *vma)	{ }
+static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
diff --git a/init/Kconfig b/init/Kconfig
index cfdf5c322806..706728be312f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -920,26 +920,31 @@ config HAVE_PERF_EVENTS
 	help
 	  See tools/perf/design.txt for details.
 
-menu "Performance Counters"
+menu "Kernel Performance Events And Counters"
 
 config PERF_EVENTS
-	bool "Kernel Performance Counters"
-	default y if PROFILING
+	bool "Kernel performance events and counters"
+	default y if (PROFILING || PERF_COUNTERS)
 	depends on HAVE_PERF_EVENTS
 	select ANON_INODES
 	help
-	  Enable kernel support for performance counter hardware.
+	  Enable kernel support for various performance events provided
+	  by software and hardware.
 
-	  Performance counters are special hardware registers available
-	  on most modern CPUs. These registers count the number of certain
+	  Software events are supported either build-in or via the
+	  use of generic tracepoints.
+
+	  Most modern CPUs support performance events via performance
+	  counter registers. These registers count the number of certain
 	  types of hw events: such as instructions executed, cachemisses
 	  suffered, or branches mis-predicted - without slowing down the
 	  kernel or applications. These registers can also trigger interrupts
 	  when a threshold number of events have passed - and can thus be
 	  used to profile the code that runs on that CPU.
 
-	  The Linux Performance Counter subsystem provides an abstraction of
-	  these hardware capabilities, available via a system call. It
+	  The Linux Performance Event subsystem provides an abstraction of
+	  these software and hardware cevent apabilities, available via a
+	  system call and used by the "perf" utility in tools/perf/. It
 	  provides per task and per CPU counters, and it provides event
 	  capabilities on top of those.
 
@@ -950,14 +955,26 @@ config EVENT_PROFILE
 	depends on PERF_EVENTS && EVENT_TRACING
 	default y
 	help
-	 Allow the use of tracepoints as software performance counters.
+	 Allow the use of tracepoints as software performance events.
 
-	 When this is enabled, you can create perf counters based on
+	 When this is enabled, you can create perf events based on
 	 tracepoints using PERF_TYPE_TRACEPOINT and the tracepoint ID
 	 found in debugfs://tracing/events/*/*/id. (The -e/--events
 	 option to the perf tool can parse and interpret symbolic
 	 tracepoints, in the subsystem:tracepoint_name format.)
 
+config PERF_COUNTERS
+	bool "Kernel performance counters (old config option)"
+	depends on HAVE_PERF_EVENTS
+	help
+	  This config has been obsoleted by the PERF_EVENTS
+	  config option - please see that one for details.
+
+	  It has no effect on the kernel whether you enable
+	  it or not, it is a compatibility placeholder.
+
+	  Say N if unsure.
+
 endmenu
 
 config VM_EVENT_COUNTERS
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6e8b99a04e1e..76ac4db405e9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1,12 +1,12 @@
 /*
- * Performance event core code
+ * Performance events core code:
  *
  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  *
- *  For licensing details see kernel-base/COPYING
+ * For licensing details see kernel-base/COPYING
  */
 
 #include <linux/fs.h>
-- 
cgit v1.2.3-71-gd317


From bcf56442429a15bdd6e1d81a9d4c89f93a44fdf7 Mon Sep 17 00:00:00 2001
From: GeunSik Lim <leemgs1@gmail.com>
Date: Tue, 16 Jun 2009 10:26:25 +0200
Subject: trivial: change address of the libcap source.

This is patch to change ftp site of the libcap source.
"ftp://linux.kernel.org" address does not exist.

Signed-off-by: GeunSik Lim <geunsik.lim@samsung.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/capability.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index c3021105edc0..c8f2a5f70ed5 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -7,7 +7,7 @@
  *
  * See here for the libcap library ("POSIX draft" compliance):
  *
- * ftp://linux.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
+ * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
  */
 
 #ifndef _LINUX_CAPABILITY_H
-- 
cgit v1.2.3-71-gd317


From 47a0dfaad9eb82b16193477cf99c2462af03c329 Mon Sep 17 00:00:00 2001
From: Ori Avtalion <ori@avtalion.name>
Date: Tue, 16 Jun 2009 12:17:53 +0300
Subject: trivial: fix typo in namei.h comment

Signed-off-by: Ori Avtalion <ori@avtalion.name>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/namei.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/namei.h b/include/linux/namei.h
index d870ae2faedc..ec0f607b364a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -40,7 +40,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
  *  - follow links at the end
  *  - require a directory
  *  - ending slashes ok even for nonexistent files
- *  - internal "there are more path compnents" flag
+ *  - internal "there are more path components" flag
  *  - locked when lookup done with dcache_lock held
  *  - dentry cache is untrusted; force a real lookup
  */
-- 
cgit v1.2.3-71-gd317


From b9049df5a0e7f35456c06b949b08b898b9c2e7bc Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Tue, 23 Jun 2009 12:09:29 +0200
Subject: Change "useing" -> "using".

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/workqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 6273fa97b527..7ef0c7b94f31 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -94,7 +94,7 @@ struct execute_work {
 /*
  * initialize all of a work item in one go
  *
- * NOTE! No point in using "atomic_long_set()": useing a direct
+ * NOTE! No point in using "atomic_long_set()": using a direct
  * assignment of the work data initializer allows the compiler
  * to generate better code.
  */
-- 
cgit v1.2.3-71-gd317


From 411c94038594b2a3fd123d09bdec3fe2500e383d Mon Sep 17 00:00:00 2001
From: Anand Gadiyar <gadiyar@ti.com>
Date: Tue, 7 Jul 2009 15:24:23 +0530
Subject: trivial: fix typo "for for" in multiple files

trivial: fix typo "for for" in multiple files

Signed-off-by: Anand Gadiyar <gadiyar@ti.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/filesystems/nfsroot.txt            | 2 +-
 Documentation/powerpc/dts-bindings/marvell.txt   | 2 +-
 arch/blackfin/mach-bf538/include/mach/defBF539.h | 2 +-
 arch/powerpc/kernel/udbg_16550.c                 | 2 +-
 arch/powerpc/platforms/powermac/udbg_scc.c       | 2 +-
 drivers/edac/edac_core.h                         | 2 +-
 drivers/infiniband/hw/ipath/ipath_iba6110.c      | 2 +-
 drivers/isdn/i4l/isdn_common.c                   | 4 ++--
 drivers/md/md.h                                  | 2 +-
 drivers/net/bnx2x_reg.h                          | 2 +-
 drivers/net/rionet.c                             | 2 +-
 drivers/usb/host/ehci-pci.c                      | 2 +-
 drivers/usb/host/ehci.h                          | 2 +-
 fs/xfs/xfs_fs.h                                  | 2 +-
 include/linux/usb.h                              | 2 +-
 15 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt
index 68baddf3c3e0..3ba0b945aaf8 100644
--- a/Documentation/filesystems/nfsroot.txt
+++ b/Documentation/filesystems/nfsroot.txt
@@ -105,7 +105,7 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
 		the client address and this parameter is NOT empty only
 		replies from the specified server are accepted.
 
-		Only required for for NFS root. That is autoconfiguration
+		Only required for NFS root. That is autoconfiguration
 		will not be triggered if it is missing and NFS root is not
 		in operation.
 
diff --git a/Documentation/powerpc/dts-bindings/marvell.txt b/Documentation/powerpc/dts-bindings/marvell.txt
index 3708a2fd4747..f1533d91953a 100644
--- a/Documentation/powerpc/dts-bindings/marvell.txt
+++ b/Documentation/powerpc/dts-bindings/marvell.txt
@@ -32,7 +32,7 @@ prefixed with the string "marvell,", for Marvell Technology Group Ltd.
       devices.  This field represents the number of cells needed to
       represent the address of the memory-mapped registers of devices
       within the system controller chip.
-    - #size-cells : Size representation for for the memory-mapped
+    - #size-cells : Size representation for the memory-mapped
       registers within the system controller chip.
     - #interrupt-cells : Defines the width of cells used to represent
       interrupts.
diff --git a/arch/blackfin/mach-bf538/include/mach/defBF539.h b/arch/blackfin/mach-bf538/include/mach/defBF539.h
index bdc330cd0e1c..1c58914a8740 100644
--- a/arch/blackfin/mach-bf538/include/mach/defBF539.h
+++ b/arch/blackfin/mach-bf538/include/mach/defBF539.h
@@ -2325,7 +2325,7 @@
 #define	AMBEN_B0_B1	0x0004	/* Enable Asynchronous Memory Banks 0 &	1 only */
 #define	AMBEN_B0_B1_B2	0x0006	/* Enable Asynchronous Memory Banks 0, 1, and 2 */
 #define	AMBEN_ALL	0x0008	/* Enable Asynchronous Memory Banks (all) 0, 1,	2, and 3 */
-#define	CDPRIO		0x0100	/* DMA has priority over core for for external accesses */
+#define	CDPRIO		0x0100	/* DMA has priority over core for external accesses */
 
 /* EBIU_AMGCTL Bit Positions */
 #define	AMCKEN_P		0x0000	/* Enable CLKOUT */
diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c
index acb74a17bbbf..b4b167b33643 100644
--- a/arch/powerpc/kernel/udbg_16550.c
+++ b/arch/powerpc/kernel/udbg_16550.c
@@ -1,5 +1,5 @@
 /*
- * udbg for for NS16550 compatable serial ports
+ * udbg for NS16550 compatable serial ports
  *
  * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp
  *
diff --git a/arch/powerpc/platforms/powermac/udbg_scc.c b/arch/powerpc/platforms/powermac/udbg_scc.c
index 572771fd8463..9490157da62e 100644
--- a/arch/powerpc/platforms/powermac/udbg_scc.c
+++ b/arch/powerpc/platforms/powermac/udbg_scc.c
@@ -1,5 +1,5 @@
 /*
- * udbg for for zilog scc ports as found on Apple PowerMacs
+ * udbg for zilog scc ports as found on Apple PowerMacs
  *
  * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp
  *
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index 871c13b4c148..12f355cafdbe 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -286,7 +286,7 @@ enum scrub_type {
  *			is irrespective of the memory devices being mounted
  *			on both sides of the memory stick.
  *
- * Socket set:		All of the memory sticks that are required for for
+ * Socket set:		All of the memory sticks that are required for
  *			a single memory access or all of the memory sticks
  *			spanned by a chip-select row.  A single socket set
  *			has two chip-select rows and if double-sided sticks
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
index 02831ad070b8..4bd39c8af80f 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6110.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c
@@ -809,7 +809,7 @@ static int ipath_setup_ht_reset(struct ipath_devdata *dd)
  * errors.  We only bother to do this at load time, because it's OK if
  * it happened before we were loaded (first time after boot/reset),
  * but any time after that, it's fatal anyway.  Also need to not check
- * for for upper byte errors if we are in 8 bit mode, so figure out
+ * for upper byte errors if we are in 8 bit mode, so figure out
  * our width.  For now, at least, also complain if it's 8 bit.
  */
 static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 7188c59a76ff..adb1e8c36b46 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -761,7 +761,7 @@ isdn_getnum(char **p)
  * Be aware that this is not an atomic operation when sleep != 0, even though 
  * interrupts are turned off! Well, like that we are currently only called
  * on behalf of a read system call on raw device files (which are documented
- * to be dangerous and for for debugging purpose only). The inode semaphore
+ * to be dangerous and for debugging purpose only). The inode semaphore
  * takes care that this is not called for the same minor device number while
  * we are sleeping, but access is not serialized against simultaneous read()
  * from the corresponding ttyI device. Can other ugly events, like changes
@@ -873,7 +873,7 @@ isdn_readbchan(int di, int channel, u_char * buf, u_char * fp, int len, wait_que
  * Be aware that this is not an atomic operation when sleep != 0, even though
  * interrupts are turned off! Well, like that we are currently only called
  * on behalf of a read system call on raw device files (which are documented
- * to be dangerous and for for debugging purpose only). The inode semaphore
+ * to be dangerous and for debugging purpose only). The inode semaphore
  * takes care that this is not called for the same minor device number while
  * we are sleeping, but access is not serialized against simultaneous read()
  * from the corresponding ttyI device. Can other ugly events, like changes
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f8fc188bc762..f55d2ff95133 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -201,7 +201,7 @@ struct mddev_s
 	 * INTR:     resync needs to be aborted for some reason
 	 * DONE:     thread is done and is waiting to be reaped
 	 * REQUEST:  user-space has requested a sync (used with SYNC)
-	 * CHECK:    user-space request for for check-only, no repair
+	 * CHECK:    user-space request for check-only, no repair
 	 * RESHAPE:  A reshape is happening
 	 *
 	 * If neither SYNC or RESHAPE are set, then it is a recovery.
diff --git a/drivers/net/bnx2x_reg.h b/drivers/net/bnx2x_reg.h
index 0695be14cf91..aa76cbada5e2 100644
--- a/drivers/net/bnx2x_reg.h
+++ b/drivers/net/bnx2x_reg.h
@@ -3122,7 +3122,7 @@
    The fields are:[4:0] - tail pointer; [10:5] - Link List size; 15:11] -
    header pointer. */
 #define TCM_REG_XX_TABLE					 0x50240
-/* [RW 4] Load value for for cfc ac credit cnt. */
+/* [RW 4] Load value for cfc ac credit cnt. */
 #define TM_REG_CFC_AC_CRDCNT_VAL				 0x164208
 /* [RW 4] Load value for cfc cld credit cnt. */
 #define TM_REG_CFC_CLD_CRDCNT_VAL				 0x164210
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index bc98e7f69ee9..ede937ee50c7 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -72,7 +72,7 @@ static int rionet_check = 0;
 static int rionet_capable = 1;
 
 /*
- * This is a fast lookup table for for translating TX
+ * This is a fast lookup table for translating TX
  * Ethernet packets into a destination RIO device. It
  * could be made into a hash table to save memory depending
  * on system trade-offs.
diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c
index c2f1b7df918c..b5b83c43898a 100644
--- a/drivers/usb/host/ehci-pci.c
+++ b/drivers/usb/host/ehci-pci.c
@@ -242,7 +242,7 @@ static int ehci_pci_setup(struct usb_hcd *hcd)
 	 * System suspend currently expects to be able to suspend the entire
 	 * device tree, device-at-a-time.  If we failed selective suspend
 	 * reports, system suspend would fail; so the root hub code must claim
-	 * success.  That's lying to usbcore, and it matters for for runtime
+	 * success.  That's lying to usbcore, and it matters for runtime
 	 * PM scenarios with selective suspend and remote wakeup...
 	 */
 	if (ehci->no_selective_suspend && device_can_wakeup(&pdev->dev))
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index 2bfff30f4704..48b9e889a18b 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -37,7 +37,7 @@ typedef __u16 __bitwise __hc16;
 #define __hc16	__le16
 #endif
 
-/* statistics can be kept for for tuning/monitoring */
+/* statistics can be kept for tuning/monitoring */
 struct ehci_stats {
 	/* irq usage */
 	unsigned long		normal;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c4ea51b55dce..f52ac276277e 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -117,7 +117,7 @@ struct getbmapx {
 #define BMV_IF_VALID	\
 	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
 
-/*	bmv_oflags values - returned for for each non-header segment */
+/*	bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
 #define BMV_OF_DELALLOC		0x2	/* segment = delayed allocation */
 #define BMV_OF_LAST		0x4	/* segment is the last in the file */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index a8fe05f224e5..19fabc487beb 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1071,7 +1071,7 @@ typedef void (*usb_complete_t)(struct urb *);
  * @start_frame: Returns the initial frame for isochronous transfers.
  * @number_of_packets: Lists the number of ISO transfer buffers.
  * @interval: Specifies the polling interval for interrupt or isochronous
- *	transfers.  The units are frames (milliseconds) for for full and low
+ *	transfers.  The units are frames (milliseconds) for full and low
  *	speed devices, and microframes (1/8 millisecond) for highspeed ones.
  * @error_count: Returns the number of ISO transfers that reported errors.
  * @context: For use in completion functions.  This normally points to
-- 
cgit v1.2.3-71-gd317


From 3dbda77e6f3375f87090cfce97b2551d3723521b Mon Sep 17 00:00:00 2001
From: Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
Date: Thu, 23 Jul 2009 08:31:31 +0200
Subject: trivial: fix typos "man[ae]g?ment" -> "management"

Signed-off-by: Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/DocBook/mtdnand.tmpl           | 2 +-
 Documentation/DocBook/scsi.tmpl              | 2 +-
 Documentation/scsi/ChangeLog.megaraid        | 2 +-
 Documentation/sound/alsa/HD-Audio-Models.txt | 2 +-
 Documentation/trace/ftrace.txt               | 2 +-
 arch/arm/Makefile                            | 2 +-
 arch/frv/lib/cache.S                         | 2 +-
 arch/mn10300/include/asm/cacheflush.h        | 4 ++--
 drivers/media/dvb/siano/smscoreapi.c         | 2 +-
 drivers/media/dvb/siano/smscoreapi.h         | 4 ++--
 drivers/media/radio/radio-mr800.c            | 2 +-
 drivers/message/fusion/mptbase.c             | 4 ++--
 drivers/net/macb.c                           | 2 +-
 drivers/net/wireless/ath/ath5k/reg.h         | 2 +-
 drivers/net/wireless/atmel.c                 | 2 +-
 drivers/usb/host/xhci.h                      | 2 +-
 drivers/usb/wusbcore/wa-hc.h                 | 2 +-
 include/linux/mISDNif.h                      | 2 +-
 18 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 8e145857fc9d..df0d089d0fb9 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -568,7 +568,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
 			<para>
 				The blocks in which the tables are stored are procteted against
 				accidental access by marking them bad in the memory bad block
-				table. The bad block table managment functions are allowed
+				table. The bad block table management functions are allowed
 				to circumvernt this protection.
 			</para>
 			<para>
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl
index 10a150ae2a7e..d87f4569e768 100644
--- a/Documentation/DocBook/scsi.tmpl
+++ b/Documentation/DocBook/scsi.tmpl
@@ -317,7 +317,7 @@
         <para>
           The SAS transport class contains common code to deal with SAS HBAs,
           an aproximated representation of SAS topologies in the driver model,
-          and various sysfs attributes to expose these topologies and managment
+          and various sysfs attributes to expose these topologies and management
           interfaces to userspace.
         </para>
         <para>
diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid
index eaa4801f2ce6..38e9e7cadc90 100644
--- a/Documentation/scsi/ChangeLog.megaraid
+++ b/Documentation/scsi/ChangeLog.megaraid
@@ -514,7 +514,7 @@ iv.	Remove yield() while mailbox handshake in synchronous commands
 
 v.	Remove redundant __megaraid_busywait_mbox routine
 
-vi.	Fix bug in the managment module, which causes a system lockup when the
+vi.	Fix bug in the management module, which causes a system lockup when the
 	IO module is loaded and then unloaded, followed by executing any
 	management utility. The current version of management module does not
 	handle the adapter unregister properly.
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index 97eebd63bedc..f1708b79f963 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -387,7 +387,7 @@ STAC92HD73*
 STAC92HD83*
 ===========
   ref		Reference board
-  mic-ref	Reference board with power managment for ports
+  mic-ref	Reference board with power management for ports
   dell-s14	Dell laptop
   auto		BIOS setup (default)
 
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 1b6292bbdd6d..957b22fde2df 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -133,7 +133,7 @@ of ftrace. Here is a list of some of the key files:
 	than requested, the rest of the page will be used,
 	making the actual allocation bigger than requested.
 	( Note, the size may not be a multiple of the page size
-	  due to buffer managment overhead. )
+	  due to buffer management overhead. )
 
 	This can only be updated when the current_tracer
 	is set to "nop".
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 7350557a81e0..54661125a8bf 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -25,7 +25,7 @@ KBUILD_CFLAGS	+=$(call cc-option,-marm,)
 # Select a platform tht is kept up-to-date
 KBUILD_DEFCONFIG := versatile_defconfig
 
-# defines filename extension depending memory manement type.
+# defines filename extension depending memory management type.
 ifeq ($(CONFIG_MMU),)
 MMUEXT		:= -nommu
 endif
diff --git a/arch/frv/lib/cache.S b/arch/frv/lib/cache.S
index 0e10ad8dc462..0c4fb204911b 100644
--- a/arch/frv/lib/cache.S
+++ b/arch/frv/lib/cache.S
@@ -1,4 +1,4 @@
-/* cache.S: cache managment routines
+/* cache.S: cache management routines
  *
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
diff --git a/arch/mn10300/include/asm/cacheflush.h b/arch/mn10300/include/asm/cacheflush.h
index 2db746a251f8..1a55d61f0d06 100644
--- a/arch/mn10300/include/asm/cacheflush.h
+++ b/arch/mn10300/include/asm/cacheflush.h
@@ -17,7 +17,7 @@
 #include <linux/mm.h>
 
 /*
- * virtually-indexed cache managment (our cache is physically indexed)
+ * virtually-indexed cache management (our cache is physically indexed)
  */
 #define flush_cache_all()			do {} while (0)
 #define flush_cache_mm(mm)			do {} while (0)
@@ -31,7 +31,7 @@
 #define flush_dcache_mmap_unlock(mapping)	do {} while (0)
 
 /*
- * physically-indexed cache managment
+ * physically-indexed cache management
  */
 #ifndef CONFIG_MN10300_CACHE_DISABLED
 
diff --git a/drivers/media/dvb/siano/smscoreapi.c b/drivers/media/dvb/siano/smscoreapi.c
index bd9ab9d0d12a..fa6a62369a78 100644
--- a/drivers/media/dvb/siano/smscoreapi.c
+++ b/drivers/media/dvb/siano/smscoreapi.c
@@ -1367,7 +1367,7 @@ int smscore_set_gpio(struct smscore_device_t *coredev, u32 pin, int level)
 					    &msg, sizeof(msg));
 }
 
-/* new GPIO managment implementation */
+/* new GPIO management implementation */
 static int GetGpioPinParams(u32 PinNum, u32 *pTranslatedPinNum,
 		u32 *pGroupNum, u32 *pGroupCfg) {
 
diff --git a/drivers/media/dvb/siano/smscoreapi.h b/drivers/media/dvb/siano/smscoreapi.h
index f1108c64e895..eec18aaf5512 100644
--- a/drivers/media/dvb/siano/smscoreapi.h
+++ b/drivers/media/dvb/siano/smscoreapi.h
@@ -657,12 +657,12 @@ struct smscore_buffer_t *smscore_getbuffer(struct smscore_device_t *coredev);
 extern void smscore_putbuffer(struct smscore_device_t *coredev,
 			      struct smscore_buffer_t *cb);
 
-/* old GPIO managment */
+/* old GPIO management */
 int smscore_configure_gpio(struct smscore_device_t *coredev, u32 pin,
 			   struct smscore_config_gpio *pinconfig);
 int smscore_set_gpio(struct smscore_device_t *coredev, u32 pin, int level);
 
-/* new GPIO managment */
+/* new GPIO management */
 extern int smscore_gpio_configure(struct smscore_device_t *coredev, u8 PinNum,
 		struct smscore_gpio_config *pGpioConfig);
 extern int smscore_gpio_set_level(struct smscore_device_t *coredev, u8 PinNum,
diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c
index 575bf9d89419..a1239083472d 100644
--- a/drivers/media/radio/radio-mr800.c
+++ b/drivers/media/radio/radio-mr800.c
@@ -46,7 +46,7 @@
  * Version 0.11:	Converted to v4l2_device.
  *
  * Many things to do:
- * 	- Correct power managment of device (suspend & resume)
+ * 	- Correct power management of device (suspend & resume)
  * 	- Add code for scanning and smooth tuning
  * 	- Add code for sensitivity value
  * 	- Correct mistakes
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index 76fa2ee0b574..610e914abe6c 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -6821,7 +6821,7 @@ mpt_print_ioc_summary(MPT_ADAPTER *ioc, char *buffer, int *size, int len, int sh
 	*size = y;
 }
 /**
- *	mpt_set_taskmgmt_in_progress_flag - set flags associated with task managment
+ *	mpt_set_taskmgmt_in_progress_flag - set flags associated with task management
  *	@ioc: Pointer to MPT_ADAPTER structure
  *
  *	Returns 0 for SUCCESS or -1 if FAILED.
@@ -6854,7 +6854,7 @@ mpt_set_taskmgmt_in_progress_flag(MPT_ADAPTER *ioc)
 EXPORT_SYMBOL(mpt_set_taskmgmt_in_progress_flag);
 
 /**
- *	mpt_clear_taskmgmt_in_progress_flag - clear flags associated with task managment
+ *	mpt_clear_taskmgmt_in_progress_flag - clear flags associated with task management
  *	@ioc: Pointer to MPT_ADAPTER structure
  *
  **/
diff --git a/drivers/net/macb.c b/drivers/net/macb.c
index fb65b427c692..1d0d4d9ab623 100644
--- a/drivers/net/macb.c
+++ b/drivers/net/macb.c
@@ -241,7 +241,7 @@ static int macb_mii_init(struct macb *bp)
 	struct eth_platform_data *pdata;
 	int err = -ENXIO, i;
 
-	/* Enable managment port */
+	/* Enable management port */
 	macb_writel(bp, NCR, MACB_BIT(MPE));
 
 	bp->mii_bus = mdiobus_alloc();
diff --git a/drivers/net/wireless/ath/ath5k/reg.h b/drivers/net/wireless/ath/ath5k/reg.h
index debad07d9900..c63ea6afd96f 100644
--- a/drivers/net/wireless/ath/ath5k/reg.h
+++ b/drivers/net/wireless/ath/ath5k/reg.h
@@ -982,7 +982,7 @@
 #define AR5K_5414_CBCFG_BUF_DIS	0x10	/* Disable buffer */
 
 /*
- * PCI-E Power managment configuration
+ * PCI-E Power management configuration
  * and status register [5424+]
  */
 #define	AR5K_PCIE_PM_CTL		0x4068			/* Register address */
diff --git a/drivers/net/wireless/atmel.c b/drivers/net/wireless/atmel.c
index a3b36b3a9d67..cce188837d10 100644
--- a/drivers/net/wireless/atmel.c
+++ b/drivers/net/wireless/atmel.c
@@ -3330,7 +3330,7 @@ static void atmel_smooth_qual(struct atmel_private *priv)
 	priv->wstats.qual.updated &= ~IW_QUAL_QUAL_INVALID;
 }
 
-/* deals with incoming managment frames. */
+/* deals with incoming management frames. */
 static void atmel_management_frame(struct atmel_private *priv,
 				   struct ieee80211_hdr *header,
 				   u16 frame_len, u8 rssi)
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index d31d32206ba3..ffe1625d4e1b 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1150,7 +1150,7 @@ void xhci_dbg_cmd_ptrs(struct xhci_hcd *xhci);
 void xhci_dbg_ring_ptrs(struct xhci_hcd *xhci, struct xhci_ring *ring);
 void xhci_dbg_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx, unsigned int last_ep);
 
-/* xHCI memory managment */
+/* xHCI memory management */
 void xhci_mem_cleanup(struct xhci_hcd *xhci);
 int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags);
 void xhci_free_virt_device(struct xhci_hcd *xhci, int slot_id);
diff --git a/drivers/usb/wusbcore/wa-hc.h b/drivers/usb/wusbcore/wa-hc.h
index 586d350cdb4d..d6bea3e0b54a 100644
--- a/drivers/usb/wusbcore/wa-hc.h
+++ b/drivers/usb/wusbcore/wa-hc.h
@@ -47,7 +47,7 @@
  *             to an endpoint on a WUSB device that is connected to a
  *             HWA RC.
  *
- *  xfer       Transfer managment -- this is all the code that gets a
+ *  xfer       Transfer management -- this is all the code that gets a
  *             buffer and pushes it to a device (or viceversa). *
  *
  * Some day a lot of this code will be shared between this driver and
diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h
index 536ca12442ca..78c3bed1c3f5 100644
--- a/include/linux/mISDNif.h
+++ b/include/linux/mISDNif.h
@@ -104,7 +104,7 @@
 #define DL_UNITDATA_IND		0x3108
 #define DL_INFORMATION_IND	0x0008
 
-/* intern layer 2 managment */
+/* intern layer 2 management */
 #define MDL_ASSIGN_REQ		0x1804
 #define MDL_ASSIGN_IND		0x1904
 #define MDL_REMOVE_REQ		0x1A04
-- 
cgit v1.2.3-71-gd317


From 181f7c5dd3832763bdf2756b6d2d8a49bdf12791 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 6 Jul 2009 11:53:03 +0200
Subject: kmemcheck: add missing braces to do-while in
 kmemcheck_annotate_bitfield

Whether or not the sparse warning

	warning: do-while statement is not a compound statement

is justified or not in this case, it is annoying and trivial to fix.

[vegard.nossum@gmail.com: title and cleanup]
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
---
 include/linux/kmemcheck.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
index 47b39b7c7e84..06c6c5501f13 100644
--- a/include/linux/kmemcheck.h
+++ b/include/linux/kmemcheck.h
@@ -137,7 +137,10 @@ static inline void kmemcheck_mark_initialized_pages(struct page *p,
 	int name##_end[0];
 
 #define kmemcheck_annotate_bitfield(ptr, name)				\
-	do if (ptr) {							\
+	do {								\
+		if (!ptr)						\
+			break;						\
+									\
 		int _n = (long) &((ptr)->name##_end)			\
 			- (long) &((ptr)->name##_begin);		\
 		BUILD_BUG_ON(_n < 0);					\
-- 
cgit v1.2.3-71-gd317


From 40f9244f4da8976eeb6d5ed6313c635ba238a9d3 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 17 Jun 2009 17:56:39 +0100
Subject: regulator: Allow consumer supplies to be set up with dev_name()

Follow the approach suggested by Russell King and implemented by him in
the clkdev API and allow consumer device supply mapings to be set up
using the dev_name() for the consumer instead of the struct device.
In order to avoid making existing machines instabuggy and creating merge
issues the use of struct device is still supported for the time being.

This resolves problems working with buses such as I2C which make the
struct device available late providing that the final device name is
known, which is the case for most embedded systems with fixed setups.

Consumers must still use the struct device when calling regulator_get().

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 drivers/regulator/core.c          | 62 ++++++++++++++++++++++++++++++---------
 include/linux/regulator/machine.h |  7 ++++-
 2 files changed, 54 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 91ba9bfaa706..24e05b7607b4 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -37,7 +37,7 @@ static int has_full_constraints;
  */
 struct regulator_map {
 	struct list_head list;
-	struct device *dev;
+	const char *dev_name;   /* The dev_name() for the consumer */
 	const char *supply;
 	struct regulator_dev *regulator;
 };
@@ -857,23 +857,33 @@ out:
  * set_consumer_device_supply: Bind a regulator to a symbolic supply
  * @rdev:         regulator source
  * @consumer_dev: device the supply applies to
+ * @consumer_dev_name: dev_name() string for device supply applies to
  * @supply:       symbolic name for supply
  *
  * Allows platform initialisation code to map physical regulator
  * sources to symbolic names for supplies for use by devices.  Devices
  * should use these symbolic names to request regulators, avoiding the
  * need to provide board-specific regulator names as platform data.
+ *
+ * Only one of consumer_dev and consumer_dev_name may be specified.
  */
 static int set_consumer_device_supply(struct regulator_dev *rdev,
-	struct device *consumer_dev, const char *supply)
+	struct device *consumer_dev, const char *consumer_dev_name,
+	const char *supply)
 {
 	struct regulator_map *node;
 
+	if (consumer_dev && consumer_dev_name)
+		return -EINVAL;
+
+	if (!consumer_dev_name && consumer_dev)
+		consumer_dev_name = dev_name(consumer_dev);
+
 	if (supply == NULL)
 		return -EINVAL;
 
 	list_for_each_entry(node, &regulator_map_list, list) {
-		if (consumer_dev != node->dev)
+		if (consumer_dev_name != node->dev_name)
 			continue;
 		if (strcmp(node->supply, supply) != 0)
 			continue;
@@ -891,25 +901,38 @@ static int set_consumer_device_supply(struct regulator_dev *rdev,
 		return -ENOMEM;
 
 	node->regulator = rdev;
-	node->dev = consumer_dev;
+	node->dev_name = kstrdup(consumer_dev_name, GFP_KERNEL);
 	node->supply = supply;
 
+	if (node->dev_name == NULL) {
+		kfree(node);
+		return -ENOMEM;
+	}
+
 	list_add(&node->list, &regulator_map_list);
 	return 0;
 }
 
 static void unset_consumer_device_supply(struct regulator_dev *rdev,
-	struct device *consumer_dev)
+	const char *consumer_dev_name, struct device *consumer_dev)
 {
 	struct regulator_map *node, *n;
 
+	if (consumer_dev && !consumer_dev_name)
+		consumer_dev_name = dev_name(consumer_dev);
+
 	list_for_each_entry_safe(node, n, &regulator_map_list, list) {
-		if (rdev == node->regulator &&
-			consumer_dev == node->dev) {
-			list_del(&node->list);
-			kfree(node);
-			return;
-		}
+		if (rdev != node->regulator)
+			continue;
+
+		if (consumer_dev_name && node->dev_name &&
+		    strcmp(consumer_dev_name, node->dev_name))
+			continue;
+
+		list_del(&node->list);
+		kfree(node->dev_name);
+		kfree(node);
+		return;
 	}
 }
 
@@ -920,6 +943,7 @@ static void unset_regulator_supplies(struct regulator_dev *rdev)
 	list_for_each_entry_safe(node, n, &regulator_map_list, list) {
 		if (rdev == node->regulator) {
 			list_del(&node->list);
+			kfree(node->dev_name);
 			kfree(node);
 			return;
 		}
@@ -1019,17 +1043,25 @@ struct regulator *regulator_get(struct device *dev, const char *id)
 	struct regulator_dev *rdev;
 	struct regulator_map *map;
 	struct regulator *regulator = ERR_PTR(-ENODEV);
+	const char *devname = NULL;
 
 	if (id == NULL) {
 		printk(KERN_ERR "regulator: get() with no identifier\n");
 		return regulator;
 	}
 
+	if (dev)
+		devname = dev_name(dev);
+
 	mutex_lock(&regulator_list_mutex);
 
 	list_for_each_entry(map, &regulator_map_list, list) {
-		if (dev == map->dev &&
-		    strcmp(map->supply, id) == 0) {
+		/* If the mapping has a device set up it must match */
+		if (map->dev_name &&
+		    (!devname || strcmp(map->dev_name, devname)))
+			continue;
+
+		if (strcmp(map->supply, id) == 0) {
 			rdev = map->regulator;
 			goto found;
 		}
@@ -2091,11 +2123,13 @@ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
 	for (i = 0; i < init_data->num_consumer_supplies; i++) {
 		ret = set_consumer_device_supply(rdev,
 			init_data->consumer_supplies[i].dev,
+			init_data->consumer_supplies[i].dev_name,
 			init_data->consumer_supplies[i].supply);
 		if (ret < 0) {
 			for (--i; i >= 0; i--)
 				unset_consumer_device_supply(rdev,
-					init_data->consumer_supplies[i].dev);
+				    init_data->consumer_supplies[i].dev_name,
+				    init_data->consumer_supplies[i].dev);
 			goto scrub;
 		}
 	}
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index bac64fa390f2..9328090eca20 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -126,13 +126,18 @@ struct regulation_constraints {
 /**
  * struct regulator_consumer_supply - supply -> device mapping
  *
- * This maps a supply name to a device.
+ * This maps a supply name to a device.  Only one of dev or dev_name
+ * can be specified.  Use of dev_name allows support for buses which
+ * make struct device available late such as I2C and is the preferred
+ * form.
  *
  * @dev: Device structure for the consumer.
+ * @dev_name: Result of dev_name() for the consumer.
  * @supply: Name for the supply.
  */
 struct regulator_consumer_supply {
 	struct device *dev;	/* consumer */
+	const char *dev_name;   /* dev_name() for consumer */
 	const char *supply;	/* consumer supply - e.g. "vcc" */
 };
 
-- 
cgit v1.2.3-71-gd317


From 9c19bc0444490e76197f47316c649590dc6f10a4 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Thu, 9 Jul 2009 15:44:31 +0100
Subject: regulator: Define full constraints function with REGULATOR disabled

This allows machine drivers to build without ifdefs if they have
full constraints. Suggested by machine drivers contributed by
Haojian Zhuang <haojian.zhuang@gmail.com>.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 include/linux/regulator/machine.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 9328090eca20..73a88f6cbb1c 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -171,6 +171,12 @@ struct regulator_init_data {
 
 int regulator_suspend_prepare(suspend_state_t state);
 
+#ifdef CONFIG_REGULATOR
 void regulator_has_full_constraints(void);
+#else
+static inline void regulator_has_full_constraints(void)
+{
+}
+#endif
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 0198d1163b3e0313b3f073b62384abfab1a17cff Mon Sep 17 00:00:00 2001
From: Haojian Zhuang <haojian.zhuang@marvell.com>
Date: Fri, 26 Jun 2009 19:20:59 +0800
Subject: regulator: add buck3 in da903x driver

BUCK3 is the new component in DA9035. So there're three BUCKs in DA9035.
And there're two BUCKs in DA9034.

Signed-off-by: Haojian Zhuang <haojian.zhuang@marvell.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Eric Miao <eric.y.miao@gmail.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 drivers/regulator/da903x.c | 32 ++++++++++++++++++++++++++++++++
 include/linux/mfd/da903x.h |  4 +++-
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/da903x.c b/drivers/regulator/da903x.c
index b8b89ef10a84..33dfeeb9407c 100644
--- a/drivers/regulator/da903x.c
+++ b/drivers/regulator/da903x.c
@@ -64,6 +64,14 @@
 #define DA9034_MDTV2		(0x33)
 #define DA9034_MVRC		(0x34)
 
+/* DA9035 Registers. DA9034 Registers are comptabile to DA9035. */
+#define DA9035_OVER3		(0x12)
+#define DA9035_VCC2		(0x1f)
+#define DA9035_3DTV1		(0x2c)
+#define DA9035_3DTV2		(0x2d)
+#define DA9035_3VRC		(0x2e)
+#define DA9035_AUTOSKIP		(0x2f)
+
 struct da903x_regulator_info {
 	struct regulator_desc desc;
 
@@ -388,6 +396,27 @@ static struct regulator_ops da9034_regulator_ldo12_ops = {
 	.enable_bit	= (ebit),					\
 }
 
+#define DA9035_DVC(_id, min, max, step, vreg, nbits, ureg, ubit, ereg, ebit) \
+{									\
+	.desc	= {							\
+		.name	= #_id,						\
+		.ops	= &da9034_regulator_dvc_ops,			\
+		.type	= REGULATOR_VOLTAGE,				\
+		.id	= DA9035_ID_##_id,				\
+		.owner	= THIS_MODULE,					\
+	},								\
+	.min_uV		= (min) * 1000,					\
+	.max_uV		= (max) * 1000,					\
+	.step_uV	= (step) * 1000,				\
+	.vol_reg	= DA9035_##vreg,				\
+	.vol_shift	= (0),						\
+	.vol_nbits	= (nbits),					\
+	.update_reg	= DA9035_##ureg,				\
+	.update_bit	= (ubit),					\
+	.enable_reg	= DA9035_##ereg,				\
+	.enable_bit	= (ebit),					\
+}
+
 #define DA9034_LDO(_id, min, max, step, vreg, shift, nbits, ereg, ebit)	\
 	DA903x_LDO(DA9034, _id, min, max, step, vreg, shift, nbits, ereg, ebit)
 
@@ -435,6 +464,9 @@ static struct da903x_regulator_info da903x_regulator_info[] = {
 	DA9034_LDO(14, 1800, 3300, 100, LDO1514, 0, 4, OVER3, 0),
 	DA9034_LDO(15, 1800, 3300, 100, LDO1514, 4, 4, OVER3, 1),
 	DA9034_LDO(5, 3100, 3100, 0, INVAL, 0, 0, OVER3, 7), /* fixed @3.1V */
+
+	/* DA9035 */
+	DA9035_DVC(BUCK3, 1800, 2200, 100, 3DTV1, 3, VCC2, 0, OVER3, 3),
 };
 
 static inline struct da903x_regulator_info *find_regulator_info(int id)
diff --git a/include/linux/mfd/da903x.h b/include/linux/mfd/da903x.h
index 115dbe965082..c63b65c94429 100644
--- a/include/linux/mfd/da903x.h
+++ b/include/linux/mfd/da903x.h
@@ -1,7 +1,7 @@
 #ifndef __LINUX_PMIC_DA903X_H
 #define __LINUX_PMIC_DA903X_H
 
-/* Unified sub device IDs for DA9030/DA9034 */
+/* Unified sub device IDs for DA9030/DA9034/DA9035 */
 enum {
 	DA9030_ID_LED_1,
 	DA9030_ID_LED_2,
@@ -57,6 +57,8 @@ enum {
 	DA9034_ID_LDO13,
 	DA9034_ID_LDO14,
 	DA9034_ID_LDO15,
+
+	DA9035_ID_BUCK3,
 };
 
 /*
-- 
cgit v1.2.3-71-gd317


From ed6543243a1c557dbe2005a86f6d8e851c1ebb79 Mon Sep 17 00:00:00 2001
From: roald <roald@sh-dt-4505.(none)>
Date: Mon, 13 Jul 2009 17:25:21 +0800
Subject: regulator: add initialization macro of regulator supply

Signed-off-by: Haojian Zhuang <haojian.zhuang@marvell.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 include/linux/regulator/machine.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 73a88f6cbb1c..99a4e2eb36aa 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -141,6 +141,13 @@ struct regulator_consumer_supply {
 	const char *supply;	/* consumer supply - e.g. "vcc" */
 };
 
+/* Initialize struct regulator_consumer_supply */
+#define REGULATOR_SUPPLY(_name, _dev_name)			\
+{								\
+	.supply		= _name,				\
+	.dev_name	= _dev_name,				\
+}
+
 /**
  * struct regulator_init_data - regulator platform initialisation data.
  *
-- 
cgit v1.2.3-71-gd317


From 5ffbd136e6c51c8d1eec7a4a0c5d2180c81aea30 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 21 Jul 2009 16:00:23 +0100
Subject: regulator: Add regulator_get_exclusive() API

Some consumers require complete control of the regulator and can't
tolerate sharing it with other consumers, most commonly because they need
to have the regulator actually disabled so can't have other consumers
forcing it on. This new regulator_get_exclusive() API call allows these
consumers to explicitly request this, documenting the assumptions that
they are making.

In order to simplify coding of such consumers the use count for regulators
they request is forced to match the enabled state of the regulator when
it is requested. This is not possible for consumers which can share
regulators due to the need to keep track of the ownership of use counts.

A new API call is used rather than an additional argument to the existing
regulator_get() in order to avoid merge headaches with driver code in
other trees.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 drivers/regulator/core.c           | 88 ++++++++++++++++++++++++++++++++------
 include/linux/regulator/consumer.h |  2 +
 include/linux/regulator/driver.h   |  2 +
 3 files changed, 78 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 24e05b7607b4..68549008582c 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1025,25 +1025,15 @@ overflow_err:
 	return NULL;
 }
 
-/**
- * regulator_get - lookup and obtain a reference to a regulator.
- * @dev: device for regulator "consumer"
- * @id: Supply name or regulator ID.
- *
- * Returns a struct regulator corresponding to the regulator producer,
- * or IS_ERR() condition containing errno.
- *
- * Use of supply names configured via regulator_set_device_supply() is
- * strongly encouraged.  It is recommended that the supply name used
- * should match the name used for the supply and/or the relevant
- * device pins in the datasheet.
- */
-struct regulator *regulator_get(struct device *dev, const char *id)
+/* Internal regulator request function */
+static struct regulator *_regulator_get(struct device *dev, const char *id,
+					int exclusive)
 {
 	struct regulator_dev *rdev;
 	struct regulator_map *map;
 	struct regulator *regulator = ERR_PTR(-ENODEV);
 	const char *devname = NULL;
+	int ret;
 
 	if (id == NULL) {
 		printk(KERN_ERR "regulator: get() with no identifier\n");
@@ -1070,6 +1060,16 @@ struct regulator *regulator_get(struct device *dev, const char *id)
 	return regulator;
 
 found:
+	if (rdev->exclusive) {
+		regulator = ERR_PTR(-EPERM);
+		goto out;
+	}
+
+	if (exclusive && rdev->open_count) {
+		regulator = ERR_PTR(-EBUSY);
+		goto out;
+	}
+
 	if (!try_module_get(rdev->owner))
 		goto out;
 
@@ -1079,12 +1079,69 @@ found:
 		module_put(rdev->owner);
 	}
 
+	rdev->open_count++;
+	if (exclusive) {
+		rdev->exclusive = 1;
+
+		ret = _regulator_is_enabled(rdev);
+		if (ret > 0)
+			rdev->use_count = 1;
+		else
+			rdev->use_count = 0;
+	}
+
 out:
 	mutex_unlock(&regulator_list_mutex);
+
 	return regulator;
 }
+
+/**
+ * regulator_get - lookup and obtain a reference to a regulator.
+ * @dev: device for regulator "consumer"
+ * @id: Supply name or regulator ID.
+ *
+ * Returns a struct regulator corresponding to the regulator producer,
+ * or IS_ERR() condition containing errno.
+ *
+ * Use of supply names configured via regulator_set_device_supply() is
+ * strongly encouraged.  It is recommended that the supply name used
+ * should match the name used for the supply and/or the relevant
+ * device pins in the datasheet.
+ */
+struct regulator *regulator_get(struct device *dev, const char *id)
+{
+	return _regulator_get(dev, id, 0);
+}
 EXPORT_SYMBOL_GPL(regulator_get);
 
+/**
+ * regulator_get_exclusive - obtain exclusive access to a regulator.
+ * @dev: device for regulator "consumer"
+ * @id: Supply name or regulator ID.
+ *
+ * Returns a struct regulator corresponding to the regulator producer,
+ * or IS_ERR() condition containing errno.  Other consumers will be
+ * unable to obtain this reference is held and the use count for the
+ * regulator will be initialised to reflect the current state of the
+ * regulator.
+ *
+ * This is intended for use by consumers which cannot tolerate shared
+ * use of the regulator such as those which need to force the
+ * regulator off for correct operation of the hardware they are
+ * controlling.
+ *
+ * Use of supply names configured via regulator_set_device_supply() is
+ * strongly encouraged.  It is recommended that the supply name used
+ * should match the name used for the supply and/or the relevant
+ * device pins in the datasheet.
+ */
+struct regulator *regulator_get_exclusive(struct device *dev, const char *id)
+{
+	return _regulator_get(dev, id, 1);
+}
+EXPORT_SYMBOL_GPL(regulator_get_exclusive);
+
 /**
  * regulator_put - "free" the regulator source
  * @regulator: regulator source
@@ -1113,6 +1170,9 @@ void regulator_put(struct regulator *regulator)
 	list_del(&regulator->list);
 	kfree(regulator);
 
+	rdev->open_count--;
+	rdev->exclusive = 0;
+
 	module_put(rdev->owner);
 	mutex_unlock(&regulator_list_mutex);
 }
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 277f4b964df5..976b57b6912c 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -125,6 +125,8 @@ struct regulator_bulk_data {
 /* regulator get and put */
 struct regulator *__must_check regulator_get(struct device *dev,
 					     const char *id);
+struct regulator *__must_check regulator_get_exclusive(struct device *dev,
+						       const char *id);
 void regulator_put(struct regulator *regulator);
 
 /* regulator output control and status */
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index ce1be708ca16..73c9cd6cda7d 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -162,6 +162,8 @@ struct regulator_desc {
 struct regulator_dev {
 	struct regulator_desc *desc;
 	int use_count;
+	int open_count;
+	int exclusive;
 
 	/* lists we belong to */
 	struct list_head list; /* list of all regulators */
-- 
cgit v1.2.3-71-gd317


From a7a1ad9066e0266c8a4357ba3dbaeebfb80f531d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 21 Jul 2009 16:00:24 +0100
Subject: regulator: Add regulator voltage range check API

Simplify checking of support for voltage ranges by providing an API which
wraps the existing count and list operations.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 drivers/regulator/core.c           | 29 +++++++++++++++++++++++++++++
 include/linux/regulator/consumer.h |  2 ++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 68549008582c..e11c2222d9af 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1441,6 +1441,35 @@ int regulator_list_voltage(struct regulator *regulator, unsigned selector)
 }
 EXPORT_SYMBOL_GPL(regulator_list_voltage);
 
+/**
+ * regulator_is_supported_voltage - check if a voltage range can be supported
+ *
+ * @regulator: Regulator to check.
+ * @min_uV: Minimum required voltage in uV.
+ * @max_uV: Maximum required voltage in uV.
+ *
+ * Returns a boolean or a negative error code.
+ */
+int regulator_is_supported_voltage(struct regulator *regulator,
+				   int min_uV, int max_uV)
+{
+	int i, voltages, ret;
+
+	ret = regulator_count_voltages(regulator);
+	if (ret < 0)
+		return ret;
+	voltages = ret;
+
+	for (i = 0; i < voltages; i++) {
+		ret = regulator_list_voltage(regulator, i);
+
+		if (ret >= min_uV && ret <= max_uV)
+			return 1;
+	}
+
+	return 0;
+}
+
 /**
  * regulator_set_voltage - set regulator output voltage
  * @regulator: regulator source
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 976b57b6912c..490c5b37b6d7 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -146,6 +146,8 @@ void regulator_bulk_free(int num_consumers,
 
 int regulator_count_voltages(struct regulator *regulator);
 int regulator_list_voltage(struct regulator *regulator, unsigned selector);
+int regulator_is_supported_voltage(struct regulator *regulator,
+				   int min_uV, int max_uV);
 int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV);
 int regulator_get_voltage(struct regulator *regulator);
 int regulator_set_current_limit(struct regulator *regulator,
-- 
cgit v1.2.3-71-gd317


From 86d9884b6a3646bc24e57430f1f694c5171c1bf6 Mon Sep 17 00:00:00 2001
From: Roger Quadros <ext-roger.quadros@nokia.com>
Date: Thu, 6 Aug 2009 19:37:29 +0300
Subject: regulator: Add GPIO enable control to fixed voltage regulator driver

Now fixed regulators that have their enable pin connected to a GPIO line
can use the fixed regulator driver for regulator enable/disable control.
The GPIO number and polarity information is passed through platform data.
GPIO enable control is achieved using gpiolib.

Signed-off-by: Roger Quadros <ext-roger.quadros@nokia.com>
Reviewed-by: Philipp Zabel <philipp.zabel@gmail.com>
Reviewed-by: Felipe Balbi <felipe.balbi@nokia.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 drivers/regulator/fixed.c       | 88 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/regulator/fixed.h | 24 +++++++++++
 2 files changed, 110 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c
index 9c7f956d57c4..f8b295700d7d 100644
--- a/drivers/regulator/fixed.c
+++ b/drivers/regulator/fixed.c
@@ -5,6 +5,9 @@
  *
  * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
  *
+ * Copyright (c) 2009 Nokia Corporation
+ * Roger Quadros <ext-roger.quadros@nokia.com>
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation; either version 2 of the
@@ -20,20 +23,45 @@
 #include <linux/platform_device.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/fixed.h>
+#include <linux/gpio.h>
 
 struct fixed_voltage_data {
 	struct regulator_desc desc;
 	struct regulator_dev *dev;
 	int microvolts;
+	int gpio;
+	unsigned enable_high:1;
+	unsigned is_enabled:1;
 };
 
 static int fixed_voltage_is_enabled(struct regulator_dev *dev)
 {
-	return 1;
+	struct fixed_voltage_data *data = rdev_get_drvdata(dev);
+
+	return data->is_enabled;
 }
 
 static int fixed_voltage_enable(struct regulator_dev *dev)
 {
+	struct fixed_voltage_data *data = rdev_get_drvdata(dev);
+
+	if (gpio_is_valid(data->gpio)) {
+		gpio_set_value_cansleep(data->gpio, data->enable_high);
+		data->is_enabled = 1;
+	}
+
+	return 0;
+}
+
+static int fixed_voltage_disable(struct regulator_dev *dev)
+{
+	struct fixed_voltage_data *data = rdev_get_drvdata(dev);
+
+	if (gpio_is_valid(data->gpio)) {
+		gpio_set_value_cansleep(data->gpio, !data->enable_high);
+		data->is_enabled = 0;
+	}
+
 	return 0;
 }
 
@@ -58,6 +86,7 @@ static int fixed_voltage_list_voltage(struct regulator_dev *dev,
 static struct regulator_ops fixed_voltage_ops = {
 	.is_enabled = fixed_voltage_is_enabled,
 	.enable = fixed_voltage_enable,
+	.disable = fixed_voltage_disable,
 	.get_voltage = fixed_voltage_get_voltage,
 	.list_voltage = fixed_voltage_list_voltage,
 };
@@ -87,13 +116,62 @@ static int regulator_fixed_voltage_probe(struct platform_device *pdev)
 	drvdata->desc.n_voltages = 1;
 
 	drvdata->microvolts = config->microvolts;
+	drvdata->gpio = config->gpio;
+
+	if (gpio_is_valid(config->gpio)) {
+		drvdata->enable_high = config->enable_high;
+
+		/* FIXME: Remove below print warning
+		 *
+		 * config->gpio must be set to -EINVAL by platform code if
+		 * GPIO control is not required. However, early adopters
+		 * not requiring GPIO control may forget to initialize
+		 * config->gpio to -EINVAL. This will cause GPIO 0 to be used
+		 * for GPIO control.
+		 *
+		 * This warning will be removed once there are a couple of users
+		 * for this driver.
+		 */
+		if (!config->gpio)
+			dev_warn(&pdev->dev,
+				"using GPIO 0 for regulator enable control\n");
+
+		ret = gpio_request(config->gpio, config->supply_name);
+		if (ret) {
+			dev_err(&pdev->dev,
+			   "Could not obtain regulator enable GPIO %d: %d\n",
+							config->gpio, ret);
+			goto err_name;
+		}
+
+		/* set output direction without changing state
+		 * to prevent glitch
+		 */
+		drvdata->is_enabled = config->enabled_at_boot;
+		ret = drvdata->is_enabled ?
+				config->enable_high : !config->enable_high;
+
+		ret = gpio_direction_output(config->gpio, ret);
+		if (ret) {
+			dev_err(&pdev->dev,
+			   "Could not configure regulator enable GPIO %d direction: %d\n",
+							config->gpio, ret);
+			goto err_gpio;
+		}
+
+	} else {
+		/* Regulator without GPIO control is considered
+		 * always enabled
+		 */
+		drvdata->is_enabled = 1;
+	}
 
 	drvdata->dev = regulator_register(&drvdata->desc, &pdev->dev,
 					  config->init_data, drvdata);
 	if (IS_ERR(drvdata->dev)) {
 		ret = PTR_ERR(drvdata->dev);
 		dev_err(&pdev->dev, "Failed to register regulator: %d\n", ret);
-		goto err_name;
+		goto err_gpio;
 	}
 
 	platform_set_drvdata(pdev, drvdata);
@@ -103,6 +181,9 @@ static int regulator_fixed_voltage_probe(struct platform_device *pdev)
 
 	return 0;
 
+err_gpio:
+	if (gpio_is_valid(config->gpio))
+		gpio_free(config->gpio);
 err_name:
 	kfree(drvdata->desc.name);
 err:
@@ -118,6 +199,9 @@ static int regulator_fixed_voltage_remove(struct platform_device *pdev)
 	kfree(drvdata->desc.name);
 	kfree(drvdata);
 
+	if (gpio_is_valid(drvdata->gpio))
+		gpio_free(drvdata->gpio);
+
 	return 0;
 }
 
diff --git a/include/linux/regulator/fixed.h b/include/linux/regulator/fixed.h
index 91b4da31f1b5..e94a4a1c7c8a 100644
--- a/include/linux/regulator/fixed.h
+++ b/include/linux/regulator/fixed.h
@@ -5,6 +5,9 @@
  *
  * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
  *
+ * Copyright (c) 2009 Nokia Corporation
+ * Roger Quadros <ext-roger.quadros@nokia.com>
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation; either version 2 of the
@@ -16,9 +19,30 @@
 
 struct regulator_init_data;
 
+/**
+ * struct fixed_voltage_config - fixed_voltage_config structure
+ * @supply_name:	Name of the regulator supply
+ * @microvolts:		Output voltage of regulator
+ * @gpio:		GPIO to use for enable control
+ * 			set to -EINVAL if not used
+ * @enable_high:	Polarity of enable GPIO
+ *			1 = Active high, 0 = Active low
+ * @enabled_at_boot:	Whether regulator has been enabled at
+ * 			boot or not. 1 = Yes, 0 = No
+ * 			This is used to keep the regulator at
+ * 			the default state
+ * @init_data:		regulator_init_data
+ *
+ * This structure contains fixed voltage regulator configuration
+ * information that must be passed by platform code to the fixed
+ * voltage regulator driver.
+ */
 struct fixed_voltage_config {
 	const char *supply_name;
 	int microvolts;
+	int gpio;
+	unsigned enable_high:1;
+	unsigned enabled_at_boot:1;
 	struct regulator_init_data *init_data;
 };
 
-- 
cgit v1.2.3-71-gd317


From 2e7e65ce55566fc81036960b00e5e15f5d9578ea Mon Sep 17 00:00:00 2001
From: Wolfram Sang <w.sang@pengutronix.de>
Date: Fri, 18 Sep 2009 22:44:43 +0200
Subject: regulator: fix typos

Fix a couple of typos I found while working with this subsystem.

Signed-off-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 Documentation/power/regulator/machine.txt | 4 ++--
 include/linux/regulator/machine.h         | 6 +++---
 include/linux/regulator/max1586.h         | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
index ce3487d99abe..63728fed620b 100644
--- a/Documentation/power/regulator/machine.txt
+++ b/Documentation/power/regulator/machine.txt
@@ -87,7 +87,7 @@ static struct platform_device regulator_devices[] = {
 },
 };
 /* register regulator 1 device */
-platform_device_register(&wm8350_regulator_devices[0]);
+platform_device_register(&regulator_devices[0]);
 
 /* register regulator 2 device */
-platform_device_register(&wm8350_regulator_devices[1]);
+platform_device_register(&regulator_devices[1]);
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 99a4e2eb36aa..87f5f176d4ef 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -41,7 +41,7 @@ struct regulator;
 #define REGULATOR_CHANGE_DRMS		0x10
 
 /**
- * struct regulator_state - regulator state during low power syatem states
+ * struct regulator_state - regulator state during low power system states
  *
  * This describes a regulators state during a system wide low power state.
  *
@@ -117,10 +117,10 @@ struct regulation_constraints {
 	/* mode to set on startup */
 	unsigned int initial_mode;
 
-	/* constriant flags */
+	/* constraint flags */
 	unsigned always_on:1;	/* regulator never off when system is on */
 	unsigned boot_on:1;	/* bootloader/firmware enabled regulator */
-	unsigned apply_uV:1;	/* apply uV constraint iff min == max */
+	unsigned apply_uV:1;	/* apply uV constraint if min == max */
 };
 
 /**
diff --git a/include/linux/regulator/max1586.h b/include/linux/regulator/max1586.h
index 44563192bf16..de9a7fae20be 100644
--- a/include/linux/regulator/max1586.h
+++ b/include/linux/regulator/max1586.h
@@ -36,7 +36,7 @@
  * max1586_subdev_data - regulator data
  * @id: regulator Id (either MAX1586_V3 or MAX1586_V6)
  * @name: regulator cute name (example for V3: "vcc_core")
- * @platform_data: regulator init data (contraints, supplies, ...)
+ * @platform_data: regulator init data (constraints, supplies, ...)
  */
 struct max1586_subdev_data {
 	int				id;
@@ -46,7 +46,7 @@ struct max1586_subdev_data {
 
 /**
  * max1586_platform_data - platform data for max1586
- * @num_subdevs: number of regultors used (may be 1 or 2)
+ * @num_subdevs: number of regulators used (may be 1 or 2)
  * @subdevs: regulator used
  *           At most, there will be a regulator for V3 and one for V6 voltages.
  * @v3_gain: gain on the V3 voltage output multiplied by 1e6.
-- 
cgit v1.2.3-71-gd317


From d87b969d15a084503870da598c97278fb4877753 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <w.sang@pengutronix.de>
Date: Fri, 18 Sep 2009 22:44:46 +0200
Subject: regulator/driver: be more specific in nanodoc for is_enabled

Document the possibility that is_enabled may also return with negative
errorcodes.

Signed-off-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 include/linux/regulator/driver.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 73c9cd6cda7d..31f2055eae28 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -37,7 +37,8 @@ enum regulator_status {
  *
  * @enable: Configure the regulator as enabled.
  * @disable: Configure the regulator as disabled.
- * @is_enabled: Return 1 if the regulator is enabled, 0 otherwise.
+ * @is_enabled: Return 1 if the regulator is enabled, 0 if not.
+ *		May also return negative errno.
  *
  * @set_voltage: Set the voltage for the regulator within the range specified.
  *               The driver should select the voltage closest to min_uV.
-- 
cgit v1.2.3-71-gd317


From 61e225dc341107be304fd1088146c2a5e88ff9e0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 21 Sep 2009 17:01:08 -0700
Subject: const: make struct super_block::dq_op const

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c          | 2 +-
 fs/ext4/super.c          | 2 +-
 fs/ocfs2/quota.h         | 2 +-
 fs/ocfs2/quota_global.c  | 2 +-
 fs/quota/dquot.c         | 2 +-
 fs/reiserfs/super.c      | 2 +-
 include/linux/fs.h       | 2 +-
 include/linux/quotaops.h | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a8d80a7f1105..e7a4e11352d2 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -720,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
 static ssize_t ext3_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
-static struct dquot_operations ext3_quota_operations = {
+static const struct dquot_operations ext3_quota_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a6b1ab734728..7ffb62eca4b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -964,7 +964,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 static ssize_t ext4_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
-static struct dquot_operations ext4_quota_operations = {
+static const struct dquot_operations ext4_quota_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 3fb96fcd4c81..e5df9d170b0c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
 int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
 			   struct buffer_head **bh);
 
-extern struct dquot_operations ocfs2_quota_operations;
+extern const struct dquot_operations ocfs2_quota_operations;
 extern struct quota_format_type ocfs2_quota_format;
 
 int ocfs2_quota_setup(void);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 44f2a5e1d042..3af4954d537b 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -849,7 +849,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
 	kmem_cache_free(ocfs2_dquot_cachep, dquot);
 }
 
-struct dquot_operations ocfs2_quota_operations = {
+const struct dquot_operations ocfs2_quota_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 38f7bd559f35..635ae2e535bf 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info);
 /*
  * Definitions of diskquota operations.
  */
-struct dquot_operations dquot_operations = {
+const struct dquot_operations dquot_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7adea74d6a8a..09c93c12874b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
 static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
 
-static struct dquot_operations reiserfs_quota_operations = {
+static const struct dquot_operations reiserfs_quota_operations = {
 	.initialize = dquot_initialize,
 	.drop = dquot_drop,
 	.alloc_space = dquot_alloc_space,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 90162fb3bf04..83e1a0cea97a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1318,7 +1318,7 @@ struct super_block {
 	unsigned long long	s_maxbytes;	/* Max file size */
 	struct file_system_type	*s_type;
 	const struct super_operations	*s_op;
-	struct dquot_operations	*dq_op;
+	const struct dquot_operations	*dq_op;
  	struct quotactl_ops	*s_qcop;
 	const struct export_operations *s_export_op;
 	unsigned long		s_flags;
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 26361c4c037a..8dcbdb6e1019 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -135,7 +135,7 @@ static inline int sb_any_quota_active(struct super_block *sb)
 /*
  * Operations supported for diskquotas.
  */
-extern struct dquot_operations dquot_operations;
+extern const struct dquot_operations dquot_operations;
 extern struct quotactl_ops vfs_quotactl_ops;
 
 #define sb_dquot_ops (&dquot_operations)
-- 
cgit v1.2.3-71-gd317


From 0d54b217a247f39605361f867fefbb9e099a5432 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 21 Sep 2009 17:01:09 -0700
Subject: const: make struct super_block::s_qcop const

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/cifsfs.c                | 4 ++--
 fs/ext3/super.c                 | 2 +-
 fs/ext4/super.c                 | 2 +-
 fs/ocfs2/super.c                | 2 +-
 fs/quota/dquot.c                | 2 +-
 fs/reiserfs/super.c             | 2 +-
 fs/xfs/linux-2.6/xfs_quotaops.c | 2 +-
 fs/xfs/linux-2.6/xfs_super.h    | 2 +-
 include/linux/fs.h              | 2 +-
 include/linux/quotaops.h        | 2 +-
 10 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3610e9958b4c..d79ce2e95c23 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -50,7 +50,7 @@
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
 #ifdef CONFIG_CIFS_QUOTA
-static struct quotactl_ops cifs_quotactl_ops;
+static const struct quotactl_ops cifs_quotactl_ops;
 #endif /* QUOTA */
 
 int cifsFYI = 0;
@@ -517,7 +517,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
 	return rc;
 }
 
-static struct quotactl_ops cifs_quotactl_ops = {
+static const struct quotactl_ops cifs_quotactl_ops = {
 	.set_xquota	= cifs_xquota_set,
 	.get_xquota	= cifs_xquota_get,
 	.set_xstate	= cifs_xstate_set,
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e7a4e11352d2..72743d360509 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -737,7 +737,7 @@ static const struct dquot_operations ext3_quota_operations = {
 	.destroy_dquot	= dquot_destroy,
 };
 
-static struct quotactl_ops ext3_qctl_operations = {
+static const struct quotactl_ops ext3_qctl_operations = {
 	.quota_on	= ext3_quota_on,
 	.quota_off	= vfs_quota_off,
 	.quota_sync	= vfs_quota_sync,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7ffb62eca4b2..df539ba27779 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -985,7 +985,7 @@ static const struct dquot_operations ext4_quota_operations = {
 	.destroy_dquot	= dquot_destroy,
 };
 
-static struct quotactl_ops ext4_qctl_operations = {
+static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
 	.quota_off	= vfs_quota_off,
 	.quota_sync	= vfs_quota_sync,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a3f8871d21fd..faca4720aa47 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
 	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
 }
 
-static struct quotactl_ops ocfs2_quotactl_ops = {
+static const struct quotactl_ops ocfs2_quotactl_ops = {
 	.quota_on	= ocfs2_quota_on,
 	.quota_off	= ocfs2_quota_off,
 	.quota_sync	= vfs_quota_sync,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 635ae2e535bf..39b49c42a7ed 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2461,7 +2461,7 @@ out:
 }
 EXPORT_SYMBOL(vfs_set_dqinfo);
 
-struct quotactl_ops vfs_quotactl_ops = {
+const struct quotactl_ops vfs_quotactl_ops = {
 	.quota_on	= vfs_quota_on,
 	.quota_off	= vfs_quota_off,
 	.quota_sync	= vfs_quota_sync,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 09c93c12874b..f0ad05f38022 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -629,7 +629,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
 	.destroy_dquot	= dquot_destroy,
 };
 
-static struct quotactl_ops reiserfs_qctl_operations = {
+static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on = reiserfs_quota_on,
 	.quota_off = vfs_quota_off,
 	.quota_sync = vfs_quota_sync,
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index cb6e2cca214f..9e41f91aa269 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -150,7 +150,7 @@ xfs_fs_set_xquota(
 	return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
 }
 
-struct quotactl_ops xfs_quotactl_operations = {
+const struct quotactl_ops xfs_quotactl_operations = {
 	.quota_sync		= xfs_fs_quota_sync,
 	.get_xstate		= xfs_fs_get_xstate,
 	.set_xstate		= xfs_fs_set_xstate,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 5a2ea3a21781..18175ebd58ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
 extern struct xattr_handler *xfs_xattr_handlers[];
-extern struct quotactl_ops xfs_quotactl_operations;
+extern const struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 83e1a0cea97a..13c030ebfd28 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1319,7 +1319,7 @@ struct super_block {
 	struct file_system_type	*s_type;
 	const struct super_operations	*s_op;
 	const struct dquot_operations	*dq_op;
- 	struct quotactl_ops	*s_qcop;
+	const struct quotactl_ops	*s_qcop;
 	const struct export_operations *s_export_op;
 	unsigned long		s_flags;
 	unsigned long		s_magic;
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 8dcbdb6e1019..3ebb23153640 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -136,7 +136,7 @@ static inline int sb_any_quota_active(struct super_block *sb)
  * Operations supported for diskquotas.
  */
 extern const struct dquot_operations dquot_operations;
-extern struct quotactl_ops vfs_quotactl_ops;
+extern const struct quotactl_ops vfs_quotactl_ops;
 
 #define sb_dquot_ops (&dquot_operations)
 #define sb_quotactl_ops (&vfs_quotactl_ops)
-- 
cgit v1.2.3-71-gd317


From 6aed62853c72e29f2c97bbac7712cb398e8c9437 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 21 Sep 2009 17:01:11 -0700
Subject: const: make file_lock_operations const

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/flock.c      | 2 +-
 fs/lockd/clntproc.c | 2 +-
 fs/nfs/nfs4state.c  | 2 +-
 include/linux/fs.h  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3ff8bdd18fb3..0931bc1325eb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl);
 static struct workqueue_struct *afs_lock_manager;
 static DEFINE_MUTEX(afs_lock_manager_mutex);
 
-static struct file_lock_operations afs_lock_ops = {
+static const struct file_lock_operations afs_lock_ops = {
 	.fl_copy_lock		= afs_fl_copy_lock,
 	.fl_release_private	= afs_fl_release_private,
 };
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4336adba952a..c81249fef11f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
 	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
 
-static struct file_lock_operations nlmclnt_lock_ops = {
+static const struct file_lock_operations nlmclnt_lock_ops = {
 	.fl_copy_lock = nlmclnt_locks_copy_lock,
 	.fl_release_private = nlmclnt_locks_release_private,
 };
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1434080aefeb..2ef4fecf3984 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl)
 	nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
 }
 
-static struct file_lock_operations nfs4_fl_lock_ops = {
+static const struct file_lock_operations nfs4_fl_lock_ops = {
 	.fl_copy_lock = nfs4_fl_copy_lock,
 	.fl_release_private = nfs4_fl_release_lock,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 13c030ebfd28..6146dec21c35 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1066,7 +1066,7 @@ struct file_lock {
 	struct fasync_struct *	fl_fasync; /* for lease break notifications */
 	unsigned long fl_break_time;	/* for nonblocking lease breaks */
 
-	struct file_lock_operations *fl_ops;	/* Callbacks for filesystems */
+	const struct file_lock_operations *fl_ops;	/* Callbacks for filesystems */
 	struct lock_manager_operations *fl_lmops;	/* Callbacks for lockmanagers */
 	union {
 		struct nfs_lock_info	nfs_fl;
-- 
cgit v1.2.3-71-gd317


From 7b021967c5e1463936042c8da72b550d3cabe9ac Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 21 Sep 2009 17:01:12 -0700
Subject: const: make lock_manager_operations const

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/lockd/svclock.c          | 2 +-
 fs/locks.c                  | 2 +-
 fs/nfsd/nfs4state.c         | 4 ++--
 include/linux/fs.h          | 2 +-
 include/linux/lockd/lockd.h | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e577a78d7bac..d1001790fa9a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
 }
 
-struct lock_manager_operations nlmsvc_lock_operations = {
+const struct lock_manager_operations nlmsvc_lock_operations = {
 	.fl_compare_owner = nlmsvc_same_owner,
 	.fl_notify = nlmsvc_notify_blocked,
 	.fl_grant = nlmsvc_grant_deferred,
diff --git a/fs/locks.c b/fs/locks.c
index 19ee18a6829b..a8794f233bc9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
 	return fl->fl_file == try->fl_file;
 }
 
-static struct lock_manager_operations lease_manager_ops = {
+static const struct lock_manager_operations lease_manager_ops = {
 	.fl_break = lease_break_callback,
 	.fl_release_private = lease_release_private_callback,
 	.fl_mylease = lease_mylease_callback,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..766d3d544544 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2163,7 +2163,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 		return -EAGAIN;
 }
 
-static struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
 	.fl_release_private = nfsd_release_deleg_cb,
 	.fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -3368,7 +3368,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
 
 /* Hack!: For now, we're defining this just so we can use a pointer to it
  * as a unique cookie to identify our (NFSv4's) posix locks. */
-static struct lock_manager_operations nfsd_posix_mng_ops  = {
+static const struct lock_manager_operations nfsd_posix_mng_ops  = {
 };
 
 static inline void
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6146dec21c35..51803528b095 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1067,7 +1067,7 @@ struct file_lock {
 	unsigned long fl_break_time;	/* for nonblocking lease breaks */
 
 	const struct file_lock_operations *fl_ops;	/* Callbacks for filesystems */
-	struct lock_manager_operations *fl_lmops;	/* Callbacks for lockmanagers */
+	const struct lock_manager_operations *fl_lmops;	/* Callbacks for lockmanagers */
 	union {
 		struct nfs_lock_info	nfs_fl;
 		struct nfs4_lock_info	nfs4_fl;
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index c325b187966b..ccf2e0dc077a 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -395,7 +395,7 @@ static inline int nlm_compare_locks(const struct file_lock *fl1,
 	     &&(fl1->fl_type  == fl2->fl_type || fl2->fl_type == F_UNLCK);
 }
 
-extern struct lock_manager_operations nlmsvc_lock_operations;
+extern const struct lock_manager_operations nlmsvc_lock_operations;
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3-71-gd317


From 83d5cde47dedf01b6a4a4331882cbc0a7eea3c2e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 21 Sep 2009 17:01:13 -0700
Subject: const: make block_device_operations const

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/sysdev/axonram.c       | 2 +-
 arch/um/drivers/ubd_kern.c          | 2 +-
 drivers/block/DAC960.c              | 2 +-
 drivers/block/amiflop.c             | 2 +-
 drivers/block/aoe/aoeblk.c          | 2 +-
 drivers/block/ataflop.c             | 2 +-
 drivers/block/brd.c                 | 2 +-
 drivers/block/cciss.c               | 2 +-
 drivers/block/cpqarray.c            | 2 +-
 drivers/block/floppy.c              | 2 +-
 drivers/block/hd.c                  | 2 +-
 drivers/block/loop.c                | 2 +-
 drivers/block/mg_disk.c             | 2 +-
 drivers/block/nbd.c                 | 2 +-
 drivers/block/osdblk.c              | 2 +-
 drivers/block/paride/pcd.c          | 2 +-
 drivers/block/paride/pd.c           | 2 +-
 drivers/block/paride/pf.c           | 2 +-
 drivers/block/pktcdvd.c             | 2 +-
 drivers/block/ps3disk.c             | 2 +-
 drivers/block/ps3vram.c             | 2 +-
 drivers/block/sunvdc.c              | 2 +-
 drivers/block/swim.c                | 2 +-
 drivers/block/swim3.c               | 2 +-
 drivers/block/sx8.c                 | 2 +-
 drivers/block/ub.c                  | 2 +-
 drivers/block/umem.c                | 3 +--
 drivers/block/viodasd.c             | 2 +-
 drivers/block/virtio_blk.c          | 2 +-
 drivers/block/xd.c                  | 2 +-
 drivers/block/xen-blkfront.c        | 4 ++--
 drivers/block/xsysace.c             | 2 +-
 drivers/block/z2ram.c               | 3 +--
 drivers/cdrom/gdrom.c               | 2 +-
 drivers/cdrom/viocd.c               | 2 +-
 drivers/ide/ide-cd.c                | 2 +-
 drivers/ide/ide-gd.c                | 2 +-
 drivers/ide/ide-tape.c              | 2 +-
 drivers/md/dm.c                     | 4 ++--
 drivers/md/md.c                     | 4 ++--
 drivers/memstick/core/mspro_block.c | 2 +-
 drivers/message/i2o/i2o_block.c     | 2 +-
 drivers/mmc/card/block.c            | 2 +-
 drivers/mtd/mtd_blkdevs.c           | 2 +-
 drivers/s390/block/dasd.c           | 2 +-
 drivers/s390/block/dasd_int.h       | 2 +-
 drivers/s390/block/dcssblk.c        | 2 +-
 drivers/s390/block/xpram.c          | 2 +-
 drivers/s390/char/tape_block.c      | 2 +-
 drivers/sbus/char/jsflash.c         | 2 +-
 drivers/scsi/sd.c                   | 2 +-
 drivers/scsi/sr.c                   | 2 +-
 fs/block_dev.c                      | 2 +-
 fs/ext2/xip.c                       | 2 +-
 fs/partitions/check.c               | 2 +-
 include/linux/genhd.h               | 2 +-
 56 files changed, 59 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index a4779912a5ca..88f4ae787832 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -165,7 +165,7 @@ axon_ram_direct_access(struct block_device *device, sector_t sector,
 	return 0;
 }
 
-static struct block_device_operations axon_ram_devops = {
+static const struct block_device_operations axon_ram_devops = {
 	.owner		= THIS_MODULE,
 	.direct_access	= axon_ram_direct_access
 };
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 8f05d4d9da12..635d16d90a80 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -106,7 +106,7 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 #define MAX_DEV (16)
 
-static struct block_device_operations ubd_blops = {
+static const struct block_device_operations ubd_blops = {
         .owner		= THIS_MODULE,
         .open		= ubd_open,
         .release	= ubd_release,
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 1e6b7c14f697..8b50af381a8e 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -152,7 +152,7 @@ static int DAC960_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
-static struct block_device_operations DAC960_BlockDeviceOperations = {
+static const struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner			= THIS_MODULE,
 	.open			= DAC960_open,
 	.getgeo			= DAC960_getgeo,
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 2f07b7c99a95..055225839024 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1632,7 +1632,7 @@ static int amiga_floppy_change(struct gendisk *disk)
 	return 0;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
 	.open		= floppy_open,
 	.release	= floppy_release,
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index b6cd571adbf2..3af97d4da2db 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -237,7 +237,7 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations aoe_bdops = {
+static const struct block_device_operations aoe_bdops = {
 	.open = aoeblk_open,
 	.release = aoeblk_release,
 	.getgeo = aoeblk_getgeo,
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 3ff02941b3dd..847a9e57570a 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1856,7 +1856,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
 	.open		= floppy_open,
 	.release	= floppy_release,
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 4bf8705b3ace..4f688434daf1 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -375,7 +375,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 	return error;
 }
 
-static struct block_device_operations brd_fops = {
+static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
 	.locked_ioctl =		brd_ioctl,
 #ifdef CONFIG_BLK_DEV_XIP
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d8372b432826..4f19105f755c 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -205,7 +205,7 @@ static int cciss_compat_ioctl(struct block_device *, fmode_t,
 			      unsigned, unsigned long);
 #endif
 
-static struct block_device_operations cciss_fops = {
+static const struct block_device_operations cciss_fops = {
 	.owner = THIS_MODULE,
 	.open = cciss_open,
 	.release = cciss_release,
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 44fa2018f6b0..b82d438e2607 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -193,7 +193,7 @@ static inline ctlr_info_t *get_host(struct gendisk *disk)
 }
 
 
-static struct block_device_operations ida_fops  = {
+static const struct block_device_operations ida_fops  = {
 	.owner		= THIS_MODULE,
 	.open		= ida_open,
 	.release	= ida_release,
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 2b387c2260d8..5c01f747571b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3907,7 +3907,7 @@ static int floppy_revalidate(struct gendisk *disk)
 	return res;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner			= THIS_MODULE,
 	.open			= floppy_open,
 	.release		= floppy_release,
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index f9d01608cbe2..d5cdce08ffd2 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -692,7 +692,7 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static struct block_device_operations hd_fops = {
+static const struct block_device_operations hd_fops = {
 	.getgeo =	hd_getgeo,
 };
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bbb79441d895..edda9ea7c626 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1438,7 +1438,7 @@ out_unlocked:
 	return 0;
 }
 
-static struct block_device_operations lo_fops = {
+static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
 	.open =		lo_open,
 	.release =	lo_release,
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 6d7fbaa92248..e0339aaa1815 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -775,7 +775,7 @@ static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations mg_disk_ops = {
+static const struct block_device_operations mg_disk_ops = {
 	.getgeo = mg_getgeo
 };
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 5d23ffad7c77..cc923a5b430c 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -722,7 +722,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 	return error;
 }
 
-static struct block_device_operations nbd_fops =
+static const struct block_device_operations nbd_fops =
 {
 	.owner =	THIS_MODULE,
 	.locked_ioctl =	nbd_ioctl,
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 13c1aee6aa3f..a808b1530b3b 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -125,7 +125,7 @@ static struct class *class_osdblk;		/* /sys/class/osdblk */
 static DEFINE_MUTEX(ctl_mutex);	/* Serialize open/close/setup/teardown */
 static LIST_HEAD(osdblkdev_list);
 
-static struct block_device_operations osdblk_bd_ops = {
+static const struct block_device_operations osdblk_bd_ops = {
 	.owner		= THIS_MODULE,
 };
 
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 9f3518c515a1..8866ca369d5e 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -247,7 +247,7 @@ static int pcd_block_media_changed(struct gendisk *disk)
 	return cdrom_media_changed(&cd->info);
 }
 
-static struct block_device_operations pcd_bdops = {
+static const struct block_device_operations pcd_bdops = {
 	.owner		= THIS_MODULE,
 	.open		= pcd_block_open,
 	.release	= pcd_block_release,
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index bf5955b3d873..569e39e8f114 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -807,7 +807,7 @@ static int pd_revalidate(struct gendisk *p)
 	return 0;
 }
 
-static struct block_device_operations pd_fops = {
+static const struct block_device_operations pd_fops = {
 	.owner		= THIS_MODULE,
 	.open		= pd_open,
 	.release	= pd_release,
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 68a90834e993..ea54ea393553 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -262,7 +262,7 @@ static char *pf_buf;		/* buffer for request in progress */
 
 /* kernel glue structures */
 
-static struct block_device_operations pf_fops = {
+static const struct block_device_operations pf_fops = {
 	.owner		= THIS_MODULE,
 	.open		= pf_open,
 	.release	= pf_release,
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index fd5bb8ad59a9..2ddf03ae034e 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2849,7 +2849,7 @@ static int pkt_media_changed(struct gendisk *disk)
 	return attached_disk->fops->media_changed(attached_disk);
 }
 
-static struct block_device_operations pktcdvd_ops = {
+static const struct block_device_operations pktcdvd_ops = {
 	.owner =		THIS_MODULE,
 	.open =			pkt_open,
 	.release =		pkt_close,
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 34cbb7f3efa8..03a130dca8ab 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -82,7 +82,7 @@ enum lv1_ata_in_out {
 static int ps3disk_major;
 
 
-static struct block_device_operations ps3disk_fops = {
+static const struct block_device_operations ps3disk_fops = {
 	.owner		= THIS_MODULE,
 };
 
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index c8753a9ed290..3bb7c47c869f 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -88,7 +88,7 @@ struct ps3vram_priv {
 static int ps3vram_major;
 
 
-static struct block_device_operations ps3vram_fops = {
+static const struct block_device_operations ps3vram_fops = {
 	.owner		= THIS_MODULE,
 };
 
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index cbfd9c0aef03..411f064760b4 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -103,7 +103,7 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations vdc_fops = {
+static const struct block_device_operations vdc_fops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= vdc_getgeo,
 };
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index cf7877fb8a7d..8f569e3df890 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -748,7 +748,7 @@ static int floppy_revalidate(struct gendisk *disk)
 	return !fs->disk_in;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner		 = THIS_MODULE,
 	.open		 = floppy_open,
 	.release	 = floppy_release,
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 80df93e3cdd0..e39e3820fef9 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -998,7 +998,7 @@ static int floppy_revalidate(struct gendisk *disk)
 	return ret;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.open		= floppy_open,
 	.release	= floppy_release,
 	.locked_ioctl	= floppy_ioctl,
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index f5cd2e83ebcc..a7c4184f4a63 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -423,7 +423,7 @@ static struct pci_driver carm_driver = {
 	.remove		= carm_remove_one,
 };
 
-static struct block_device_operations carm_bd_ops = {
+static const struct block_device_operations carm_bd_ops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= carm_bdev_getgeo,
 };
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index cc54473b8e77..c739b203fe91 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1789,7 +1789,7 @@ static int ub_bd_media_changed(struct gendisk *disk)
 	return lun->changed;
 }
 
-static struct block_device_operations ub_bd_fops = {
+static const struct block_device_operations ub_bd_fops = {
 	.owner		= THIS_MODULE,
 	.open		= ub_bd_open,
 	.release	= ub_bd_release,
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 858c34dd032d..ad1ba393801a 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -140,7 +140,6 @@ struct cardinfo {
 };
 
 static struct cardinfo cards[MM_MAXCARDS];
-static struct block_device_operations mm_fops;
 static struct timer_list battery_timer;
 
 static int num_cards;
@@ -789,7 +788,7 @@ static int mm_check_change(struct gendisk *disk)
 	return 0;
 }
 
-static struct block_device_operations mm_fops = {
+static const struct block_device_operations mm_fops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= mm_getgeo,
 	.revalidate_disk = mm_revalidate,
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index b441ce3832e9..a8c8b56b275e 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -219,7 +219,7 @@ static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 /*
  * Our file operations table
  */
-static struct block_device_operations viodasd_fops = {
+static const struct block_device_operations viodasd_fops = {
 	.owner = THIS_MODULE,
 	.open = viodasd_open,
 	.release = viodasd_release,
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index aa1a3d5a3e2b..aa89fe45237d 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -243,7 +243,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations virtblk_fops = {
+static const struct block_device_operations virtblk_fops = {
 	.locked_ioctl = virtblk_ioctl,
 	.owner  = THIS_MODULE,
 	.getgeo = virtblk_getgeo,
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index ce2429219925..0877d3628fda 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -130,7 +130,7 @@ static struct gendisk *xd_gendisk[2];
 
 static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
-static struct block_device_operations xd_fops = {
+static const struct block_device_operations xd_fops = {
 	.owner	= THIS_MODULE,
 	.locked_ioctl	= xd_ioctl,
 	.getgeo = xd_getgeo,
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index e53284767f7c..b8578bb3f4c9 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -65,7 +65,7 @@ struct blk_shadow {
 	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 };
 
-static struct block_device_operations xlvbd_block_fops;
+static const struct block_device_operations xlvbd_block_fops;
 
 #define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
 
@@ -1039,7 +1039,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static struct block_device_operations xlvbd_block_fops =
+static const struct block_device_operations xlvbd_block_fops =
 {
 	.owner = THIS_MODULE,
 	.open = blkif_open,
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index b20abe102a2b..e5c5415eb45e 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -941,7 +941,7 @@ static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations ace_fops = {
+static const struct block_device_operations ace_fops = {
 	.owner = THIS_MODULE,
 	.open = ace_open,
 	.release = ace_release,
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index b2590409f25e..64f941e0f14b 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -64,7 +64,6 @@ static int current_device   = -1;
 
 static DEFINE_SPINLOCK(z2ram_lock);
 
-static struct block_device_operations z2_fops;
 static struct gendisk *z2ram_gendisk;
 
 static void do_z2_request(struct request_queue *q)
@@ -315,7 +314,7 @@ z2_release(struct gendisk *disk, fmode_t mode)
     return 0;
 }
 
-static struct block_device_operations z2_fops =
+static const struct block_device_operations z2_fops =
 {
 	.owner		= THIS_MODULE,
 	.open		= z2_open,
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index b5621f27c4be..a762283d2a21 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -512,7 +512,7 @@ static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode,
 	return cdrom_ioctl(gd.cd_info, bdev, mode, cmd, arg);
 }
 
-static struct block_device_operations gdrom_bdops = {
+static const struct block_device_operations gdrom_bdops = {
 	.owner			= THIS_MODULE,
 	.open			= gdrom_bdops_open,
 	.release		= gdrom_bdops_release,
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 0fff646cc2f0..57ca69e0ac55 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -177,7 +177,7 @@ static int viocd_blk_media_changed(struct gendisk *disk)
 	return cdrom_media_changed(&di->viocd_info);
 }
 
-struct block_device_operations viocd_fops = {
+static const struct block_device_operations viocd_fops = {
 	.owner =		THIS_MODULE,
 	.open =			viocd_blk_open,
 	.release =		viocd_blk_release,
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index b79ca419d8d9..64207df8da82 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1686,7 +1686,7 @@ static int idecd_revalidate_disk(struct gendisk *disk)
 	return  0;
 }
 
-static struct block_device_operations idecd_ops = {
+static const struct block_device_operations idecd_ops = {
 	.owner			= THIS_MODULE,
 	.open			= idecd_open,
 	.release		= idecd_release,
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 214119026b3f..753241429c26 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -321,7 +321,7 @@ static int ide_gd_ioctl(struct block_device *bdev, fmode_t mode,
 	return drive->disk_ops->ioctl(drive, bdev, mode, cmd, arg);
 }
 
-static struct block_device_operations ide_gd_ops = {
+static const struct block_device_operations ide_gd_ops = {
 	.owner			= THIS_MODULE,
 	.open			= ide_gd_open,
 	.release		= ide_gd_release,
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 9d6f62baac27..58fc920d5c32 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -1913,7 +1913,7 @@ static int idetape_ioctl(struct block_device *bdev, fmode_t mode,
 	return err;
 }
 
-static struct block_device_operations idetape_block_ops = {
+static const struct block_device_operations idetape_block_ops = {
 	.owner		= THIS_MODULE,
 	.open		= idetape_open,
 	.release	= idetape_release,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eee28fac210c..376f1ab48a24 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1716,7 +1716,7 @@ out:
 	return r;
 }
 
-static struct block_device_operations dm_blk_dops;
+static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
 
@@ -2663,7 +2663,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 	kfree(pools);
 }
 
-static struct block_device_operations dm_blk_dops = {
+static const struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9dd872000cec..6aa497e4baf8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -138,7 +138,7 @@ static ctl_table raid_root_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static struct block_device_operations md_fops;
+static const struct block_device_operations md_fops;
 
 static int start_readonly;
 
@@ -5556,7 +5556,7 @@ static int md_revalidate(struct gendisk *disk)
 	mddev->changed = 0;
 	return 0;
 }
-static struct block_device_operations md_fops =
+static const struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
 	.open		= md_open,
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 7847bbc1440d..bd83fa0a4970 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -235,7 +235,7 @@ static int mspro_block_bd_getgeo(struct block_device *bdev,
 	return 0;
 }
 
-static struct block_device_operations ms_block_bdops = {
+static const struct block_device_operations ms_block_bdops = {
 	.open    = mspro_block_bd_open,
 	.release = mspro_block_bd_release,
 	.getgeo  = mspro_block_bd_getgeo,
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 335d4c78a775..d505b68cd372 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -925,7 +925,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 };
 
 /* I2O Block device operations definition */
-static struct block_device_operations i2o_block_fops = {
+static const struct block_device_operations i2o_block_fops = {
 	.owner = THIS_MODULE,
 	.open = i2o_block_open,
 	.release = i2o_block_release,
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index adc205c49fbf..85f0e8cd875b 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -130,7 +130,7 @@ mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations mmc_bdops = {
+static const struct block_device_operations mmc_bdops = {
 	.open			= mmc_blk_open,
 	.release		= mmc_blk_release,
 	.getgeo			= mmc_blk_getgeo,
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 7baba40c1ed2..0acbf4f5be50 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -210,7 +210,7 @@ static int blktrans_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 }
 
-static struct block_device_operations mtd_blktrans_ops = {
+static const struct block_device_operations mtd_blktrans_ops = {
 	.owner		= THIS_MODULE,
 	.open		= blktrans_open,
 	.release	= blktrans_release,
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index e109da4583a8..dad0449475b6 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2146,7 +2146,7 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-struct block_device_operations
+const struct block_device_operations
 dasd_device_operations = {
 	.owner		= THIS_MODULE,
 	.open		= dasd_open,
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 5e47a1ee52b9..8afd9fa00875 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -540,7 +540,7 @@ dasd_check_blocksize(int bsize)
 extern debug_info_t *dasd_debug_area;
 extern struct dasd_profile_info_t dasd_global_profile;
 extern unsigned int dasd_profile_level;
-extern struct block_device_operations dasd_device_operations;
+extern const struct block_device_operations dasd_device_operations;
 
 extern struct kmem_cache *dasd_page_cache;
 
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index d34617682a62..f76f4bd82b9f 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -34,7 +34,7 @@ static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
 static int dcssblk_major;
-static struct block_device_operations dcssblk_devops = {
+static const struct block_device_operations dcssblk_devops = {
 	.owner   	= THIS_MODULE,
 	.open    	= dcssblk_open,
 	.release 	= dcssblk_release,
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index ee604e92a5fa..116d1b3eeb15 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -244,7 +244,7 @@ static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations xpram_devops =
+static const struct block_device_operations xpram_devops =
 {
 	.owner	= THIS_MODULE,
 	.getgeo	= xpram_getgeo,
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 4cb9e70507ab..64f57ef2763c 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -50,7 +50,7 @@ static int tapeblock_ioctl(struct block_device *, fmode_t, unsigned int,
 static int tapeblock_medium_changed(struct gendisk *);
 static int tapeblock_revalidate_disk(struct gendisk *);
 
-static struct block_device_operations tapeblock_fops = {
+static const struct block_device_operations tapeblock_fops = {
 	.owner		 = THIS_MODULE,
 	.open		 = tapeblock_open,
 	.release	 = tapeblock_release,
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 6d4651684688..869a30b49edc 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -452,7 +452,7 @@ static const struct file_operations jsf_fops = {
 
 static struct miscdevice jsf_dev = { JSF_MINOR, "jsflash", &jsf_fops };
 
-static struct block_device_operations jsfd_fops = {
+static const struct block_device_operations jsfd_fops = {
 	.owner =	THIS_MODULE,
 };
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a89c421dab51..8dd96dcd716c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -956,7 +956,7 @@ static int sd_compat_ioctl(struct block_device *bdev, fmode_t mode,
 }
 #endif
 
-static struct block_device_operations sd_fops = {
+static const struct block_device_operations sd_fops = {
 	.owner			= THIS_MODULE,
 	.open			= sd_open,
 	.release		= sd_release,
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index cce0fe4c8a3b..eb61f7a70e1d 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -525,7 +525,7 @@ static int sr_block_media_changed(struct gendisk *disk)
 	return cdrom_media_changed(&cd->cdi);
 }
 
-static struct block_device_operations sr_bdops =
+static const struct block_device_operations sr_bdops =
 {
 	.owner		= THIS_MODULE,
 	.open		= sr_block_open,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 71e7e03ac343..5d1ed50bd46c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1114,7 +1114,7 @@ EXPORT_SYMBOL(revalidate_disk);
 int check_disk_change(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	struct block_device_operations * bdops = disk->fops;
+	const struct block_device_operations *bdops = disk->fops;
 
 	if (!bdops->media_changed)
 		return 0;
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index b72b85884223..c18fbf3e4068 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block,
 		      void **kaddr, unsigned long *pfn)
 {
 	struct block_device *bdev = inode->i_sb->s_bdev;
-	struct block_device_operations *ops = bdev->bd_disk->fops;
+	const struct block_device_operations *ops = bdev->bd_disk->fops;
 	sector_t sector;
 
 	sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index fbeaddf595d3..7b685e10cbad 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -581,7 +581,7 @@ try_scan:
 		}
 
 		if (from + size > get_capacity(disk)) {
-			struct block_device_operations *bdops = disk->fops;
+			const struct block_device_operations *bdops = disk->fops;
 			unsigned long long capacity;
 
 			printk(KERN_WARNING
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 109d179adb93..297df45ffd0a 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -151,7 +151,7 @@ struct gendisk {
 	struct disk_part_tbl *part_tbl;
 	struct hd_struct part0;
 
-	struct block_device_operations *fops;
+	const struct block_device_operations *fops;
 	struct request_queue *queue;
 	void *private_data;
 
-- 
cgit v1.2.3-71-gd317


From a5abeeacc44bbef2935a7a8e939264c28962def2 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 21 Sep 2009 17:01:13 -0700
Subject: mm: make swap token dummies static inlines

Make use of the compiler's typechecking on !CONFIG_SWAP as well.

[akpm@linux-foundation.org: build fix]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7c15334f3ff2..6c990e658f4e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -419,10 +419,22 @@ static inline swp_entry_t get_swap_page(void)
 }
 
 /* linux/mm/thrash.c */
-#define put_swap_token(mm)	do { } while (0)
-#define grab_swap_token(mm)	do { } while (0)
-#define has_swap_token(mm)	0
-#define disable_swap_token()	do { } while (0)
+static inline void put_swap_token(struct mm_struct *mm)
+{
+}
+
+static inline void grab_swap_token(struct mm_struct *mm)
+{
+}
+
+static inline int has_swap_token(struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void disable_swap_token(void)
+{
+}
 
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
-- 
cgit v1.2.3-71-gd317


From 112067f0905b2de862c607ee62411cf47d2fe5c4 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 21 Sep 2009 17:01:16 -0700
Subject: memory hotplug: update zone pcp at memory online

In my test, 128M memory is hot added, but zone's pcp batch is 0, which is
an obvious error.  When pages are onlined, zone pcp should be updated
accordingly.

[akpm@linux-foundation.org: fix warnings]
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Yakui Zhao <yakui.zhao@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h  |  2 ++
 mm/memory_hotplug.c |  1 +
 mm/page_alloc.c     | 26 ++++++++++++++++++++++++++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9a72cc78e6b8..d3c8ae7c8015 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1058,6 +1058,8 @@ extern void setup_per_cpu_pageset(void);
 static inline void setup_per_cpu_pageset(void) {}
 #endif
 
+extern void zone_pcp_update(struct zone *zone);
+
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4412a676c88..616236e6343f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -422,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	zone->present_pages += onlined_pages;
 	zone->zone_pgdat->node_present_pages += onlined_pages;
 
+	zone_pcp_update(zone);
 	setup_per_zone_wmarks();
 	calculate_zone_inactive_ratio(zone);
 	if (onlined_pages) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f4e929e356db..1a3a893ef50e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3142,6 +3142,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	return 0;
 }
 
+static int __zone_pcp_update(void *data)
+{
+	struct zone *zone = data;
+	int cpu;
+	unsigned long batch = zone_batchsize(zone), flags;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		struct per_cpu_pageset *pset;
+		struct per_cpu_pages *pcp;
+
+		pset = zone_pcp(zone, cpu);
+		pcp = &pset->pcp;
+
+		local_irq_save(flags);
+		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+		setup_pageset(pset, batch);
+		local_irq_restore(flags);
+	}
+	return 0;
+}
+
+void zone_pcp_update(struct zone *zone)
+{
+	stop_machine(__zone_pcp_update, zone, NULL);
+}
+
 static __meminit void zone_pcp_init(struct zone *zone)
 {
 	int cpu;
-- 
cgit v1.2.3-71-gd317


From e8c5c8249878fb6564125680a1d15e06adbd5639 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Mon, 21 Sep 2009 17:01:22 -0700
Subject: hugetlb: balance freeing of huge pages across nodes

Free huges pages from nodes in round robin fashion in an attempt to keep
[persistent a.k.a static] hugepages balanced across nodes

New function free_pool_huge_page() is modeled on and performs roughly the
inverse of alloc_fresh_huge_page().  Replaces dequeue_huge_page() which
now has no callers, so this patch removes it.

Helper function hstate_next_node_to_free() uses new hstate member
next_to_free_nid to distribute "frees" across all nodes with huge pages.

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |   3 +-
 mm/hugetlb.c            | 132 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 88 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5cbc620bdfe0..16cdb75a543a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -185,7 +185,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 #define HSTATE_NAME_LEN 32
 /* Defines one hugetlb page size */
 struct hstate {
-	int hugetlb_next_nid;
+	int next_nid_to_alloc;
+	int next_nid_to_free;
 	unsigned int order;
 	unsigned long mask;
 	unsigned long max_huge_pages;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d63634777..38dab5586827 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(struct hstate *h)
-{
-	int nid;
-	struct page *page = NULL;
-
-	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
-		if (!list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-			break;
-		}
-	}
-	return page;
-}
-
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 
 /*
  * Use a helper variable to find the next node and then
- * copy it back to hugetlb_next_nid afterwards:
+ * copy it back to next_nid_to_alloc afterwards:
  * otherwise there's a window in which a racer might
  * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
  * But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  * if we just successfully allocated a hugepage so that
  * the next caller gets hugepages on the next node.
  */
-static int hstate_next_node(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h)
 {
 	int next_nid;
-	next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+	next_nid = next_node(h->next_nid_to_alloc, node_online_map);
 	if (next_nid == MAX_NUMNODES)
 		next_nid = first_node(node_online_map);
-	h->hugetlb_next_nid = next_nid;
+	h->next_nid_to_alloc = next_nid;
 	return next_nid;
 }
 
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	int next_nid;
 	int ret = 0;
 
-	start_nid = h->hugetlb_next_nid;
+	start_nid = h->next_nid_to_alloc;
+	next_nid = start_nid;
 
 	do {
-		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
+		page = alloc_fresh_huge_page_node(h, next_nid);
 		if (page)
 			ret = 1;
-		next_nid = hstate_next_node(h);
-	} while (!page && h->hugetlb_next_nid != start_nid);
+		next_nid = hstate_next_node_to_alloc(h);
+	} while (!page && next_nid != start_nid);
 
 	if (ret)
 		count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,52 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	return ret;
 }
 
+/*
+ * helper for free_pool_huge_page() - find next node
+ * from which to free a huge page
+ */
+static int hstate_next_node_to_free(struct hstate *h)
+{
+	int next_nid;
+	next_nid = next_node(h->next_nid_to_free, node_online_map);
+	if (next_nid == MAX_NUMNODES)
+		next_nid = first_node(node_online_map);
+	h->next_nid_to_free = next_nid;
+	return next_nid;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h)
+{
+	int start_nid;
+	int next_nid;
+	int ret = 0;
+
+	start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		if (!list_empty(&h->hugepage_freelists[next_nid])) {
+			struct page *page =
+				list_entry(h->hugepage_freelists[next_nid].next,
+					  struct page, lru);
+			list_del(&page->lru);
+			h->free_huge_pages--;
+			h->free_huge_pages_node[next_nid]--;
+			update_and_free_page(h, page);
+			ret = 1;
+		}
+		next_nid = hstate_next_node_to_free(h);
+	} while (!ret && next_nid != start_nid);
+
+	return ret;
+}
+
 static struct page *alloc_buddy_huge_page(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
 {
@@ -1008,7 +1037,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		void *addr;
 
 		addr = __alloc_bootmem_node_nopanic(
-				NODE_DATA(h->hugetlb_next_nid),
+				NODE_DATA(h->next_nid_to_alloc),
 				huge_page_size(h), huge_page_size(h), 0);
 
 		if (addr) {
@@ -1020,7 +1049,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 			m = addr;
 			goto found;
 		}
-		hstate_next_node(h);
+		hstate_next_node_to_alloc(h);
 		nr_nodes--;
 	}
 	return 0;
@@ -1141,31 +1170,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  */
 static int adjust_pool_surplus(struct hstate *h, int delta)
 {
-	static int prev_nid;
-	int nid = prev_nid;
+	int start_nid, next_nid;
 	int ret = 0;
 
 	VM_BUG_ON(delta != -1 && delta != 1);
-	do {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
 
-		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !h->surplus_huge_pages_node[nid])
-			continue;
-		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+	if (delta < 0)
+		start_nid = h->next_nid_to_alloc;
+	else
+		start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		int nid = next_nid;
+		if (delta < 0)  {
+			next_nid = hstate_next_node_to_alloc(h);
+			/*
+			 * To shrink on this node, there must be a surplus page
+			 */
+			if (!h->surplus_huge_pages_node[nid])
+				continue;
+		}
+		if (delta > 0) {
+			next_nid = hstate_next_node_to_free(h);
+			/*
+			 * Surplus cannot exceed the total number of pages
+			 */
+			if (h->surplus_huge_pages_node[nid] >=
 						h->nr_huge_pages_node[nid])
-			continue;
+				continue;
+		}
 
 		h->surplus_huge_pages += delta;
 		h->surplus_huge_pages_node[nid] += delta;
 		ret = 1;
 		break;
-	} while (nid != prev_nid);
+	} while (next_nid != start_nid);
 
-	prev_nid = nid;
 	return ret;
 }
 
@@ -1227,10 +1268,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	min_count = max(count, min_count);
 	try_to_free_low(h, min_count);
 	while (min_count < persistent_huge_pages(h)) {
-		struct page *page = dequeue_huge_page(h);
-		if (!page)
+		if (!free_pool_huge_page(h))
 			break;
-		update_and_free_page(h, page);
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1481,8 @@ void __init hugetlb_add_hstate(unsigned order)
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
-	h->hugetlb_next_nid = first_node(node_online_map);
+	h->next_nid_to_alloc = first_node(node_online_map);
+	h->next_nid_to_free = first_node(node_online_map);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
-- 
cgit v1.2.3-71-gd317


From c6a7f5728a1db45d30df55a01adc130b4ab0327c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 21 Sep 2009 17:01:32 -0700
Subject: mm: oom analysis: Show kernel stack usage in /proc/meminfo and OOM
 log output

The amount of memory allocated to kernel stacks can become significant and
cause OOM conditions.  However, we do not display the amount of memory
consumed by stacks.

Add code to display the amount of memory used for stacks in /proc/meminfo.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  3 +++
 fs/proc/meminfo.c      |  2 ++
 include/linux/mmzone.h |  3 ++-
 kernel/fork.c          | 11 +++++++++++
 mm/page_alloc.c        |  3 +++
 mm/vmstat.c            |  1 +
 6 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 91d4087b4039..b560c17f6d4e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -85,6 +85,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 		       "Node %d FilePages:      %8lu kB\n"
 		       "Node %d Mapped:         %8lu kB\n"
 		       "Node %d AnonPages:      %8lu kB\n"
+		       "Node %d KernelStack:    %8lu kB\n"
 		       "Node %d PageTables:     %8lu kB\n"
 		       "Node %d NFS_Unstable:   %8lu kB\n"
 		       "Node %d Bounce:         %8lu kB\n"
@@ -116,6 +117,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
+		       nid, node_page_state(nid, NR_KERNEL_STACK) *
+				THREAD_SIZE / 1024,
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..1fc588f430e4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -84,6 +84,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"Slab:           %8lu kB\n"
 		"SReclaimable:   %8lu kB\n"
 		"SUnreclaim:     %8lu kB\n"
+		"KernelStack:    %8lu kB\n"
 		"PageTables:     %8lu kB\n"
 #ifdef CONFIG_QUICKLIST
 		"Quicklists:     %8lu kB\n"
@@ -128,6 +129,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
 		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
 		K(global_page_state(NR_PAGETABLE)),
 #ifdef CONFIG_QUICKLIST
 		K(quicklist_total_size()),
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 889598537370..d9335b8de84a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -94,10 +94,11 @@ enum zone_stat_item {
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
+	NR_KERNEL_STACK,
+	/* Second 128 byte cacheline */
 	NR_UNSTABLE_NFS,	/* NFS unstable pages */
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
-	/* Second 128 byte cacheline */
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
diff --git a/kernel/fork.c b/kernel/fork.c
index 2cebfb23b0b8..d4638c8cc19e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -136,9 +136,17 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+static void account_kernel_stack(struct thread_info *ti, int account)
+{
+	struct zone *zone = page_zone(virt_to_page(ti));
+
+	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+}
+
 void free_task(struct task_struct *tsk)
 {
 	prop_local_destroy_single(&tsk->dirties);
+	account_kernel_stack(tsk->stack, -1);
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
@@ -253,6 +261,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	tsk->btrace_seq = 0;
 #endif
 	tsk->splice_pipe = NULL;
+
+	account_kernel_stack(ti, 1);
+
 	return tsk;
 
 out:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 494c09196c30..4e050f325ebd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2177,6 +2177,7 @@ void show_free_areas(void)
 			" mapped:%lukB"
 			" slab_reclaimable:%lukB"
 			" slab_unreclaimable:%lukB"
+			" kernel_stack:%lukB"
 			" pagetables:%lukB"
 			" unstable:%lukB"
 			" bounce:%lukB"
@@ -2201,6 +2202,8 @@ void show_free_areas(void)
 			K(zone_page_state(zone, NR_FILE_MAPPED)),
 			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
 			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
+			zone_page_state(zone, NR_KERNEL_STACK) *
+				THREAD_SIZE / 1024,
 			K(zone_page_state(zone, NR_PAGETABLE)),
 			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
 			K(zone_page_state(zone, NR_BOUNCE)),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706e..ceda39b63d7e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -639,6 +639,7 @@ static const char * const vmstat_text[] = {
 	"nr_slab_reclaimable",
 	"nr_slab_unreclaimable",
 	"nr_page_table_pages",
+	"nr_kernel_stack",
 	"nr_unstable",
 	"nr_bounce",
 	"nr_vmscan_write",
-- 
cgit v1.2.3-71-gd317


From 4b02108ac1b3354a22b0d83c684797692efdc395 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 21 Sep 2009 17:01:33 -0700
Subject: mm: oom analysis: add shmem vmstat

Recently we encountered OOM problems due to memory use of the GEM cache.
Generally a large amuont of Shmem/Tmpfs pages tend to create a memory
shortage problem.

We often use the following calculation to determine the amount of shmem
pages:

shmem = NR_ACTIVE_ANON + NR_INACTIVE_ANON - NR_ANON_PAGES

however the expression does not consider isolated and mlocked pages.

This patch adds explicit accounting for pages used by shmem and tmpfs.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 2 ++
 fs/proc/meminfo.c      | 2 ++
 include/linux/mmzone.h | 1 +
 mm/filemap.c           | 4 ++++
 mm/migrate.c           | 5 ++++-
 mm/page_alloc.c        | 5 ++++-
 mm/vmstat.c            | 2 +-
 7 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index b560c17f6d4e..1fe5536d404f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -85,6 +85,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 		       "Node %d FilePages:      %8lu kB\n"
 		       "Node %d Mapped:         %8lu kB\n"
 		       "Node %d AnonPages:      %8lu kB\n"
+		       "Node %d Shmem:          %8lu kB\n"
 		       "Node %d KernelStack:    %8lu kB\n"
 		       "Node %d PageTables:     %8lu kB\n"
 		       "Node %d NFS_Unstable:   %8lu kB\n"
@@ -117,6 +118,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
+		       nid, K(node_page_state(nid, NR_SHMEM)),
 		       nid, node_page_state(nid, NR_KERNEL_STACK) *
 				THREAD_SIZE / 1024,
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 1fc588f430e4..171e052c07b3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -81,6 +81,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"Writeback:      %8lu kB\n"
 		"AnonPages:      %8lu kB\n"
 		"Mapped:         %8lu kB\n"
+		"Shmem:          %8lu kB\n"
 		"Slab:           %8lu kB\n"
 		"SReclaimable:   %8lu kB\n"
 		"SUnreclaim:     %8lu kB\n"
@@ -125,6 +126,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_SHMEM)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d9335b8de84a..b3583b93b77e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -100,6 +100,7 @@ enum zone_stat_item {
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
+	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
 	NUMA_MISS,		/* allocated in non intended node */
diff --git a/mm/filemap.c b/mm/filemap.c
index dd51c68e2b86..bcc7372aebbc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,6 +119,8 @@ void __remove_from_page_cache(struct page *page)
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
+	if (PageSwapBacked(page))
+		__dec_zone_page_state(page, NR_SHMEM);
 	BUG_ON(page_mapped(page));
 
 	/*
@@ -431,6 +433,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		if (likely(!error)) {
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
+			if (PageSwapBacked(page))
+				__inc_zone_page_state(page, NR_SHMEM);
 			spin_unlock_irq(&mapping->tree_lock);
 		} else {
 			page->mapping = NULL;
diff --git a/mm/migrate.c b/mm/migrate.c
index 0edeac91348d..37143b924484 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -312,7 +312,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-
+	if (PageSwapBacked(page)) {
+		__dec_zone_page_state(page, NR_SHMEM);
+		__inc_zone_page_state(newpage, NR_SHMEM);
+	}
 	spin_unlock_irq(&mapping->tree_lock);
 
 	return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e050f325ebd..e50c22545b8f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2139,7 +2139,7 @@ void show_free_areas(void)
 		" unevictable:%lu"
 		" dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
 		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
-		" mapped:%lu pagetables:%lu bounce:%lu\n",
+		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
 		global_page_state(NR_ACTIVE_ANON),
 		global_page_state(NR_ACTIVE_FILE),
 		global_page_state(NR_INACTIVE_ANON),
@@ -2153,6 +2153,7 @@ void show_free_areas(void)
 		global_page_state(NR_SLAB_RECLAIMABLE),
 		global_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_page_state(NR_FILE_MAPPED),
+		global_page_state(NR_SHMEM),
 		global_page_state(NR_PAGETABLE),
 		global_page_state(NR_BOUNCE));
 
@@ -2175,6 +2176,7 @@ void show_free_areas(void)
 			" dirty:%lukB"
 			" writeback:%lukB"
 			" mapped:%lukB"
+			" shmem:%lukB"
 			" slab_reclaimable:%lukB"
 			" slab_unreclaimable:%lukB"
 			" kernel_stack:%lukB"
@@ -2200,6 +2202,7 @@ void show_free_areas(void)
 			K(zone_page_state(zone, NR_FILE_DIRTY)),
 			K(zone_page_state(zone, NR_WRITEBACK)),
 			K(zone_page_state(zone, NR_FILE_MAPPED)),
+			K(zone_page_state(zone, NR_SHMEM)),
 			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
 			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
 			zone_page_state(zone, NR_KERNEL_STACK) *
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ceda39b63d7e..7214a4511257 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -644,7 +644,7 @@ static const char * const vmstat_text[] = {
 	"nr_bounce",
 	"nr_vmscan_write",
 	"nr_writeback_temp",
-
+	"nr_shmem",
 #ifdef CONFIG_NUMA
 	"numa_hit",
 	"numa_miss",
-- 
cgit v1.2.3-71-gd317


From a731286de62294b63d8ceb3c5914ac52cc17e690 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 21 Sep 2009 17:01:37 -0700
Subject: mm: vmstat: add isolate pages

If the system is running a heavy load of processes then concurrent reclaim
can isolate a large number of pages from the LRU. /proc/vmstat and the
output generated for an OOM do not show how many pages were isolated.

This has been observed during process fork bomb testing (mstctl11 in LTP).

This patch shows the information about isolated pages.

Reproduced via:

-----------------------
% ./hackbench 140 process 1000
   => OOM occur

active_anon:146 inactive_anon:0 isolated_anon:49245
 active_file:79 inactive_file:18 isolated_file:113
 unevictable:0 dirty:0 writeback:0 unstable:0 buffer:39
 free:370 slab_reclaimable:309 slab_unreclaimable:5492
 mapped:53 shmem:15 pagetables:28140 bounce:0

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  2 ++
 mm/migrate.c           | 11 +++++++++++
 mm/page_alloc.c        | 12 +++++++++---
 mm/vmscan.c            | 12 +++++++++++-
 mm/vmstat.c            |  2 ++
 5 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b3583b93b77e..9c50309b30a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -100,6 +100,8 @@ enum zone_stat_item {
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
+	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
+	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
diff --git a/mm/migrate.c b/mm/migrate.c
index 37143b924484..b535a2c1656c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l)
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				    !!page_is_file_cache(page));
 		putback_lru_page(page);
 		count++;
 	}
@@ -698,6 +700,8 @@ unlock:
  		 * restored.
  		 */
  		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				    !!page_is_file_cache(page));
 		putback_lru_page(page);
 	}
 
@@ -742,6 +746,13 @@ int migrate_pages(struct list_head *from,
 	struct page *page2;
 	int swapwrite = current->flags & PF_SWAPWRITE;
 	int rc;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	list_for_each_entry(page, from, lru)
+		__inc_zone_page_state(page, NR_ISOLATED_ANON +
+				      !!page_is_file_cache(page));
+	local_irq_restore(flags);
 
 	if (!swapwrite)
 		current->flags |= PF_SWAPWRITE;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b6d0d09557ef..afda8fd16484 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2134,16 +2134,18 @@ void show_free_areas(void)
 		}
 	}
 
-	printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
-		" inactive_file:%lu"
+	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
 		" unevictable:%lu"
 		" dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
 		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
 		global_page_state(NR_ACTIVE_ANON),
-		global_page_state(NR_ACTIVE_FILE),
 		global_page_state(NR_INACTIVE_ANON),
+		global_page_state(NR_ISOLATED_ANON),
+		global_page_state(NR_ACTIVE_FILE),
 		global_page_state(NR_INACTIVE_FILE),
+		global_page_state(NR_ISOLATED_FILE),
 		global_page_state(NR_UNEVICTABLE),
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
@@ -2171,6 +2173,8 @@ void show_free_areas(void)
 			" active_file:%lukB"
 			" inactive_file:%lukB"
 			" unevictable:%lukB"
+			" isolated(anon):%lukB"
+			" isolated(file):%lukB"
 			" present:%lukB"
 			" mlocked:%lukB"
 			" dirty:%lukB"
@@ -2197,6 +2201,8 @@ void show_free_areas(void)
 			K(zone_page_state(zone, NR_ACTIVE_FILE)),
 			K(zone_page_state(zone, NR_INACTIVE_FILE)),
 			K(zone_page_state(zone, NR_UNEVICTABLE)),
+			K(zone_page_state(zone, NR_ISOLATED_ANON)),
+			K(zone_page_state(zone, NR_ISOLATED_FILE)),
 			K(zone->present_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_FILE_DIRTY)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d86a91f8c16b..75c29974e878 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1072,6 +1072,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_active;
 		unsigned int count[NR_LRU_LISTS] = { 0, };
 		int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
+		unsigned long nr_anon;
+		unsigned long nr_file;
 
 		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
 			     &page_list, &nr_scan, sc->order, mode,
@@ -1102,6 +1104,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
 						-count[LRU_INACTIVE_ANON]);
 
+		nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+		nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+		__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
+		__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
 
 		reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
 		reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
@@ -1169,6 +1175,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 				spin_lock_irq(&zone->lru_lock);
 			}
 		}
+		__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+		__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
   	} while (nr_scanned < max_scan);
 
 done:
@@ -1279,6 +1288,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
 	else
 		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 
 	while (!list_empty(&l_hold)) {
@@ -1329,7 +1339,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 						LRU_ACTIVE + file * LRU_FILE);
 	move_active_pages_to_lru(zone, &l_inactive,
 						LRU_BASE   + file * LRU_FILE);
-
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7214a4511257..c81321f9feec 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -644,6 +644,8 @@ static const char * const vmstat_text[] = {
 	"nr_bounce",
 	"nr_vmscan_write",
 	"nr_writeback_temp",
+	"nr_isolated_anon",
+	"nr_isolated_file",
 	"nr_shmem",
 #ifdef CONFIG_NUMA
 	"numa_hit",
-- 
cgit v1.2.3-71-gd317


From 5a2ae913f5229d6e1d4a666f0477350789d5128e Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 21 Sep 2009 17:01:39 -0700
Subject: mm: remove __{add,sub}_zone_page_state()

__add_zone_page_state() and __sub_zone_page_state() are unused.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 81a97cf8f0a0..d7f577f49d16 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -210,11 +210,6 @@ extern void zone_statistics(struct zone *, struct zone *);
 
 #endif /* CONFIG_NUMA */
 
-#define __add_zone_page_state(__z, __i, __d)	\
-		__mod_zone_page_state(__z, __i, __d)
-#define __sub_zone_page_state(__z, __i, __d)	\
-		__mod_zone_page_state(__z, __i,-(__d))
-
 #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
 #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
 
-- 
cgit v1.2.3-71-gd317


From adea02a1bea71a508da32c04d715485a1fe62029 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Mon, 21 Sep 2009 17:01:42 -0700
Subject: mm: count only reclaimable lru pages

global_lru_pages() / zone_lru_pages() can be used in two ways:
- to estimate max reclaimable pages in determine_dirtyable_memory()
- to calculate the slab scan ratio

When swap is full or not present, the anon lru lists are not reclaimable
and also won't be scanned.  So the anon pages shall not be counted in both
usage scenarios.  Also rename to _reclaimable_pages: now they are counting
the possibly reclaimable lru pages.

It can greatly (and correctly) increase the slab scan rate under high
memory pressure (when most file pages have been reclaimed and swap is
full/absent), thus reduce false OOM kills.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: David Howells <dhowells@redhat.com>
Cc: "Li, Ming Chun" <macli@brc.ubc.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 11 ++---------
 mm/page-writeback.c    |  5 +++--
 mm/vmscan.c            | 50 +++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 44 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index d7f577f49d16..2d0f222388a8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -166,15 +166,8 @@ static inline unsigned long zone_page_state(struct zone *zone,
 	return x;
 }
 
-extern unsigned long global_lru_pages(void);
-
-static inline unsigned long zone_lru_pages(struct zone *zone)
-{
-	return (zone_page_state(zone, NR_ACTIVE_ANON)
-		+ zone_page_state(zone, NR_ACTIVE_FILE)
-		+ zone_page_state(zone, NR_INACTIVE_ANON)
-		+ zone_page_state(zone, NR_INACTIVE_FILE));
-}
+extern unsigned long global_reclaimable_pages(void);
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
 
 #ifdef CONFIG_NUMA
 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d1ba46441053..5f378dd58802 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -380,7 +380,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 		struct zone *z =
 			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
 
-		x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
+		x += zone_page_state(z, NR_FREE_PAGES) +
+		     zone_reclaimable_pages(z);
 	}
 	/*
 	 * Make sure that the number of highmem pages is never larger
@@ -404,7 +405,7 @@ unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 
-	x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
 
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f90b76086ffa..208071c48bf2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1734,7 +1734,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
 
-			lru_pages += zone_lru_pages(zone);
+			lru_pages += zone_reclaimable_pages(zone);
 		}
 	}
 
@@ -1951,7 +1951,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
-			lru_pages += zone_lru_pages(zone);
+			lru_pages += zone_reclaimable_pages(zone);
 		}
 
 		/*
@@ -1995,7 +1995,7 @@ loop_again:
 			if (zone_is_all_unreclaimable(zone))
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-						(zone_lru_pages(zone) * 6))
+					(zone_reclaimable_pages(zone) * 6))
 					zone_set_flag(zone,
 						      ZONE_ALL_UNRECLAIMABLE);
 			/*
@@ -2162,12 +2162,39 @@ void wakeup_kswapd(struct zone *zone, int order)
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-unsigned long global_lru_pages(void)
+/*
+ * The reclaimable count would be mostly accurate.
+ * The less reclaimable pages may be
+ * - mlocked pages, which will be moved to unevictable list when encountered
+ * - mapped pages, which may require several travels to be reclaimed
+ * - dirty pages, which is not "instantly" reclaimable
+ */
+unsigned long global_reclaimable_pages(void)
 {
-	return global_page_state(NR_ACTIVE_ANON)
-		+ global_page_state(NR_ACTIVE_FILE)
-		+ global_page_state(NR_INACTIVE_ANON)
-		+ global_page_state(NR_INACTIVE_FILE);
+	int nr;
+
+	nr = global_page_state(NR_ACTIVE_FILE) +
+	     global_page_state(NR_INACTIVE_FILE);
+
+	if (nr_swap_pages > 0)
+		nr += global_page_state(NR_ACTIVE_ANON) +
+		      global_page_state(NR_INACTIVE_ANON);
+
+	return nr;
+}
+
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+	int nr;
+
+	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+	     zone_page_state(zone, NR_INACTIVE_FILE);
+
+	if (nr_swap_pages > 0)
+		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+		      zone_page_state(zone, NR_INACTIVE_ANON);
+
+	return nr;
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -2239,7 +2266,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 	current->reclaim_state = &reclaim_state;
 
-	lru_pages = global_lru_pages();
+	lru_pages = global_reclaimable_pages();
 	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
@@ -2281,7 +2308,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
-					global_lru_pages());
+				    global_reclaimable_pages());
 			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
 			if (sc.nr_reclaimed >= nr_pages)
 				goto out;
@@ -2298,7 +2325,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	if (!sc.nr_reclaimed) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask,
+				    global_reclaimable_pages());
 			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
 		} while (sc.nr_reclaimed < nr_pages &&
 				reclaim_state.reclaimed_slab > 0);
-- 
cgit v1.2.3-71-gd317


From 451ea25da71590361c71bf3044c55b870a887d53 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 21 Sep 2009 17:01:48 -0700
Subject: mm: perform non-atomic test-clear of PG_mlocked on free

By the time PG_mlocked is cleared in the page freeing path, nobody else is
looking at our page->flags anymore.

It is thus safe to make the test-and-clear non-atomic and thereby removing
an unnecessary and expensive operation from a hotpath.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 12 +++++++++---
 mm/page_alloc.c            |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2b87acfc5f87..d07c0bb2203a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -158,6 +158,9 @@ static inline int TestSetPage##uname(struct page *page)			\
 static inline int TestClearPage##uname(struct page *page)		\
 		{ return test_and_clear_bit(PG_##lname, &page->flags); }
 
+#define __TESTCLEARFLAG(uname, lname)					\
+static inline int __TestClearPage##uname(struct page *page)		\
+		{ return __test_and_clear_bit(PG_##lname, &page->flags); }
 
 #define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)		\
 	SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
@@ -184,6 +187,9 @@ static inline void __ClearPage##uname(struct page *page) {  }
 #define TESTCLEARFLAG_FALSE(uname)					\
 static inline int TestClearPage##uname(struct page *page) { return 0; }
 
+#define __TESTCLEARFLAG_FALSE(uname)					\
+static inline int __TestClearPage##uname(struct page *page) { return 0; }
+
 struct page;	/* forward declaration */
 
 TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
@@ -250,11 +256,11 @@ PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
 #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 #define MLOCK_PAGES 1
 PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
-	TESTSCFLAG(Mlocked, mlocked)
+	TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
 #else
 #define MLOCK_PAGES 0
-PAGEFLAG_FALSE(Mlocked)
-	SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
+PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked)
+	TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 81926c7ef6f0..9242d13f4ff3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -557,7 +557,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	unsigned long flags;
 	int i;
 	int bad = 0;
-	int wasMlocked = TestClearPageMlocked(page);
+	int wasMlocked = __TestClearPageMlocked(page);
 
 	kmemcheck_free_shadow(page, order);
 
@@ -1026,7 +1026,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
-	int wasMlocked = TestClearPageMlocked(page);
+	int wasMlocked = __TestClearPageMlocked(page);
 
 	kmemcheck_free_shadow(page, 0);
 
-- 
cgit v1.2.3-71-gd317


From 828502d30073036a486d96b1fe051e0f08b6df83 Mon Sep 17 00:00:00 2001
From: Izik Eidus <ieidus@redhat.com>
Date: Mon, 21 Sep 2009 17:01:51 -0700
Subject: ksm: add mmu_notifier set_pte_at_notify()

KSM is a linux driver that allows dynamicly sharing identical memory pages
between one or more processes.

Unlike tradtional page sharing that is made at the allocation of the
memory, ksm do it dynamicly after the memory was created.  Memory is
periodically scanned; identical pages are identified and merged.

The sharing is made in a transparent way to the processes that use it.

Ksm is highly important for hypervisors (kvm), where in production
enviorments there might be many copys of the same data data among the host
memory.  This kind of data can be: similar kernels, librarys, cache, and
so on.

Even that ksm was wrote for kvm, any userspace application that want to
use it to share its data can try it.

Ksm may be useful for any application that might have similar (page
aligment) data strctures among the memory, ksm will find this data merge
it to one copy, and even if it will be changed and thereforew copy on
writed, ksm will merge it again as soon as it will be identical again.

Another reason to consider using ksm is the fact that it might simplify
alot the userspace code of application that want to use shared private
data, instead that the application will mange shared area, ksm will do
this for the application, and even write to this data will be allowed
without any synchinization acts from the application.

Ksm was designed to be a loadable module that doesn't change the VM code
of linux.

This patch:

The set_pte_at_notify() macro allows setting a pte in the shadow page
table directly, instead of flushing the shadow page table entry and then
getting vmexit to set it.  It uses a new change_pte() callback to do so.

set_pte_at_notify() is an optimization for kvm, and other users of
mmu_notifiers, for COW pages.  It is useful for kvm when ksm is used,
because it allows kvm not to have to receive vmexit and only then map the
ksm page into the shadow page table, but instead map it directly at the
same time as Linux maps the page into the host page table.

Users of mmu_notifiers who don't implement new mmu_notifier_change_pte()
callback will just receive the mmu_notifier_invalidate_page() callback.

Signed-off-by: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 34 ++++++++++++++++++++++++++++++++++
 mm/memory.c                  |  9 +++++++--
 mm/mmu_notifier.c            | 20 ++++++++++++++++++++
 3 files changed, 61 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b77486d152cd..4e02ee2b071e 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -61,6 +61,15 @@ struct mmu_notifier_ops {
 				 struct mm_struct *mm,
 				 unsigned long address);
 
+	/*
+	 * change_pte is called in cases that pte mapping to page is changed:
+	 * for example, when ksm remaps pte to point to a new shared page.
+	 */
+	void (*change_pte)(struct mmu_notifier *mn,
+			   struct mm_struct *mm,
+			   unsigned long address,
+			   pte_t pte);
+
 	/*
 	 * Before this is invoked any secondary MMU is still ok to
 	 * read/write to the page previously pointed to by the Linux
@@ -154,6 +163,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern void __mmu_notifier_change_pte(struct mm_struct *mm,
+				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address);
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -175,6 +186,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline void mmu_notifier_change_pte(struct mm_struct *mm,
+					   unsigned long address, pte_t pte)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_change_pte(mm, address, pte);
+}
+
 static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -236,6 +254,16 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	__young;							\
 })
 
+#define set_pte_at_notify(__mm, __address, __ptep, __pte)		\
+({									\
+	struct mm_struct *___mm = __mm;					\
+	unsigned long ___address = __address;				\
+	pte_t ___pte = __pte;						\
+									\
+	set_pte_at(___mm, ___address, __ptep, ___pte);			\
+	mmu_notifier_change_pte(___mm, ___address, ___pte);		\
+})
+
 #else /* CONFIG_MMU_NOTIFIER */
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -248,6 +276,11 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline void mmu_notifier_change_pte(struct mm_struct *mm,
+					   unsigned long address, pte_t pte)
+{
+}
+
 static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -273,6 +306,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define ptep_clear_flush_notify ptep_clear_flush
+#define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
 
diff --git a/mm/memory.c b/mm/memory.c
index e8f63d9961ea..368561f32009 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2115,9 +2115,14 @@ gotten:
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush_notify(vma, address, page_table);
+		ptep_clear_flush(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
-		set_pte_at(mm, address, page_table, entry);
+		/*
+		 * We call the notify macro here because, when using secondary
+		 * mmu page tables (such as kvm shadow page tables), we want the
+		 * new page to be mapped directly into the secondary page table.
+		 */
+		set_pte_at_notify(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		if (old_page) {
 			/*
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250bee..7e33f2cb3c77 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
+void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
+			       pte_t pte)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->change_pte)
+			mn->ops->change_pte(mn, mm, address, pte);
+		/*
+		 * Some drivers don't have change_pte,
+		 * so we must call invalidate_page in that case.
+		 */
+		else if (mn->ops->invalidate_page)
+			mn->ops->invalidate_page(mn, mm, address);
+	}
+	rcu_read_unlock();
+}
+
 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address)
 {
-- 
cgit v1.2.3-71-gd317


From f8af4da3b4c14e7267c4ffb952079af3912c51c5 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:01:57 -0700
Subject: ksm: the mm interface to ksm

This patch presents the mm interface to a dummy version of ksm.c, for
better scrutiny of that interface: the real ksm.c follows later.

When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and
MADV_UNMERGEABLE with EINVAL, since that seems more helpful than
pretending that they can be serviced.  But when CONFIG_KSM=y, accept them
even if KSM is not currently running, and even on areas which KSM will not
touch (e.g.  hugetlb or shared file or special driver mappings).

Like other madvices, report ENOMEM despite success if any area in the
range is unmapped, and use EAGAIN to report out of memory.

Define vma flag VM_MERGEABLE to identify an area on which KSM may try
merging pages: leave it to ksm_madvise() to decide whether to set it.
Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain
VM_MERGEABLE areas, to minimize callouts when forking or exiting.

Based upon earlier patches by Chris Wright and Izik Eidus.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Izik Eidus <ieidus@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ksm.h   | 50 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h    |  1 +
 include/linux/sched.h |  7 +++++++
 kernel/fork.c         |  8 +++++++-
 mm/Kconfig            | 11 ++++++++++
 mm/Makefile           |  1 +
 mm/ksm.c              | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/madvise.c          | 14 +++++++++++++
 8 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/ksm.h
 create mode 100644 mm/ksm.c

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
new file mode 100644
index 000000000000..eb2a448981ee
--- /dev/null
+++ b/include/linux/ksm.h
@@ -0,0 +1,50 @@
+#ifndef __LINUX_KSM_H
+#define __LINUX_KSM_H
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork().
+ */
+
+#include <linux/bitops.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_KSM
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags);
+int __ksm_enter(struct mm_struct *mm);
+void __ksm_exit(struct mm_struct *mm);
+
+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+		return __ksm_enter(mm);
+	return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+		__ksm_exit(mm);
+}
+#else  /* !CONFIG_KSM */
+
+static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags)
+{
+	return 0;
+}
+
+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+}
+#endif /* !CONFIG_KSM */
+
+#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d3c8ae7c8015..d808cf832c4d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -103,6 +103,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
 #define VM_SAO		0x20000000	/* Strong Access Ordering (powerpc) */
 #define VM_PFN_AT_MMAP	0x40000000	/* PFNMAP vma that is fully mapped at mmap time */
+#define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
 
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8fe351c3914a..8f3e63cb33a6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -434,7 +434,9 @@ extern int get_dumpable(struct mm_struct *mm);
 /* dumpable bits */
 #define MMF_DUMPABLE      0  /* core dump is permitted */
 #define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
+
 #define MMF_DUMPABLE_BITS 2
+#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
 
 /* coredump filter bits */
 #define MMF_DUMP_ANON_PRIVATE	2
@@ -444,6 +446,7 @@ extern int get_dumpable(struct mm_struct *mm);
 #define MMF_DUMP_ELF_HEADERS	6
 #define MMF_DUMP_HUGETLB_PRIVATE 7
 #define MMF_DUMP_HUGETLB_SHARED  8
+
 #define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
 #define MMF_DUMP_FILTER_BITS	7
 #define MMF_DUMP_FILTER_MASK \
@@ -457,6 +460,10 @@ extern int get_dumpable(struct mm_struct *mm);
 #else
 # define MMF_DUMP_MASK_DEFAULT_ELF	0
 #endif
+					/* leave room for more dump flags */
+#define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
+
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
 struct sighand_struct {
 	atomic_t		count;
diff --git a/kernel/fork.c b/kernel/fork.c
index d4638c8cc19e..73a442b7be6d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
 #include <linux/ftrace.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
+#include <linux/ksm.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
@@ -299,6 +300,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
 	pprev = &mm->mmap;
+	retval = ksm_fork(mm, oldmm);
+	if (retval)
+		goto out;
 
 	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 		struct file *file;
@@ -435,7 +439,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
-	mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
+	mm->flags = (current->mm) ?
+		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
@@ -496,6 +501,7 @@ void mmput(struct mm_struct *mm)
 
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
+		ksm_exit(mm);
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa519f52e18..c0b6afa178a1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -214,6 +214,17 @@ config HAVE_MLOCKED_PAGE_BIT
 config MMU_NOTIFIER
 	bool
 
+config KSM
+	bool "Enable KSM for page merging"
+	depends on MMU
+	help
+	  Enable Kernel Samepage Merging: KSM periodically scans those areas
+	  of an application's address space that an app has advised may be
+	  mergeable.  When it finds pages of identical content, it replaces
+	  the many instances by a single resident page with that content, so
+	  saving memory until one or another app needs to modify the content.
+	  Recommended for use with KVM, or with other duplicative applications.
+
 config DEFAULT_MMAP_MIN_ADDR
         int "Low address space to protect from user allocation"
         default 4096
diff --git a/mm/Makefile b/mm/Makefile
index ea4b18bd3960..a63bf59a0c77 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 000000000000..8b76008fcd32
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,56 @@
+/*
+ * Initial dummy version just to illustrate KSM's interface to other files.
+ */
+
+#include <linux/errno.h>
+#include <linux/mman.h>
+#include <linux/ksm.h>
+
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	switch (advice) {
+	case MADV_MERGEABLE:
+		/*
+		 * Be somewhat over-protective for now!
+		 */
+		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
+				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
+				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
+				 VM_MIXEDMAP  | VM_SAO))
+			return 0;		/* just ignore the advice */
+
+		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+			if (__ksm_enter(mm) < 0)
+				return -EAGAIN;
+
+		*vm_flags |= VM_MERGEABLE;
+		break;
+
+	case MADV_UNMERGEABLE:
+		if (!(*vm_flags & VM_MERGEABLE))
+			return 0;		/* just ignore the advice */
+
+		/* Unmerge any merged pages here */
+
+		*vm_flags &= ~VM_MERGEABLE;
+		break;
+	}
+
+	return 0;
+}
+
+int __ksm_enter(struct mm_struct *mm)
+{
+	/* Allocate a structure to track mm and link it into KSM's list */
+	set_bit(MMF_VM_MERGEABLE, &mm->flags);
+	return 0;
+}
+
+void __ksm_exit(struct mm_struct *mm)
+{
+	/* Unlink and free all KSM's structures which track this mm */
+	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+}
diff --git a/mm/madvise.c b/mm/madvise.c
index 66c31264f062..d9ae2067952e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
 #include <linux/mempolicy.h>
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
+#include <linux/ksm.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -63,6 +64,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
 		}
 		new_flags &= ~VM_DONTCOPY;
 		break;
+	case MADV_MERGEABLE:
+	case MADV_UNMERGEABLE:
+		error = ksm_madvise(vma, start, end, behavior, &new_flags);
+		if (error)
+			goto out;
+		break;
 	}
 
 	if (new_flags == vma->vm_flags) {
@@ -239,6 +246,10 @@ madvise_behavior_valid(int behavior)
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+#ifdef CONFIG_KSM
+	case MADV_MERGEABLE:
+	case MADV_UNMERGEABLE:
+#endif
 		return 1;
 
 	default:
@@ -273,6 +284,9 @@ madvise_behavior_valid(int behavior)
  *  MADV_DONTFORK - omit this area from child's address space when forking:
  *		typically, to avoid COWing pages pinned by get_user_pages().
  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
+ *		this area with pages of identical content from other such areas.
+ *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
  *
  * return values:
  *  zero    - success
-- 
cgit v1.2.3-71-gd317


From 21333b2b66b805a360641568588e5a0bb06d9d1f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:01:59 -0700
Subject: ksm: no debug in page_dup_rmap()

page_dup_rmap(), used on each mapped page when forking, was originally
just an inline atomic_inc of mapcount.  2.6.22 added CONFIG_DEBUG_VM
out-of-line checks to it, which would need to be ever-so-slightly
complicated to allow for the PageKsm() we're about to define.

But I think these checks never caught anything.  And if it's coding errors
we're worried about, such checks should be in page_remove_rmap() too, not
just when forking; whereas if it's pagetable corruption we're worried
about, then they shouldn't be limited to CONFIG_DEBUG_VM.

Oh, just revert page_dup_rmap() to an inline atomic_inc of mapcount.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Izik Eidus <ieidus@redhat.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  6 +-----
 mm/memory.c          |  2 +-
 mm/rmap.c            | 21 ---------------------
 3 files changed, 2 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf116d0dbf23..477841d29fce 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -71,14 +71,10 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
 
-#ifdef CONFIG_DEBUG_VM
-void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address);
-#else
-static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
+static inline void page_dup_rmap(struct page *page)
 {
 	atomic_inc(&page->_mapcount);
 }
-#endif
 
 /*
  * Called from mm/vmscan.c to handle paging out
diff --git a/mm/memory.c b/mm/memory.c
index 368561f32009..7a61a11f1867 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -597,7 +597,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	page = vm_normal_page(vma, addr, pte);
 	if (page) {
 		get_page(page);
-		page_dup_rmap(page, vma, addr);
+		page_dup_rmap(page);
 		rss[!!PageAnon(page)]++;
 	}
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 1406e67f9613..720fc03a7bc4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -710,27 +710,6 @@ void page_add_file_rmap(struct page *page)
 	}
 }
 
-#ifdef CONFIG_DEBUG_VM
-/**
- * page_dup_rmap - duplicate pte mapping to a page
- * @page:	the page to add the mapping to
- * @vma:	the vm area being duplicated
- * @address:	the user virtual address mapped
- *
- * For copy_page_range only: minimal extract from page_add_file_rmap /
- * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
- * quicker.
- *
- * The caller needs to hold the pte lock.
- */
-void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
-{
-	if (PageAnon(page))
-		__page_check_anon_rmap(page, vma, address);
-	atomic_inc(&page->_mapcount);
-}
-#endif
-
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
-- 
cgit v1.2.3-71-gd317


From 9a840895147b12de5cdd633c600b38686840ee53 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:02:01 -0700
Subject: ksm: identify PageKsm pages

KSM will need to identify its kernel merged pages unambiguously, and
/proc/kpageflags will probably like to do so too.

Since KSM will only be substituting anonymous pages, statistics are best
preserved by making a PageKsm page a special PageAnon page: one with no
anon_vma.

But KSM then needs its own page_add_ksm_rmap() - keep it in ksm.h near
PageKsm; and do_wp_page() must COW them, unlike singly mapped PageAnons.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Izik Eidus <ieidus@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/page.c      |  5 +++++
 include/linux/ksm.h | 29 +++++++++++++++++++++++++++++
 mm/memory.c         |  3 ++-
 3 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2707c6c7a20f..2281c2cbfe2b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -2,6 +2,7 @@
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/init.h>
+#include <linux/ksm.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
@@ -95,6 +96,8 @@ static const struct file_operations proc_kpagecount_operations = {
 #define KPF_UNEVICTABLE		18
 #define KPF_NOPAGE		20
 
+#define KPF_KSM			21
+
 /* kernel hacking assistances
  * WARNING: subject to change, never rely on them!
  */
@@ -137,6 +140,8 @@ static u64 get_uflags(struct page *page)
 		u |= 1 << KPF_MMAP;
 	if (PageAnon(page))
 		u |= 1 << KPF_ANON;
+	if (PageKsm(page))
+		u |= 1 << KPF_KSM;
 
 	/*
 	 * compound pages: export both head/tail info
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index eb2a448981ee..a485c14ecd5d 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -10,6 +10,7 @@
 #include <linux/bitops.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/vmstat.h>
 
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
@@ -29,6 +30,27 @@ static inline void ksm_exit(struct mm_struct *mm)
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
 		__ksm_exit(mm);
 }
+
+/*
+ * A KSM page is one of those write-protected "shared pages" or "merged pages"
+ * which KSM maps into multiple mms, wherever identical anonymous page content
+ * is found in VM_MERGEABLE vmas.  It's a PageAnon page, with NULL anon_vma.
+ */
+static inline int PageKsm(struct page *page)
+{
+	return ((unsigned long)page->mapping == PAGE_MAPPING_ANON);
+}
+
+/*
+ * But we have to avoid the checking which page_add_anon_rmap() performs.
+ */
+static inline void page_add_ksm_rmap(struct page *page)
+{
+	if (atomic_inc_and_test(&page->_mapcount)) {
+		page->mapping = (void *) PAGE_MAPPING_ANON;
+		__inc_zone_page_state(page, NR_ANON_PAGES);
+	}
+}
 #else  /* !CONFIG_KSM */
 
 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
@@ -45,6 +67,13 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 static inline void ksm_exit(struct mm_struct *mm)
 {
 }
+
+static inline int PageKsm(struct page *page)
+{
+	return 0;
+}
+
+/* No stub required for page_add_ksm_rmap(page) */
 #endif /* !CONFIG_KSM */
 
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 7a61a11f1867..1a435b81876c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/delayacct.h>
@@ -1974,7 +1975,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Take out anonymous pages first, anonymous shared vmas are
 	 * not dirty accountable.
 	 */
-	if (PageAnon(old_page)) {
+	if (PageAnon(old_page) && !PageKsm(old_page)) {
 		if (!trylock_page(old_page)) {
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
-- 
cgit v1.2.3-71-gd317


From 9ba6929480088a85c1ff60a4b1f1c9fc80dbd2b7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:02:20 -0700
Subject: ksm: fix oom deadlock

There's a now-obvious deadlock in KSM's out-of-memory handling:
imagine ksmd or KSM_RUN_UNMERGE handling, holding ksm_thread_mutex,
trying to allocate a page to break KSM in an mm which becomes the
OOM victim (quite likely in the unmerge case): it's killed and goes
to exit, and hangs there waiting to acquire ksm_thread_mutex.

Clearly we must not require ksm_thread_mutex in __ksm_exit, simple
though that made everything else: perhaps use mmap_sem somehow?
And part of the answer lies in the comments on unmerge_ksm_pages:
__ksm_exit should also leave all the rmap_item removal to ksmd.

But there's a fundamental problem, that KSM relies upon mmap_sem to
guarantee the consistency of the mm it's dealing with, yet exit_mmap
tears down an mm without taking mmap_sem.  And bumping mm_users won't
help at all, that just ensures that the pages the OOM killer assumes
are on their way to being freed will not be freed.

The best answer seems to be, to move the ksm_exit callout from just
before exit_mmap, to the middle of exit_mmap: after the mm's pages
have been freed (if the mmu_gather is flushed), but before its page
tables and vma structures have been freed; and down_write,up_write
mmap_sem there to serialize with KSM's own reliance on mmap_sem.

But KSM then needs to be careful, whenever it downs mmap_sem, to
check that the mm is not already exiting: there's a danger of using
find_vma on a layout that's being torn apart, or writing into page
tables which have been freed for reuse; and even do_anonymous_page
and __do_fault need to check they're not being called by break_ksm
to reinstate a pte after zap_pte_range has zapped that page table.

Though it might be clearer to add an exiting flag, set while holding
mmap_sem in __ksm_exit, that wouldn't cover the issue of reinstating
a zapped pte.  All we need is to check whether mm_users is 0 - but
must remember that ksmd may detect that before __ksm_exit is reached.
So, ksm_test_exit(mm) added to comment such checks on mm->mm_users.

__ksm_exit now has to leave clearing up the rmap_items to ksmd,
that needs ksm_thread_mutex; but shift the exiting mm just after the
ksm_scan cursor so that it will soon be dealt with.  __ksm_enter raise
mm_count to hold the mm_struct, ksmd's exit processing (exactly like
its processing when it finds all VM_MERGEABLEs unmapped) mmdrop it,
similar procedure for KSM_RUN_UNMERGE (which has stopped ksmd).

But also give __ksm_exit a fast path: when there's no complication
(no rmap_items attached to mm and it's not at the ksm_scan cursor),
it can safely do all the exiting work itself.  This is not just an
optimization: when ksmd is not running, the raised mm_count would
otherwise leak mm_structs.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ksm.h |  31 +++++++++--
 kernel/fork.c       |   1 -
 mm/ksm.c            | 144 +++++++++++++++++++++++++++++++++++-----------------
 mm/memory.c         |   5 +-
 mm/mmap.c           |   9 ++++
 5 files changed, 137 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index a485c14ecd5d..2d64ff30c0de 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -12,11 +12,14 @@
 #include <linux/sched.h>
 #include <linux/vmstat.h>
 
+struct mmu_gather;
+
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
 int __ksm_enter(struct mm_struct *mm);
-void __ksm_exit(struct mm_struct *mm);
+void __ksm_exit(struct mm_struct *mm,
+		struct mmu_gather **tlbp, unsigned long end);
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
@@ -25,10 +28,24 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 	return 0;
 }
 
-static inline void ksm_exit(struct mm_struct *mm)
+/*
+ * For KSM to handle OOM without deadlock when it's breaking COW in a
+ * likely victim of the OOM killer, exit_mmap() has to serialize with
+ * ksm_exit() after freeing mm's pages but before freeing its page tables.
+ * That leaves a window in which KSM might refault pages which have just
+ * been finally unmapped: guard against that with ksm_test_exit(), and
+ * use it after getting mmap_sem in ksm.c, to check if mm is exiting.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+	return atomic_read(&mm->mm_users) == 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm,
+			    struct mmu_gather **tlbp, unsigned long end)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-		__ksm_exit(mm);
+		__ksm_exit(mm, tlbp, end);
 }
 
 /*
@@ -64,7 +81,13 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 	return 0;
 }
 
-static inline void ksm_exit(struct mm_struct *mm)
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm,
+			    struct mmu_gather **tlbp, unsigned long end)
 {
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 73a442b7be6d..42f20f565b16 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -501,7 +501,6 @@ void mmput(struct mm_struct *mm)
 
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
-		ksm_exit(mm);
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
diff --git a/mm/ksm.c b/mm/ksm.c
index 7e4d255dadc0..722e3f2a8dc5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -32,6 +32,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/ksm.h>
 
+#include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
 /*
@@ -347,6 +348,8 @@ static void break_cow(struct mm_struct *mm, unsigned long addr)
 	struct vm_area_struct *vma;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -365,6 +368,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 	struct page *page;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -439,11 +444,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 	} else if (rmap_item->address & NODE_FLAG) {
 		unsigned char age;
 		/*
-		 * ksm_thread can and must skip the rb_erase, because
+		 * Usually ksmd can and must skip the rb_erase, because
 		 * root_unstable_tree was already reset to RB_ROOT.
-		 * But __ksm_exit has to be careful: do the rb_erase
-		 * if it's interrupting a scan, and this rmap_item was
-		 * inserted by this scan rather than left from before.
+		 * But be careful when an mm is exiting: do the rb_erase
+		 * if this rmap_item was inserted by this scan, rather
+		 * than left over from before.
 		 */
 		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
 		BUG_ON(age > 1);
@@ -491,6 +496,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
 	int err = 0;
 
 	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+		if (ksm_test_exit(vma->vm_mm))
+			break;
 		if (signal_pending(current))
 			err = -ERESTARTSYS;
 		else
@@ -507,34 +514,50 @@ static int unmerge_and_remove_all_rmap_items(void)
 	int err = 0;
 
 	spin_lock(&ksm_mmlist_lock);
-	mm_slot = list_entry(ksm_mm_head.mm_list.next,
+	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
 						struct mm_slot, mm_list);
 	spin_unlock(&ksm_mmlist_lock);
 
-	while (mm_slot != &ksm_mm_head) {
+	for (mm_slot = ksm_scan.mm_slot;
+			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
 		mm = mm_slot->mm;
 		down_read(&mm->mmap_sem);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (ksm_test_exit(mm))
+				break;
 			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
 				continue;
 			err = unmerge_ksm_pages(vma,
 						vma->vm_start, vma->vm_end);
-			if (err) {
-				up_read(&mm->mmap_sem);
-				goto out;
-			}
+			if (err)
+				goto error;
 		}
+
 		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-		up_read(&mm->mmap_sem);
 
 		spin_lock(&ksm_mmlist_lock);
-		mm_slot = list_entry(mm_slot->mm_list.next,
+		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
 						struct mm_slot, mm_list);
-		spin_unlock(&ksm_mmlist_lock);
+		if (ksm_test_exit(mm)) {
+			hlist_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			spin_unlock(&ksm_mmlist_lock);
+
+			free_mm_slot(mm_slot);
+			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+			up_read(&mm->mmap_sem);
+			mmdrop(mm);
+		} else {
+			spin_unlock(&ksm_mmlist_lock);
+			up_read(&mm->mmap_sem);
+		}
 	}
 
 	ksm_scan.seqnr = 0;
-out:
+	return 0;
+
+error:
+	up_read(&mm->mmap_sem);
 	spin_lock(&ksm_mmlist_lock);
 	ksm_scan.mm_slot = &ksm_mm_head;
 	spin_unlock(&ksm_mmlist_lock);
@@ -755,6 +778,9 @@ static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
 	int err = -EFAULT;
 
 	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1))
+		goto out;
+
 	vma = find_vma(mm1, addr1);
 	if (!vma || vma->vm_start > addr1)
 		goto out;
@@ -796,6 +822,10 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
 		return err;
 
 	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1)) {
+		up_read(&mm1->mmap_sem);
+		goto out;
+	}
 	vma = find_vma(mm1, addr1);
 	if (!vma || vma->vm_start > addr1) {
 		up_read(&mm1->mmap_sem);
@@ -1174,7 +1204,12 @@ next_mm:
 
 	mm = slot->mm;
 	down_read(&mm->mmap_sem);
-	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
+	if (ksm_test_exit(mm))
+		vma = NULL;
+	else
+		vma = find_vma(mm, ksm_scan.address);
+
+	for (; vma; vma = vma->vm_next) {
 		if (!(vma->vm_flags & VM_MERGEABLE))
 			continue;
 		if (ksm_scan.address < vma->vm_start)
@@ -1183,6 +1218,8 @@ next_mm:
 			ksm_scan.address = vma->vm_end;
 
 		while (ksm_scan.address < vma->vm_end) {
+			if (ksm_test_exit(mm))
+				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
 			if (*page && PageAnon(*page)) {
 				flush_anon_page(vma, *page, ksm_scan.address);
@@ -1205,6 +1242,11 @@ next_mm:
 		}
 	}
 
+	if (ksm_test_exit(mm)) {
+		ksm_scan.address = 0;
+		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
+						struct rmap_item, link);
+	}
 	/*
 	 * Nuke all the rmap_items that are above this current rmap:
 	 * because there were no VM_MERGEABLE vmas with such addresses.
@@ -1219,24 +1261,29 @@ next_mm:
 		 * We've completed a full scan of all vmas, holding mmap_sem
 		 * throughout, and found no VM_MERGEABLE: so do the same as
 		 * __ksm_exit does to remove this mm from all our lists now.
+		 * This applies either when cleaning up after __ksm_exit
+		 * (but beware: we can reach here even before __ksm_exit),
+		 * or when all VM_MERGEABLE areas have been unmapped (and
+		 * mmap_sem then protects against race with MADV_MERGEABLE).
 		 */
 		hlist_del(&slot->link);
 		list_del(&slot->mm_list);
+		spin_unlock(&ksm_mmlist_lock);
+
 		free_mm_slot(slot);
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		up_read(&mm->mmap_sem);
+		mmdrop(mm);
+	} else {
+		spin_unlock(&ksm_mmlist_lock);
+		up_read(&mm->mmap_sem);
 	}
-	spin_unlock(&ksm_mmlist_lock);
-	up_read(&mm->mmap_sem);
 
 	/* Repeat until we've completed scanning the whole list */
 	slot = ksm_scan.mm_slot;
 	if (slot != &ksm_mm_head)
 		goto next_mm;
 
-	/*
-	 * Bump seqnr here rather than at top, so that __ksm_exit
-	 * can skip rb_erase on unstable tree until we run again.
-	 */
 	ksm_scan.seqnr++;
 	return NULL;
 }
@@ -1361,6 +1408,7 @@ int __ksm_enter(struct mm_struct *mm)
 	spin_unlock(&ksm_mmlist_lock);
 
 	set_bit(MMF_VM_MERGEABLE, &mm->flags);
+	atomic_inc(&mm->mm_count);
 
 	if (needs_wakeup)
 		wake_up_interruptible(&ksm_thread_wait);
@@ -1368,41 +1416,45 @@ int __ksm_enter(struct mm_struct *mm)
 	return 0;
 }
 
-void __ksm_exit(struct mm_struct *mm)
+void __ksm_exit(struct mm_struct *mm,
+		struct mmu_gather **tlbp, unsigned long end)
 {
 	struct mm_slot *mm_slot;
+	int easy_to_free = 0;
 
 	/*
-	 * This process is exiting: doesn't hold and doesn't need mmap_sem;
-	 * but we do need to exclude ksmd and other exiters while we modify
-	 * the various lists and trees.
+	 * This process is exiting: if it's straightforward (as is the
+	 * case when ksmd was never running), free mm_slot immediately.
+	 * But if it's at the cursor or has rmap_items linked to it, use
+	 * mmap_sem to synchronize with any break_cows before pagetables
+	 * are freed, and leave the mm_slot on the list for ksmd to free.
+	 * Beware: ksm may already have noticed it exiting and freed the slot.
 	 */
-	mutex_lock(&ksm_thread_mutex);
+
 	spin_lock(&ksm_mmlist_lock);
 	mm_slot = get_mm_slot(mm);
-	if (!list_empty(&mm_slot->rmap_list)) {
-		spin_unlock(&ksm_mmlist_lock);
-		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-		spin_lock(&ksm_mmlist_lock);
-	}
-
-	if (ksm_scan.mm_slot == mm_slot) {
-		ksm_scan.mm_slot = list_entry(
-			mm_slot->mm_list.next, struct mm_slot, mm_list);
-		ksm_scan.address = 0;
-		ksm_scan.rmap_item = list_entry(
-			&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
-		if (ksm_scan.mm_slot == &ksm_mm_head)
-			ksm_scan.seqnr++;
+	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+		if (list_empty(&mm_slot->rmap_list)) {
+			hlist_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			easy_to_free = 1;
+		} else {
+			list_move(&mm_slot->mm_list,
+				  &ksm_scan.mm_slot->mm_list);
+		}
 	}
-
-	hlist_del(&mm_slot->link);
-	list_del(&mm_slot->mm_list);
 	spin_unlock(&ksm_mmlist_lock);
 
-	free_mm_slot(mm_slot);
-	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
-	mutex_unlock(&ksm_thread_mutex);
+	if (easy_to_free) {
+		free_mm_slot(mm_slot);
+		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		mmdrop(mm);
+	} else if (mm_slot) {
+		tlb_finish_mmu(*tlbp, 0, end);
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+		*tlbp = tlb_gather_mmu(mm, 1);
+	}
 }
 
 #define KSM_ATTR_RO(_name) \
diff --git a/mm/memory.c b/mm/memory.c
index 1a435b81876c..f47ffe971012 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2648,8 +2648,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (!pte_none(*page_table))
+	if (!pte_none(*page_table) || ksm_test_exit(mm))
 		goto release;
+
 	inc_mm_counter(mm, anon_rss);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
@@ -2791,7 +2792,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_same(*page_table, orig_pte))) {
+	if (likely(pte_same(*page_table, orig_pte) && !ksm_test_exit(mm))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
diff --git a/mm/mmap.c b/mm/mmap.c
index 376492ed08f4..e02f1aa66a1a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/ksm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
 
@@ -2111,6 +2112,14 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
+
+	/*
+	 * For KSM to handle OOM without deadlock when it's breaking COW in a
+	 * likely victim of the OOM killer, we must serialize with ksm_exit()
+	 * after freeing mm's pages but before freeing its page tables.
+	 */
+	ksm_exit(mm, &tlb, end);
+
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
-- 
cgit v1.2.3-71-gd317


From 1c2fb7a4c2ca7a958b02bc1e615d0254990bba8d Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Mon, 21 Sep 2009 17:02:22 -0700
Subject: ksm: fix deadlock with munlock in exit_mmap

Rawhide users have reported hang at startup when cryptsetup is run: the
same problem can be simply reproduced by running a program int main() {
mlockall(MCL_CURRENT | MCL_FUTURE); return 0; }

The problem is that exit_mmap() applies munlock_vma_pages_all() to
clean up VM_LOCKED areas, and its current implementation (stupidly)
tries to fault in absent pages, for example where PROT_NONE prevented
them being faulted in when mlocking.  Whereas the "ksm: fix oom
deadlock" patch, knowing there's a race by which KSM might try to fault
in pages after exit_mmap() had finally zapped the range, backs out of
such faults doing nothing when its ksm_test_exit() notices mm_users 0.

So revert that part of "ksm: fix oom deadlock" which moved the
ksm_exit() call from before exit_mmap() to the middle of exit_mmap();
and remove those ksm_test_exit() checks from the page fault paths, so
allowing the munlocking to proceed without interference.

ksm_exit, if there are rmap_items still chained on this mm slot, takes
mmap_sem write side: so preventing KSM from working on an mm while
exit_mmap runs.  And KSM will bail out as soon as it notices that
mm_users is already zero, thanks to its internal ksm_test_exit checks.
So that when a task is killed by OOM killer or the user, KSM will not
indefinitely prevent it from running exit_mmap to release its memory.

This does break a part of what "ksm: fix oom deadlock" was trying to
achieve.  When unmerging KSM (echo 2 >/sys/kernel/mm/ksm), and even
when ksmd itself has to cancel a KSM page, it is possible that the
first OOM-kill victim would be the KSM process being faulted: then its
memory won't be freed until a second victim has been selected (freeing
memory for the unmerging fault to complete).

But the OOM killer is already liable to kill a second victim once the
intended victim's p->mm goes to NULL: so there's not much point in
rejecting this KSM patch before fixing that OOM behaviour.  It is very
much more important to allow KSM users to boot up, than to haggle over
an unlikely and poorly supported OOM case.

We also intend to fix munlocking to not fault pages: at which point
this patch _could_ be reverted; though that would be controversial, so
we hope to find a better solution.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Justin M. Forbes <jforbes@redhat.com>
Acked-for-now-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ksm.h | 11 ++++-------
 kernel/fork.c       |  1 +
 mm/ksm.c            |  5 +----
 mm/memory.c         |  4 ++--
 mm/mmap.c           |  7 -------
 5 files changed, 8 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 2d64ff30c0de..0e26de6adb51 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -18,8 +18,7 @@ struct mmu_gather;
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
 int __ksm_enter(struct mm_struct *mm);
-void __ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end);
+void __ksm_exit(struct mm_struct *mm);
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
@@ -41,11 +40,10 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
-static inline void ksm_exit(struct mm_struct *mm,
-			    struct mmu_gather **tlbp, unsigned long end)
+static inline void ksm_exit(struct mm_struct *mm)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-		__ksm_exit(mm, tlbp, end);
+		__ksm_exit(mm);
 }
 
 /*
@@ -86,8 +84,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 	return 0;
 }
 
-static inline void ksm_exit(struct mm_struct *mm,
-			    struct mmu_gather **tlbp, unsigned long end)
+static inline void ksm_exit(struct mm_struct *mm)
 {
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 42f20f565b16..73a442b7be6d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -501,6 +501,7 @@ void mmput(struct mm_struct *mm)
 
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
+		ksm_exit(mm);
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
diff --git a/mm/ksm.c b/mm/ksm.c
index 722e3f2a8dc5..92034eb47eba 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1416,8 +1416,7 @@ int __ksm_enter(struct mm_struct *mm)
 	return 0;
 }
 
-void __ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end)
+void __ksm_exit(struct mm_struct *mm)
 {
 	struct mm_slot *mm_slot;
 	int easy_to_free = 0;
@@ -1450,10 +1449,8 @@ void __ksm_exit(struct mm_struct *mm,
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
 		mmdrop(mm);
 	} else if (mm_slot) {
-		tlb_finish_mmu(*tlbp, 0, end);
 		down_write(&mm->mmap_sem);
 		up_write(&mm->mmap_sem);
-		*tlbp = tlb_gather_mmu(mm, 1);
 	}
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index f47ffe971012..05feaa11d87c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2648,7 +2648,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (!pte_none(*page_table) || ksm_test_exit(mm))
+	if (!pte_none(*page_table))
 		goto release;
 
 	inc_mm_counter(mm, anon_rss);
@@ -2792,7 +2792,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_same(*page_table, orig_pte) && !ksm_test_exit(mm))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
diff --git a/mm/mmap.c b/mm/mmap.c
index e02f1aa66a1a..ffd6c6c9bcf4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2113,13 +2113,6 @@ void exit_mmap(struct mm_struct *mm)
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 
-	/*
-	 * For KSM to handle OOM without deadlock when it's breaking COW in a
-	 * likely victim of the OOM killer, we must serialize with ksm_exit()
-	 * after freeing mm's pages but before freeing its page tables.
-	 */
-	ksm_exit(mm, &tlb, end);
-
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
-- 
cgit v1.2.3-71-gd317


From a913e182ab9484308e870af37a14d372742d53b0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:02:26 -0700
Subject: ksm: clean up obsolete references

A few cleanups, given the munlock fix: the comment on ksm_test_exit() no
longer applies, and it can be made private to ksm.c; there's no more
reference to mmu_gather or tlb.h, and mmap.c doesn't need ksm.h.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ksm.h | 20 --------------------
 mm/ksm.c            | 14 +++++++++++++-
 mm/mmap.c           |  1 -
 3 files changed, 13 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 0e26de6adb51..a485c14ecd5d 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -12,8 +12,6 @@
 #include <linux/sched.h>
 #include <linux/vmstat.h>
 
-struct mmu_gather;
-
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
@@ -27,19 +25,6 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 	return 0;
 }
 
-/*
- * For KSM to handle OOM without deadlock when it's breaking COW in a
- * likely victim of the OOM killer, exit_mmap() has to serialize with
- * ksm_exit() after freeing mm's pages but before freeing its page tables.
- * That leaves a window in which KSM might refault pages which have just
- * been finally unmapped: guard against that with ksm_test_exit(), and
- * use it after getting mmap_sem in ksm.c, to check if mm is exiting.
- */
-static inline bool ksm_test_exit(struct mm_struct *mm)
-{
-	return atomic_read(&mm->mm_users) == 0;
-}
-
 static inline void ksm_exit(struct mm_struct *mm)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
@@ -79,11 +64,6 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 	return 0;
 }
 
-static inline bool ksm_test_exit(struct mm_struct *mm)
-{
-	return 0;
-}
-
 static inline void ksm_exit(struct mm_struct *mm)
 {
 }
diff --git a/mm/ksm.c b/mm/ksm.c
index 3bd54ce9eb38..e11e7a5ac84f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -32,7 +32,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/ksm.h>
 
-#include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
 /*
@@ -284,6 +283,19 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
 	return rmap_item->address & STABLE_FLAG;
 }
 
+/*
+ * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
+ * page tables after it has passed through ksm_exit() - which, if necessary,
+ * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
+ * a special flag: they can just back out as soon as mm_users goes to zero.
+ * ksm_test_exit() is used throughout to make this test for exit: in some
+ * places for correctness, in some places just to avoid unnecessary work.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+	return atomic_read(&mm->mm_users) == 0;
+}
+
 /*
  * We use break_ksm to break COW on a ksm page: it's a stripped down
  *
diff --git a/mm/mmap.c b/mm/mmap.c
index 22dff49d579e..6eed98c00543 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -27,7 +27,6 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
-#include <linux/ksm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
 
-- 
cgit v1.2.3-71-gd317


From 35451beecbd7c86ce3249d543594517a5fe9a0cd Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:02:27 -0700
Subject: ksm: unmerge is an origin of OOMs

Just as the swapoff system call allocates many pages of RAM to various
processes, perhaps triggering OOM, so "echo 2 >/sys/kernel/mm/ksm/run"
(unmerge) is liable to allocate many pages of RAM to various processes,
perhaps triggering OOM; and each is normally run from a modest admin
process (swapoff or shell), easily repeated until it succeeds.

So treat unmerge_and_remove_all_rmap_items() in the same way that we treat
try_to_unuse(): generalize PF_SWAPOFF to PF_OOM_ORIGIN, and bracket both
with that, to ask the OOM killer to kill them first, to prevent them from
spawning more and more OOM kills.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 2 +-
 mm/ksm.c              | 2 ++
 mm/oom_kill.c         | 2 +-
 mm/swapfile.c         | 4 ++--
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8f3e63cb33a6..899d7304d594 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1720,7 +1720,7 @@ extern cputime_t task_gtime(struct task_struct *p);
 #define PF_FROZEN	0x00010000	/* frozen for system suspend */
 #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
-#define PF_SWAPOFF	0x00080000	/* I am in swapoff */
+#define PF_OOM_ORIGIN	0x00080000	/* Allocating much memory to others */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_KTHREAD	0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
diff --git a/mm/ksm.c b/mm/ksm.c
index e11e7a5ac84f..37cc37325094 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1557,7 +1557,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 	if (ksm_run != flags) {
 		ksm_run = flags;
 		if (flags & KSM_RUN_UNMERGE) {
+			current->flags |= PF_OOM_ORIGIN;
 			err = unmerge_and_remove_all_rmap_items();
+			current->flags &= ~PF_OOM_ORIGIN;
 			if (err) {
 				ksm_run = KSM_RUN_STOP;
 				count = err;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..da4c342f2641 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -79,7 +79,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	/*
 	 * swapoff can easily use up all memory, so kill those first.
 	 */
-	if (p->flags & PF_SWAPOFF)
+	if (p->flags & PF_OOM_ORIGIN)
 		return ULONG_MAX;
 
 	/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 74f1102e8749..f1bf19daadc6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1575,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->flags &= ~SWP_WRITEOK;
 	spin_unlock(&swap_lock);
 
-	current->flags |= PF_SWAPOFF;
+	current->flags |= PF_OOM_ORIGIN;
 	err = try_to_unuse(type);
-	current->flags &= ~PF_SWAPOFF;
+	current->flags &= ~PF_OOM_ORIGIN;
 
 	if (err) {
 		/* re-insert swap space back into swap_list */
-- 
cgit v1.2.3-71-gd317


From 38a398572fa2d8124f7479e40db581b5b72719c9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Mon, 21 Sep 2009 17:02:39 -0700
Subject: page-allocator: remove dead function free_cold_page()

The function free_cold_page() has no callers so delete it.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 1 -
 mm/page_alloc.c     | 5 -----
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7c777a0da17a..c32bfa8e7f1e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -326,7 +326,6 @@ void free_pages_exact(void *virt, size_t size);
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 extern void free_hot_page(struct page *page);
-extern void free_cold_page(struct page *page);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20759803a64a..913a8ebd3a8e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1079,11 +1079,6 @@ void free_hot_page(struct page *page)
 	free_hot_cold_page(page, 0);
 }
 	
-void free_cold_page(struct page *page)
-{
-	free_hot_cold_page(page, 1);
-}
-
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
-- 
cgit v1.2.3-71-gd317


From bba78819548a59a52e60f0b259997bbd011164ae Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Sep 2009 17:02:56 -0700
Subject: mm: remove broken 'kzalloc' mempool

The kzalloc mempool zeros items when they are initially allocated, but
does not rezero used items that are returned to the pool.  Consequently
mempool_alloc()s may return non-zeroed memory.

Since there are/were only two in-tree users for
mempool_create_kzalloc_pool(), and 'fixing' this in a way that will
re-zero used (but not new) items before first use is non-trivial, just
remove it.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempool.h | 10 ++--------
 mm/mempool.c            |  7 -------
 2 files changed, 2 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 9be484d11283..7c08052e3321 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -47,22 +47,16 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
 }
 
 /*
- * 2 mempool_alloc_t's and a mempool_free_t to kmalloc/kzalloc and kfree
- * the amount of memory specified by pool_data
+ * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the
+ * amount of memory specified by pool_data
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
-void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data);
 void mempool_kfree(void *element, void *pool_data);
 static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
 {
 	return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
 			      (void *) size);
 }
-static inline mempool_t *mempool_create_kzalloc_pool(int min_nr, size_t size)
-{
-	return mempool_create(min_nr, mempool_kzalloc, mempool_kfree,
-			      (void *) size);
-}
 
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
diff --git a/mm/mempool.c b/mm/mempool.c
index 32e75d400503..1a3bc3d4d554 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 
-void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
-{
-	size_t size = (size_t)pool_data;
-	return kzalloc(size, gfp_mask);
-}
-EXPORT_SYMBOL(mempool_kzalloc);
-
 void mempool_kfree(void *element, void *pool_data)
 {
 	kfree(element);
-- 
cgit v1.2.3-71-gd317


From 401a8e1c1670085b8177330ca47d4f7c4ac88761 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 21 Sep 2009 17:02:58 -0700
Subject: mm: introduce page_lru_base_type()

Instead of abusing page_is_file_cache() for LRU list index arithmetic, add
another helper with a more appropriate name and convert the non-boolean
users of page_is_file_cache() accordingly.

This new helper gives the LRU base type a page is supposed to live on,
inactive anon or inactive file.

[hugh.dickins@tiscali.co.uk: convert del_page_from_lru() also]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 23 +++++++++++++++++++----
 mm/swap.c                 |  4 ++--
 mm/vmscan.c               |  6 +++---
 3 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7fbb97267556..99977ff45b83 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -39,21 +39,36 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 	mem_cgroup_del_lru_list(page, l);
 }
 
+/**
+ * page_lru_base_type - which LRU list type should a page be on?
+ * @page: the page to test
+ *
+ * Used for LRU list index arithmetic.
+ *
+ * Returns the base LRU type - file or anon - @page should be on.
+ */
+static inline enum lru_list page_lru_base_type(struct page *page)
+{
+	if (page_is_file_cache(page))
+		return LRU_INACTIVE_FILE;
+	return LRU_INACTIVE_ANON;
+}
+
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
-	enum lru_list l = LRU_BASE;
+	enum lru_list l;
 
 	list_del(&page->lru);
 	if (PageUnevictable(page)) {
 		__ClearPageUnevictable(page);
 		l = LRU_UNEVICTABLE;
 	} else {
+		l = page_lru_base_type(page);
 		if (PageActive(page)) {
 			__ClearPageActive(page);
 			l += LRU_ACTIVE;
 		}
-		l += page_is_file_cache(page);
 	}
 	__dec_zone_state(zone, NR_LRU_BASE + l);
 	mem_cgroup_del_lru_list(page, l);
@@ -68,14 +83,14 @@ del_page_from_lru(struct zone *zone, struct page *page)
  */
 static inline enum lru_list page_lru(struct page *page)
 {
-	enum lru_list lru = LRU_BASE;
+	enum lru_list lru;
 
 	if (PageUnevictable(page))
 		lru = LRU_UNEVICTABLE;
 	else {
+		lru = page_lru_base_type(page);
 		if (PageActive(page))
 			lru += LRU_ACTIVE;
-		lru += page_is_file_cache(page);
 	}
 
 	return lru;
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33ab..168d53e6e58e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
 			spin_lock(&zone->lru_lock);
 		}
 		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
-			int lru = page_is_file_cache(page);
+			int lru = page_lru_base_type(page);
 			list_move_tail(&page->lru, &zone->lru[lru].list);
 			pgmoved++;
 		}
@@ -181,7 +181,7 @@ void activate_page(struct page *page)
 	spin_lock_irq(&zone->lru_lock);
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 		int file = page_is_file_cache(page);
-		int lru = LRU_BASE + file;
+		int lru = page_lru_base_type(page);
 		del_page_from_lru_list(zone, page, lru);
 
 		SetPageActive(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cad5d528a6f0..30e56ee833f8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -531,7 +531,7 @@ redo:
 		 * unevictable page on [in]active list.
 		 * We know how to handle that.
 		 */
-		lru = active + page_is_file_cache(page);
+		lru = active + page_lru_base_type(page);
 		lru_cache_add_lru(page, lru);
 	} else {
 		/*
@@ -986,7 +986,7 @@ static unsigned long clear_active_flags(struct list_head *page_list,
 	struct page *page;
 
 	list_for_each_entry(page, page_list, lru) {
-		lru = page_is_file_cache(page);
+		lru = page_lru_base_type(page);
 		if (PageActive(page)) {
 			lru += LRU_ACTIVE;
 			ClearPageActive(page);
@@ -2652,7 +2652,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
 retry:
 	ClearPageUnevictable(page);
 	if (page_evictable(page, NULL)) {
-		enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
+		enum lru_list l = page_lru_base_type(page);
 
 		__dec_zone_state(zone, NR_UNEVICTABLE);
 		list_move(&page->lru, &zone->lru[l].list);
-- 
cgit v1.2.3-71-gd317


From 6c0b13519d1c755d874e82c8fb8a6dcef0ee402c Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 21 Sep 2009 17:02:59 -0700
Subject: mm: return boolean from page_is_file_cache()

page_is_file_cache() has been used for both boolean checks and LRU
arithmetic, which was always a bit weird.

Now that page_lru_base_type() exists for LRU arithmetic, make
page_is_file_cache() a real predicate function and adjust the
boolean-using callsites to drop those pesky double negations.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 8 ++------
 mm/migrate.c              | 6 +++---
 mm/swap.c                 | 2 +-
 mm/vmscan.c               | 2 +-
 4 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 99977ff45b83..8835b877b8db 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -5,7 +5,7 @@
  * page_is_file_cache - should the page be on a file LRU or anon LRU?
  * @page: the page to test
  *
- * Returns LRU_FILE if @page is page cache page backed by a regular filesystem,
+ * Returns 1 if @page is page cache page backed by a regular filesystem,
  * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
  * Used by functions that manipulate the LRU lists, to sort a page
  * onto the right LRU list.
@@ -16,11 +16,7 @@
  */
 static inline int page_is_file_cache(struct page *page)
 {
-	if (PageSwapBacked(page))
-		return 0;
-
-	/* The page is page cache backed by a normal filesystem. */
-	return LRU_FILE;
+	return !PageSwapBacked(page);
 }
 
 static inline void
diff --git a/mm/migrate.c b/mm/migrate.c
index b535a2c1656c..e97e513fe898 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -68,7 +68,7 @@ int putback_lru_pages(struct list_head *l)
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
-				    !!page_is_file_cache(page));
+				page_is_file_cache(page));
 		putback_lru_page(page);
 		count++;
 	}
@@ -701,7 +701,7 @@ unlock:
  		 */
  		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
-				    !!page_is_file_cache(page));
+				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
 
@@ -751,7 +751,7 @@ int migrate_pages(struct list_head *from,
 	local_irq_save(flags);
 	list_for_each_entry(page, from, lru)
 		__inc_zone_page_state(page, NR_ISOLATED_ANON +
-				      !!page_is_file_cache(page));
+				page_is_file_cache(page));
 	local_irq_restore(flags);
 
 	if (!swapwrite)
diff --git a/mm/swap.c b/mm/swap.c
index 168d53e6e58e..4a8a59e671f7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -189,7 +189,7 @@ void activate_page(struct page *page)
 		add_page_to_lru_list(zone, page, lru);
 		__count_vm_event(PGACTIVATE);
 
-		update_page_reclaim_stat(zone, page, !!file, 1);
+		update_page_reclaim_stat(zone, page, file, 1);
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 30e56ee833f8..172119caebcc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -821,7 +821,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
 		return ret;
 
-	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
+	if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
 		return ret;
 
 	/*
-- 
cgit v1.2.3-71-gd317


From edcf4748cd56adcdf0856cc99ef108a4ea3ac7fe Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 21 Sep 2009 17:02:59 -0700
Subject: mm: return boolean from page_has_private()

Make page_has_private() return a true boolean value and remove the double
negations from the two callsites using it for arithmetic.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 13 ++++++++-----
 mm/migrate.c               |  2 +-
 mm/vmscan.c                |  2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d07c0bb2203a..13de789f0a5c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -402,8 +402,8 @@ static inline void __ClearPageTail(struct page *page)
  */
 #define PAGE_FLAGS_CHECK_AT_PREP	((1 << NR_PAGEFLAGS) - 1)
 
-#endif /* !__GENERATING_BOUNDS_H */
-
+#define PAGE_FLAGS_PRIVATE				\
+	(1 << PG_private | 1 << PG_private_2)
 /**
  * page_has_private - Determine if page has private stuff
  * @page: The page to be checked
@@ -411,8 +411,11 @@ static inline void __ClearPageTail(struct page *page)
  * Determine if a page has private stuff, indicating that release routines
  * should be invoked upon it.
  */
-#define page_has_private(page)			\
-	((page)->flags & ((1 << PG_private) |	\
-			  (1 << PG_private_2)))
+static inline int page_has_private(struct page *page)
+{
+	return !!(page->flags & PAGE_FLAGS_PRIVATE);
+}
+
+#endif /* !__GENERATING_BOUNDS_H */
 
 #endif	/* PAGE_FLAGS_H */
diff --git a/mm/migrate.c b/mm/migrate.c
index e97e513fe898..16052e80aaac 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -272,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
  					page_index(page));
 
-	expected_count = 2 + !!page_has_private(page);
+	expected_count = 2 + page_has_private(page);
 	if (page_count(page) != expected_count ||
 			(struct page *)radix_tree_deref_slot(pslot) != page) {
 		spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 172119caebcc..f5b5f029288c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -286,7 +286,7 @@ static inline int page_mapping_inuse(struct page *page)
 
 static inline int is_page_cache_freeable(struct page *page)
 {
-	return page_count(page) - !!page_has_private(page) == 2;
+	return page_count(page) - page_has_private(page) == 2;
 }
 
 static int may_write_to_queue(struct backing_dev_info *bdi)
-- 
cgit v1.2.3-71-gd317


From 4481374ce88ba8f460c8b89f2572027bd27057d0 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 21 Sep 2009 17:03:05 -0700
Subject: mm: replace various uses of num_physpages by totalram_pages

Sizing of memory allocations shouldn't depend on the number of physical
pages found in a system, as that generally includes (perhaps a huge amount
of) non-RAM pages.  The amount of what actually is usable as storage
should instead be used as a basis here.

Some of the calculations (i.e.  those not intending to use high memory)
should likely even use (totalram_pages - totalhigh_pages).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Dave Airlie <airlied@linux.ie>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Patrick McHardy <kaber@trash.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/microcode_core.c  | 4 ++--
 drivers/char/agp/backend.c        | 4 ++--
 drivers/parisc/ccio-dma.c         | 4 ++--
 drivers/parisc/sba_iommu.c        | 4 ++--
 drivers/xen/balloon.c             | 4 ----
 fs/ntfs/malloc.h                  | 2 +-
 include/linux/mm.h                | 1 +
 init/main.c                       | 4 ++--
 mm/slab.c                         | 2 +-
 mm/swap.c                         | 2 +-
 mm/vmalloc.c                      | 4 ++--
 net/core/sock.c                   | 4 ++--
 net/dccp/proto.c                  | 6 +++---
 net/decnet/dn_route.c             | 2 +-
 net/ipv4/route.c                  | 2 +-
 net/ipv4/tcp.c                    | 4 ++--
 net/netfilter/nf_conntrack_core.c | 4 ++--
 net/netfilter/x_tables.c          | 2 +-
 net/netfilter/xt_hashlimit.c      | 8 ++++----
 net/netlink/af_netlink.c          | 6 +++---
 net/sctp/protocol.c               | 6 +++---
 21 files changed, 38 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 0db7969b0dde..378e9a8f1bf8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 {
 	ssize_t ret = -EINVAL;
 
-	if ((len >> PAGE_SHIFT) > num_physpages) {
-		pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+	if ((len >> PAGE_SHIFT) > totalram_pages) {
+		pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
 		return ret;
 	}
 
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index ad87753f6de4..a56ca080e108 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -114,9 +114,9 @@ static int agp_find_max(void)
 	long memory, index, result;
 
 #if PAGE_SHIFT < 20
-	memory = num_physpages >> (20 - PAGE_SHIFT);
+	memory = totalram_pages >> (20 - PAGE_SHIFT);
 #else
-	memory = num_physpages << (PAGE_SHIFT - 20);
+	memory = totalram_pages << (PAGE_SHIFT - 20);
 #endif
 	index = 1;
 
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index a45b0c0d574e..a6b4a5a53d40 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1266,7 +1266,7 @@ ccio_ioc_init(struct ioc *ioc)
 	** Hot-Plug/Removal of PCI cards. (aka PCI OLARD).
 	*/
 
-	iova_space_size = (u32) (num_physpages / count_parisc_driver(&ccio_driver));
+	iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver));
 
 	/* limit IOVA space size to 1MB-1GB */
 
@@ -1305,7 +1305,7 @@ ccio_ioc_init(struct ioc *ioc)
 
 	DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n",
 			__func__, ioc->ioc_regs,
-			(unsigned long) num_physpages >> (20 - PAGE_SHIFT),
+			(unsigned long) totalram_pages >> (20 - PAGE_SHIFT),
 			iova_space_size>>20,
 			iov_order + PAGE_SHIFT);
 
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 123d8fe3427d..57a6d19eba4c 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1390,7 +1390,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num)
 	** for DMA hints - ergo only 30 bits max.
 	*/
 
-	iova_space_size = (u32) (num_physpages/global_ioc_cnt);
+	iova_space_size = (u32) (totalram_pages/global_ioc_cnt);
 
 	/* limit IOVA space size to 1MB-1GB */
 	if (iova_space_size < (1 << (20 - PAGE_SHIFT))) {
@@ -1415,7 +1415,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num)
 	DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n",
 			__func__,
 			ioc->ioc_hpa,
-			(unsigned long) num_physpages >> (20 - PAGE_SHIFT),
+			(unsigned long) totalram_pages >> (20 - PAGE_SHIFT),
 			iova_space_size>>20,
 			iov_order + PAGE_SHIFT);
 
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index f5bbd9e83416..1b7123eb5d7b 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -96,11 +96,7 @@ static struct balloon_stats balloon_stats;
 /* We increase/decrease in batches which fit in a page */
 static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
 
-/* VM /proc information for memory */
-extern unsigned long totalram_pages;
-
 #ifdef CONFIG_HIGHMEM
-extern unsigned long totalhigh_pages;
 #define inc_totalhigh_pages() (totalhigh_pages++)
 #define dec_totalhigh_pages() (totalhigh_pages--)
 #else
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index cd0be3f5c3cd..a44b14cbceeb 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
 		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
 		/* return (void *)__get_free_page(gfp_mask); */
 	}
-	if (likely(size >> PAGE_SHIFT < num_physpages))
+	if (likely((size >> PAGE_SHIFT) < totalram_pages))
 		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
 	return NULL;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d808cf832c4d..19ff81c49ba6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -25,6 +25,7 @@ extern unsigned long max_mapnr;
 #endif
 
 extern unsigned long num_physpages;
+extern unsigned long totalram_pages;
 extern void * high_memory;
 extern int page_cluster;
 
diff --git a/init/main.c b/init/main.c
index 34971becbd3c..2c48c3153163 100644
--- a/init/main.c
+++ b/init/main.c
@@ -668,12 +668,12 @@ asmlinkage void __init start_kernel(void)
 #endif
 	thread_info_cache_init();
 	cred_init();
-	fork_init(num_physpages);
+	fork_init(totalram_pages);
 	proc_caches_init();
 	buffer_init();
 	key_init();
 	security_init();
-	vfs_caches_init(num_physpages);
+	vfs_caches_init(totalram_pages);
 	radix_tree_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfcd..7dfa481c96ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void)
 	 * Fragmentation resistance on low memory - only use bigger
 	 * page orders on machines with more than 32MB of memory.
 	 */
-	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
 
 	/* Bootstrap is tricky, because several objects are allocated
diff --git a/mm/swap.c b/mm/swap.c
index 4a8a59e671f7..308e57d8d7ed 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
  */
 void __init swap_setup(void)
 {
-	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
+	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
 
 #ifdef CONFIG_SWAP
 	bdi_init(swapper_space.backing_dev_info);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9216b2555d07..5535da1d6961 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1386,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
 
 	might_sleep();
 
-	if (count > num_physpages)
+	if (count > totalram_pages)
 		return NULL;
 
 	area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1493,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
 	unsigned long real_size = size;
 
 	size = PAGE_ALIGN(size);
-	if (!size || (size >> PAGE_SHIFT) > num_physpages)
+	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
 		return NULL;
 
 	area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
diff --git a/net/core/sock.c b/net/core/sock.c
index 30d5446512f9..524712a7b154 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1206,12 +1206,12 @@ EXPORT_SYMBOL_GPL(sk_setup_caps);
 
 void __init sk_init(void)
 {
-	if (num_physpages <= 4096) {
+	if (totalram_pages <= 4096) {
 		sysctl_wmem_max = 32767;
 		sysctl_rmem_max = 32767;
 		sysctl_wmem_default = 32767;
 		sysctl_rmem_default = 32767;
-	} else if (num_physpages >= 131072) {
+	} else if (totalram_pages >= 131072) {
 		sysctl_wmem_max = 131071;
 		sysctl_rmem_max = 131071;
 	}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 923db06c7e55..bc4467082a00 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1049,10 +1049,10 @@ static int __init dccp_init(void)
 	 *
 	 * The methodology is similar to that of the buffer cache.
 	 */
-	if (num_physpages >= (128 * 1024))
-		goal = num_physpages >> (21 - PAGE_SHIFT);
+	if (totalram_pages >= (128 * 1024))
+		goal = totalram_pages >> (21 - PAGE_SHIFT);
 	else
-		goal = num_physpages >> (23 - PAGE_SHIFT);
+		goal = totalram_pages >> (23 - PAGE_SHIFT);
 
 	if (thash_entries)
 		goal = (thash_entries *
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 9383d3e5a1ab..57662cabaf9b 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1750,7 +1750,7 @@ void __init dn_route_init(void)
 	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
 	add_timer(&dn_route_timer);
 
-	goal = num_physpages >> (26 - PAGE_SHIFT);
+	goal = totalram_pages >> (26 - PAGE_SHIFT);
 
 	for(order = 0; (1UL << order) < goal; order++)
 		/* NOTHING */;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 91867d3e6328..df9347314538 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3414,7 +3414,7 @@ int __init ip_rt_init(void)
 		alloc_large_system_hash("IP route cache",
 					sizeof(struct rt_hash_bucket),
 					rhash_entries,
-					(num_physpages >= 128 * 1024) ?
+					(totalram_pages >= 128 * 1024) ?
 					15 : 17,
 					0,
 					&rt_hash_log,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 19a0612b8a20..21387ebabf00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2862,7 +2862,7 @@ void __init tcp_init(void)
 		alloc_large_system_hash("TCP established",
 					sizeof(struct inet_ehash_bucket),
 					thash_entries,
-					(num_physpages >= 128 * 1024) ?
+					(totalram_pages >= 128 * 1024) ?
 					13 : 15,
 					0,
 					&tcp_hashinfo.ehash_size,
@@ -2879,7 +2879,7 @@ void __init tcp_init(void)
 		alloc_large_system_hash("TCP bind",
 					sizeof(struct inet_bind_hashbucket),
 					tcp_hashinfo.ehash_size,
-					(num_physpages >= 128 * 1024) ?
+					(totalram_pages >= 128 * 1024) ?
 					13 : 15,
 					0,
 					&tcp_hashinfo.bhash_size,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index b37109817a98..7c9ec3dee96e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1245,9 +1245,9 @@ static int nf_conntrack_init_init_net(void)
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
 	if (!nf_conntrack_htable_size) {
 		nf_conntrack_htable_size
-			= (((num_physpages << PAGE_SHIFT) / 16384)
+			= (((totalram_pages << PAGE_SHIFT) / 16384)
 			   / sizeof(struct hlist_head));
-		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+		if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
 			nf_conntrack_htable_size = 16384;
 		if (nf_conntrack_htable_size < 32)
 			nf_conntrack_htable_size = 32;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index a6ac83a93348..f01955cce314 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -617,7 +617,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 	int cpu;
 
 	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
-	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages)
+	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
 
 	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 219dcdbe388c..dd16e404424f 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -194,9 +194,9 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family)
 	if (minfo->cfg.size)
 		size = minfo->cfg.size;
 	else {
-		size = ((num_physpages << PAGE_SHIFT) / 16384) /
+		size = ((totalram_pages << PAGE_SHIFT) / 16384) /
 		       sizeof(struct list_head);
-		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+		if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
 			size = 8192;
 		if (size < 16)
 			size = 16;
@@ -266,9 +266,9 @@ static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family)
 	if (minfo->cfg.size) {
 		size = minfo->cfg.size;
 	} else {
-		size = (num_physpages << PAGE_SHIFT) / 16384 /
+		size = (totalram_pages << PAGE_SHIFT) / 16384 /
 		       sizeof(struct list_head);
-		if (num_physpages > 1024 * 1024 * 1024 / PAGE_SIZE)
+		if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
 			size = 8192;
 		if (size < 16)
 			size = 16;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c5aab6a368ce..55180b99562a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2091,10 +2091,10 @@ static int __init netlink_proto_init(void)
 	if (!nl_table)
 		goto panic;
 
-	if (num_physpages >= (128 * 1024))
-		limit = num_physpages >> (21 - PAGE_SHIFT);
+	if (totalram_pages >= (128 * 1024))
+		limit = totalram_pages >> (21 - PAGE_SHIFT);
 	else
-		limit = num_physpages >> (23 - PAGE_SHIFT);
+		limit = totalram_pages >> (23 - PAGE_SHIFT);
 
 	order = get_bitmask_order(limit) - 1 + PAGE_SHIFT;
 	limit = (1UL << order) / sizeof(struct hlist_head);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index c557f1fb1c66..612dc878e05c 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1184,10 +1184,10 @@ SCTP_STATIC __init int sctp_init(void)
 	/* Size and allocate the association hash table.
 	 * The methodology is similar to that of the tcp hash tables.
 	 */
-	if (num_physpages >= (128 * 1024))
-		goal = num_physpages >> (22 - PAGE_SHIFT);
+	if (totalram_pages >= (128 * 1024))
+		goal = totalram_pages >> (22 - PAGE_SHIFT);
 	else
-		goal = num_physpages >> (24 - PAGE_SHIFT);
+		goal = totalram_pages >> (24 - PAGE_SHIFT);
 
 	for (order = 0; (1UL << order) < goal; order++)
 		;
-- 
cgit v1.2.3-71-gd317


From 2c85f51d222ccdd8c401d77a36b723a89156810d Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 21 Sep 2009 17:03:07 -0700
Subject: mm: also use alloc_large_system_hash() for the PID hash table

This is being done by allowing boot time allocations to specify that they
may want a sub-page sized amount of memory.

Overall this seems more consistent with the other hash table allocations,
and allows making two supposedly mm-only variables really mm-only
(nr_{kernel,all}_pages).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h |  5 ++---
 kernel/pid.c            | 15 ++++-----------
 mm/page_alloc.c         | 13 ++++++++++---
 3 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index bc3ab7073695..dd97fb8408a8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -132,9 +132,6 @@ static inline void *alloc_remap(int nid, unsigned long size)
 }
 #endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
 
-extern unsigned long __meminitdata nr_kernel_pages;
-extern unsigned long __meminitdata nr_all_pages;
-
 extern void *alloc_large_system_hash(const char *tablename,
 				     unsigned long bucketsize,
 				     unsigned long numentries,
@@ -145,6 +142,8 @@ extern void *alloc_large_system_hash(const char *tablename,
 				     unsigned long limit);
 
 #define HASH_EARLY	0x00000001	/* Allocating during early boot? */
+#define HASH_SMALL	0x00000002	/* sub-page allocation allowed, min
+					 * shift passed via *_hash_shift */
 
 /* Only NUMA needs hash distribution. 64bit NUMA architectures have
  * sufficient vmalloc space.
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..d3f722d20f9c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -40,7 +40,7 @@
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
 static struct hlist_head *pid_hash;
-static int pidhash_shift;
+static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
 int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 void __init pidhash_init(void)
 {
 	int i, pidhash_size;
-	unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
 
-	pidhash_shift = max(4, fls(megabytes * 4));
-	pidhash_shift = min(12, pidhash_shift);
+	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
+					   HASH_EARLY | HASH_SMALL,
+					   &pidhash_shift, NULL, 4096);
 	pidhash_size = 1 << pidhash_shift;
 
-	printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
-		pidhash_size, pidhash_shift,
-		pidhash_size * sizeof(struct hlist_head));
-
-	pid_hash = alloc_bootmem(pidhash_size *	sizeof(*(pid_hash)));
-	if (!pid_hash)
-		panic("Could not alloc pidhash!\n");
 	for (i = 0; i < pidhash_size; i++)
 		INIT_HLIST_HEAD(&pid_hash[i]);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 33b1a4762a7b..770f011e1c12 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -124,8 +124,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
 
 int min_free_kbytes = 1024;
 
-unsigned long __meminitdata nr_kernel_pages;
-unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata nr_kernel_pages;
+static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -4821,7 +4821,14 @@ void *__init alloc_large_system_hash(const char *tablename,
 			numentries <<= (PAGE_SHIFT - scale);
 
 		/* Make sure we've got at least a 0-order allocation.. */
-		if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+		if (unlikely(flags & HASH_SMALL)) {
+			/* Makes no sense without HASH_EARLY */
+			WARN_ON(!(flags & HASH_EARLY));
+			if (!(numentries >> *_hash_shift)) {
+				numentries = 1UL << *_hash_shift;
+				BUG_ON(!numentries);
+			}
+		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
 			numentries = PAGE_SIZE / bucketsize;
 	}
 	numentries = roundup_pow_of_two(numentries);
-- 
cgit v1.2.3-71-gd317


From 1a8670a29b5277cbe601f74ab63d2c5211fb3005 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 21 Sep 2009 17:03:09 -0700
Subject: oom: move oom_killer_enable()/oom_killer_disable to where they belong

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h    | 12 ------------
 include/linux/oom.h    | 11 +++++++++++
 kernel/power/process.c |  1 +
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c32bfa8e7f1e..f53e9b868c26 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -335,18 +335,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(void);
 void drain_local_pages(void *dummy);
 
-extern bool oom_killer_disabled;
-
-static inline void oom_killer_disable(void)
-{
-	oom_killer_disabled = true;
-}
-
-static inline void oom_killer_enable(void)
-{
-	oom_killer_disabled = false;
-}
-
 extern gfp_t gfp_allowed_mask;
 
 static inline void set_gfp_allowed_mask(gfp_t mask)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index a7979baf1e39..6aac5fe4f6f1 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -30,5 +30,16 @@ extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
+extern bool oom_killer_disabled;
+
+static inline void oom_killer_disable(void)
+{
+	oom_killer_disabled = true;
+}
+
+static inline void oom_killer_enable(void)
+{
+	oom_killer_disabled = false;
+}
 #endif /* __KERNEL__*/
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index da2072d73811..cc2e55373b68 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -9,6 +9,7 @@
 #undef DEBUG
 
 #include <linux/interrupt.h>
+#include <linux/oom.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
-- 
cgit v1.2.3-71-gd317


From f86296317434b21585e229f6c49a33cb9ebab4d3 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Mon, 21 Sep 2009 17:03:11 -0700
Subject: mm: do batched scans for mem_cgroup

For mem_cgroup, shrink_zone() may call shrink_list() with nr_to_scan=1, in
which case shrink_list() _still_ calls isolate_pages() with the much
larger SWAP_CLUSTER_MAX.  It effectively scales up the inactive list scan
rate by up to 32 times.

For example, with 16k inactive pages and DEF_PRIORITY=12, (16k >> 12)=4.
So when shrink_zone() expects to scan 4 pages in the active/inactive list,
the active list will be scanned 4 pages, while the inactive list will be
(over) scanned SWAP_CLUSTER_MAX=32 pages in effect.  And that could break
the balance between the two lists.

It can further impact the scan of anon active list, due to the anon
active/inactive ratio rebalance logic in balance_pgdat()/shrink_zone():

inactive anon list over scanned => inactive_anon_is_low() == TRUE
                                => shrink_active_list()
                                => active anon list over scanned

So the end result may be

- anon inactive  => over scanned
- anon active    => over scanned (maybe not as much)
- file inactive  => over scanned
- file active    => under scanned (relatively)

The accesses to nr_saved_scan are not lock protected and so not 100%
accurate, however we can tolerate small errors and the resulted small
imbalanced scan rates between zones.

Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  6 +++++-
 mm/page_alloc.c        |  2 +-
 mm/vmscan.c            | 20 +++++++++++---------
 3 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9c50309b30a1..c188ea624c74 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -273,6 +273,11 @@ struct zone_reclaim_stat {
 	 */
 	unsigned long		recent_rotated[2];
 	unsigned long		recent_scanned[2];
+
+	/*
+	 * accumulated for batching
+	 */
+	unsigned long		nr_saved_scan[NR_LRU_LISTS];
 };
 
 struct zone {
@@ -327,7 +332,6 @@ struct zone {
 	spinlock_t		lru_lock;	
 	struct zone_lru {
 		struct list_head list;
-		unsigned long nr_saved_scan;	/* accumulated for batching */
 	} lru[NR_LRU_LISTS];
 
 	struct zone_reclaim_stat reclaim_stat;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 770f011e1c12..84d9da1e8f4c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3809,7 +3809,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone_pcp_init(zone);
 		for_each_lru(l) {
 			INIT_LIST_HEAD(&zone->lru[l].list);
-			zone->lru[l].nr_saved_scan = 0;
+			zone->reclaim_stat.nr_saved_scan[l] = 0;
 		}
 		zone->reclaim_stat.recent_rotated[0] = 0;
 		zone->reclaim_stat.recent_rotated[1] = 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5432c230c4cb..0e7f5e4a22d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1586,6 +1586,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	enum lru_list l;
 	unsigned long nr_reclaimed = sc->nr_reclaimed;
 	unsigned long swap_cluster_max = sc->swap_cluster_max;
+	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 	int noswap = 0;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
@@ -1605,12 +1606,9 @@ static void shrink_zone(int priority, struct zone *zone,
 			scan >>= priority;
 			scan = (scan * percent[file]) / 100;
 		}
-		if (scanning_global_lru(sc))
-			nr[l] = nr_scan_try_batch(scan,
-						  &zone->lru[l].nr_saved_scan,
-						  swap_cluster_max);
-		else
-			nr[l] = scan;
+		nr[l] = nr_scan_try_batch(scan,
+					  &reclaim_stat->nr_saved_scan[l],
+					  swap_cluster_max);
 	}
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2220,6 +2218,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 {
 	struct zone *zone;
 	unsigned long nr_reclaimed = 0;
+	struct zone_reclaim_stat *reclaim_stat;
 
 	for_each_populated_zone(zone) {
 		enum lru_list l;
@@ -2236,11 +2235,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 						l == LRU_ACTIVE_FILE))
 				continue;
 
-			zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
-			if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
+			reclaim_stat = get_reclaim_stat(zone, sc);
+			reclaim_stat->nr_saved_scan[l] +=
+						(lru_pages >> prio) + 1;
+			if (reclaim_stat->nr_saved_scan[l]
+						>= nr_pages || pass > 3) {
 				unsigned long nr_to_scan;
 
-				zone->lru[l].nr_saved_scan = 0;
+				reclaim_stat->nr_saved_scan[l] = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
 				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
-- 
cgit v1.2.3-71-gd317


From 28b83c5193e7ab951e402252278f2cc79dc4d298 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 21 Sep 2009 17:03:13 -0700
Subject: oom: move oom_adj value from task_struct to signal_struct

Currently, OOM logic callflow is here.

    __out_of_memory()
        select_bad_process()            for each task
            badness()                   calculate badness of one task
                oom_kill_process()      search child
                    oom_kill_task()     kill target task and mm shared tasks with it

example, process-A have two thread, thread-A and thread-B and it have very
fat memory and each thread have following oom_adj and oom_score.

     thread-A: oom_adj = OOM_DISABLE, oom_score = 0
     thread-B: oom_adj = 0,           oom_score = very-high

Then, select_bad_process() select thread-B, but oom_kill_task() refuse
kill the task because thread-A have OOM_DISABLE.  Thus __out_of_memory()
call select_bad_process() again.  but select_bad_process() select the same
task.  It mean kernel fall in livelock.

The fact is, select_bad_process() must select killable task.  otherwise
OOM logic go into livelock.

And root cause is, oom_adj shouldn't be per-thread value.  it should be
per-process value because OOM-killer kill a process, not thread.  Thus
This patch moves oomkilladj (now more appropriately named oom_adj) from
struct task_struct to struct signal_struct.  it naturally prevent
select_bad_process() choose wrong task.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c        | 24 ++++++++++++++++++++----
 include/linux/sched.h |  3 ++-
 kernel/fork.c         |  2 ++
 mm/oom_kill.c         | 34 +++++++++++++++-------------------
 4 files changed, 39 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6658a9..81cfff82875b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 	char buffer[PROC_NUMBUF];
 	size_t len;
-	int oom_adjust;
+	int oom_adjust = OOM_DISABLE;
+	unsigned long flags;
 
 	if (!task)
 		return -ESRCH;
-	oom_adjust = task->oomkilladj;
+
+	if (lock_task_sighand(task, &flags)) {
+		oom_adjust = task->signal->oom_adj;
+		unlock_task_sighand(task, &flags);
+	}
+
 	put_task_struct(task);
 
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1017,6 +1023,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	struct task_struct *task;
 	char buffer[PROC_NUMBUF], *end;
 	int oom_adjust;
+	unsigned long flags;
 
 	memset(buffer, 0, sizeof(buffer));
 	if (count > sizeof(buffer) - 1)
@@ -1032,11 +1039,20 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	task = get_proc_task(file->f_path.dentry->d_inode);
 	if (!task)
 		return -ESRCH;
-	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
+	if (!lock_task_sighand(task, &flags)) {
+		put_task_struct(task);
+		return -ESRCH;
+	}
+
+	if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
+		unlock_task_sighand(task, &flags);
 		put_task_struct(task);
 		return -EACCES;
 	}
-	task->oomkilladj = oom_adjust;
+
+	task->signal->oom_adj = oom_adjust;
+
+	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 	if (end - buffer == 0)
 		return -EIO;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 899d7304d594..17e9a8e9a51d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -639,6 +639,8 @@ struct signal_struct {
 	unsigned audit_tty;
 	struct tty_audit_buf *tty_audit_buf;
 #endif
+
+	int oom_adj;	/* OOM kill score adjustment (bit shift) */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
@@ -1221,7 +1223,6 @@ struct task_struct {
 	 * a short time
 	 */
 	unsigned char fpu_counter;
-	s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 73a442b7be6d..1020977b57ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -880,6 +880,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 	tty_audit_fork(sig);
 
+	sig->oom_adj = current->signal->oom_adj;
+
 	return 0;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index da4c342f2641..630b77fe862f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,6 +58,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
 	struct task_struct *child;
+	int oom_adj = p->signal->oom_adj;
+
+	if (oom_adj == OOM_DISABLE)
+		return 0;
 
 	task_lock(p);
 	mm = p->mm;
@@ -148,15 +152,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		points /= 8;
 
 	/*
-	 * Adjust the score by oomkilladj.
+	 * Adjust the score by oom_adj.
 	 */
-	if (p->oomkilladj) {
-		if (p->oomkilladj > 0) {
+	if (oom_adj) {
+		if (oom_adj > 0) {
 			if (!points)
 				points = 1;
-			points <<= p->oomkilladj;
+			points <<= oom_adj;
 		} else
-			points >>= -(p->oomkilladj);
+			points >>= -(oom_adj);
 	}
 
 #ifdef DEBUG
@@ -251,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 			*ppoints = ULONG_MAX;
 		}
 
-		if (p->oomkilladj == OOM_DISABLE)
+		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
 		points = badness(p, uptime.tv_sec);
@@ -304,7 +308,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
 		}
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
 		       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
-		       get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+		       get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
 		       p->comm);
 		task_unlock(p);
 	} while_each_thread(g, p);
@@ -359,18 +363,9 @@ static int oom_kill_task(struct task_struct *p)
 	 * change to NULL at any time since we do not hold task_lock(p).
 	 * However, this is of no concern to us.
 	 */
-
-	if (mm == NULL)
+	if (!mm || p->signal->oom_adj == OOM_DISABLE)
 		return 1;
 
-	/*
-	 * Don't kill the process if any threads are set to OOM_DISABLE
-	 */
-	do_each_thread(g, q) {
-		if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
-			return 1;
-	} while_each_thread(g, q);
-
 	__oom_kill_task(p, 1);
 
 	/*
@@ -394,8 +389,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 	if (printk_ratelimit()) {
 		printk(KERN_WARNING "%s invoked oom-killer: "
-			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
-			current->comm, gfp_mask, order, current->oomkilladj);
+			"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
+			current->comm, gfp_mask, order,
+			current->signal->oom_adj);
 		task_lock(current);
 		cpuset_print_task_mems_allowed(current);
 		task_unlock(current);
-- 
cgit v1.2.3-71-gd317


From 5f8dcc21211a3d4e3a7a5ca366b469fb88117f61 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Mon, 21 Sep 2009 17:03:19 -0700
Subject: page-allocator: split per-cpu list into one-list-per-migrate-type

The following two patches remove searching in the page allocator fast-path
by maintaining multiple free-lists in the per-cpu structure.  At the time
the search was introduced, increasing the per-cpu structures would waste a
lot of memory as per-cpu structures were statically allocated at
compile-time.  This is no longer the case.

The patches are as follows. They are based on mmotm-2009-08-27.

Patch 1 adds multiple lists to struct per_cpu_pages, one per
	migratetype that can be stored on the PCP lists.

Patch 2 notes that the pcpu drain path check empty lists multiple times. The
	patch reduces the number of checks by maintaining a count of free
	lists encountered. Lists containing pages will then free multiple
	pages in batch

The patches were tested with kernbench, netperf udp/tcp, hackbench and
sysbench.  The netperf tests were not bound to any CPU in particular and
were run such that the results should be 99% confidence that the reported
results are within 1% of the estimated mean.  sysbench was run with a
postgres background and read-only tests.  Similar to netperf, it was run
multiple times so that it's 99% confidence results are within 1%.  The
patches were tested on x86, x86-64 and ppc64 as

x86:	Intel Pentium D 3GHz with 8G RAM (no-brand machine)
	kernbench	- No significant difference, variance well within noise
	netperf-udp	- 1.34% to 2.28% gain
	netperf-tcp	- 0.45% to 1.22% gain
	hackbench	- Small variances, very close to noise
	sysbench	- Very small gains

x86-64:	AMD Phenom 9950 1.3GHz with 8G RAM (no-brand machine)
	kernbench	- No significant difference, variance well within noise
	netperf-udp	- 1.83% to 10.42% gains
	netperf-tcp	- No conclusive until buffer >= PAGE_SIZE
				4096	+15.83%
				8192	+ 0.34% (not significant)
				16384	+ 1%
	hackbench	- Small gains, very close to noise
	sysbench	- 0.79% to 1.6% gain

ppc64:	PPC970MP 2.5GHz with 10GB RAM (it's a terrasoft powerstation)
	kernbench	- No significant difference, variance well within noise
	netperf-udp	- 2-3% gain for almost all buffer sizes tested
	netperf-tcp	- losses on small buffers, gains on larger buffers
			  possibly indicates some bad caching effect.
	hackbench	- No significant difference
	sysbench	- 2-4% gain

This patch:

Currently the per-cpu page allocator searches the PCP list for pages of
the correct migrate-type to reduce the possibility of pages being
inappropriate placed from a fragmentation perspective.  This search is
potentially expensive in a fast-path and undesirable.  Splitting the
per-cpu list into multiple lists increases the size of a per-cpu structure
and this was potentially a major problem at the time the search was
introduced.  These problem has been mitigated as now only the necessary
number of structures is allocated for the running system.

This patch replaces a list search in the per-cpu allocator with one list
per migrate type.  The potential snag with this approach is when bulk
freeing pages.  We round-robin free pages based on migrate type which has
little bearing on the cache hotness of the page and potentially checks
empty lists repeatedly in the event the majority of PCP pages are of one
type.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Nick Piggin <npiggin@suse.de>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |   5 ++-
 mm/page_alloc.c        | 106 +++++++++++++++++++++++++++----------------------
 2 files changed, 63 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c188ea624c74..652ef01be582 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -38,6 +38,7 @@
 #define MIGRATE_UNMOVABLE     0
 #define MIGRATE_RECLAIMABLE   1
 #define MIGRATE_MOVABLE       2
+#define MIGRATE_PCPTYPES      3 /* the number of types on the pcp lists */
 #define MIGRATE_RESERVE       3
 #define MIGRATE_ISOLATE       4 /* can't allocate from here */
 #define MIGRATE_TYPES         5
@@ -169,7 +170,9 @@ struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
-	struct list_head list;	/* the list of pages */
+
+	/* Lists of pages, one per migrate type stored on the pcp-lists */
+	struct list_head lists[MIGRATE_PCPTYPES];
 };
 
 struct per_cpu_pageset {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 84d9da1e8f4c..1b1c39e6a9b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -511,7 +511,7 @@ static inline int free_pages_check(struct page *page)
 }
 
 /*
- * Frees a list of pages. 
+ * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free.
  *
@@ -521,23 +521,36 @@ static inline int free_pages_check(struct page *page)
  * And clear the zone's pages_scanned counter, to hold off the "all pages are
  * pinned" detection logic.
  */
-static void free_pages_bulk(struct zone *zone, int count,
-					struct list_head *list, int order)
+static void free_pcppages_bulk(struct zone *zone, int count,
+					struct per_cpu_pages *pcp)
 {
+	int migratetype = 0;
+
 	spin_lock(&zone->lock);
 	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
 	zone->pages_scanned = 0;
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
 	while (count--) {
 		struct page *page;
+		struct list_head *list;
+
+		/*
+		 * Remove pages from lists in a round-robin fashion. This spinning
+		 * around potentially empty lists is bloody awful, alternatives that
+		 * don't suck are welcome
+		 */
+		do {
+			if (++migratetype == MIGRATE_PCPTYPES)
+				migratetype = 0;
+			list = &pcp->lists[migratetype];
+		} while (list_empty(list));
 
-		VM_BUG_ON(list_empty(list));
 		page = list_entry(list->prev, struct page, lru);
 		/* have to delete it as __free_one_page list manipulates */
 		list_del(&page->lru);
-		trace_mm_page_pcpu_drain(page, order, page_private(page));
-		__free_one_page(page, zone, order, page_private(page));
+		trace_mm_page_pcpu_drain(page, 0, migratetype);
+		__free_one_page(page, zone, 0, migratetype);
 	}
 	spin_unlock(&zone->lock);
 }
@@ -953,7 +966,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
-	free_pages_bulk(zone, to_drain, &pcp->list, 0);
+	free_pcppages_bulk(zone, to_drain, pcp);
 	pcp->count -= to_drain;
 	local_irq_restore(flags);
 }
@@ -979,7 +992,7 @@ static void drain_pages(unsigned int cpu)
 
 		pcp = &pset->pcp;
 		local_irq_save(flags);
-		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+		free_pcppages_bulk(zone, pcp->count, pcp);
 		pcp->count = 0;
 		local_irq_restore(flags);
 	}
@@ -1045,6 +1058,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
+	int migratetype;
 	int wasMlocked = __TestClearPageMlocked(page);
 
 	kmemcheck_free_shadow(page, 0);
@@ -1062,21 +1076,39 @@ static void free_hot_cold_page(struct page *page, int cold)
 	kernel_map_pages(page, 1, 0);
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp;
-	set_page_private(page, get_pageblock_migratetype(page));
+	migratetype = get_pageblock_migratetype(page);
+	set_page_private(page, migratetype);
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
 	__count_vm_event(PGFREE);
 
+	/*
+	 * We only track unmovable, reclaimable and movable on pcp lists.
+	 * Free ISOLATE pages back to the allocator because they are being
+	 * offlined but treat RESERVE as movable pages so we can get those
+	 * areas back if necessary. Otherwise, we may have to free
+	 * excessively into the page allocator
+	 */
+	if (migratetype >= MIGRATE_PCPTYPES) {
+		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+			free_one_page(zone, page, 0, migratetype);
+			goto out;
+		}
+		migratetype = MIGRATE_MOVABLE;
+	}
+
 	if (cold)
-		list_add_tail(&page->lru, &pcp->list);
+		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	else
-		list_add(&page->lru, &pcp->list);
+		list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
-		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+		free_pcppages_bulk(zone, pcp->batch, pcp);
 		pcp->count -= pcp->batch;
 	}
+
+out:
 	local_irq_restore(flags);
 	put_cpu();
 }
@@ -1134,46 +1166,24 @@ again:
 	cpu  = get_cpu();
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
+		struct list_head *list;
 
 		pcp = &zone_pcp(zone, cpu)->pcp;
+		list = &pcp->lists[migratetype];
 		local_irq_save(flags);
-		if (!pcp->count) {
-			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list,
-					migratetype, cold);
-			if (unlikely(!pcp->count))
-				goto failed;
-		}
-
-		/* Find a page of the appropriate migrate type */
-		if (cold) {
-			list_for_each_entry_reverse(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		} else {
-			list_for_each_entry(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		}
-
-		/* Allocate more to the pcp list if necessary */
-		if (unlikely(&page->lru == &pcp->list)) {
-			int get_one_page = 0;
-
+		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list,
+					pcp->batch, list,
 					migratetype, cold);
-			list_for_each_entry(page, &pcp->list, lru) {
-				if (get_pageblock_migratetype(page) !=
-					    MIGRATE_ISOLATE) {
-					get_one_page = 1;
-					break;
-				}
-			}
-			if (!get_one_page)
+			if (unlikely(list_empty(list)))
 				goto failed;
 		}
 
+		if (cold)
+			page = list_entry(list->prev, struct page, lru);
+		else
+			page = list_entry(list->next, struct page, lru);
+
 		list_del(&page->lru);
 		pcp->count--;
 	} else {
@@ -3024,6 +3034,7 @@ static int zone_batchsize(struct zone *zone)
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 {
 	struct per_cpu_pages *pcp;
+	int migratetype;
 
 	memset(p, 0, sizeof(*p));
 
@@ -3031,7 +3042,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 	pcp->count = 0;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
+	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
 
 /*
@@ -3223,7 +3235,7 @@ static int __zone_pcp_update(void *data)
 		pcp = &pset->pcp;
 
 		local_irq_save(flags);
-		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+		free_pcppages_bulk(zone, pcp->count, pcp);
 		setup_pageset(pset, batch);
 		local_irq_restore(flags);
 	}
-- 
cgit v1.2.3-71-gd317


From f3e8fccd06d27773186a0094371daf2d84c79469 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:03:25 -0700
Subject: mm: add get_dump_page

In preparation for the next patch, add a simple get_dump_page(addr)
interface for the CONFIG_ELF_CORE dumpers to use, instead of calling
get_user_pages() directly.  They're not interested in errors: they
just want to use holes as much as possible, to save space and make
sure that the data is aligned where the headers said it would be.

Oh, and don't use that horrid DUMP_SEEK(off) macro!

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c       | 44 +++++++++++++---------------------------
 fs/binfmt_elf_fdpic.c | 56 +++++++++++++++++----------------------------------
 include/linux/mm.h    |  1 +
 mm/memory.c           | 33 +++++++++++++++++++++++++++++-
 4 files changed, 66 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7c1e65d54872..442d94fe255c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file,
 #define DUMP_WRITE(addr, nr)	\
 	if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
 		goto end_coredump;
-#define DUMP_SEEK(off)	\
-	if (!dump_seek(file, (off))) \
-		goto end_coredump;
 
 static void fill_elf_header(struct elfhdr *elf, int segs,
 			    u16 machine, u32 flags, u8 osabi)
@@ -2016,7 +2013,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 		goto end_coredump;
 
 	/* Align to page */
-	DUMP_SEEK(dataoff - foffset);
+	if (!dump_seek(file, dataoff - foffset))
+		goto end_coredump;
 
 	for (vma = first_vma(current, gate_vma); vma != NULL;
 			vma = next_vma(vma, gate_vma)) {
@@ -2027,33 +2025,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 
 		for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
 			struct page *page;
-			struct vm_area_struct *tmp_vma;
-
-			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
-						&page, &tmp_vma) <= 0) {
-				DUMP_SEEK(PAGE_SIZE);
-			} else {
-				if (page == ZERO_PAGE(0)) {
-					if (!dump_seek(file, PAGE_SIZE)) {
-						page_cache_release(page);
-						goto end_coredump;
-					}
-				} else {
-					void *kaddr;
-					flush_cache_page(tmp_vma, addr,
-							 page_to_pfn(page));
-					kaddr = kmap(page);
-					if ((size += PAGE_SIZE) > limit ||
-					    !dump_write(file, kaddr,
-					    PAGE_SIZE)) {
-						kunmap(page);
-						page_cache_release(page);
-						goto end_coredump;
-					}
-					kunmap(page);
-				}
+			int stop;
+
+			page = get_dump_page(addr);
+			if (page) {
+				void *kaddr = kmap(page);
+				stop = ((size += PAGE_SIZE) > limit) ||
+					!dump_write(file, kaddr, PAGE_SIZE);
+				kunmap(page);
 				page_cache_release(page);
-			}
+			} else
+				stop = !dump_seek(file, PAGE_SIZE);
+			if (stop)
+				goto end_coredump;
 		}
 	}
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 20fbeced472b..76285471073e 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1325,9 +1325,6 @@ static int writenote(struct memelfnote *men, struct file *file)
 #define DUMP_WRITE(addr, nr)	\
 	if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
 		goto end_coredump;
-#define DUMP_SEEK(off)	\
-	if (!dump_seek(file, (off))) \
-		goto end_coredump;
 
 static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
 {
@@ -1518,6 +1515,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 			   unsigned long *limit, unsigned long mm_flags)
 {
 	struct vm_area_struct *vma;
+	int err = 0;
 
 	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		unsigned long addr;
@@ -1525,43 +1523,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 		if (!maydump(vma, mm_flags))
 			continue;
 
-		for (addr = vma->vm_start;
-		     addr < vma->vm_end;
-		     addr += PAGE_SIZE
-		     ) {
-			struct vm_area_struct *vma;
-			struct page *page;
-
-			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
-					   &page, &vma) <= 0) {
-				DUMP_SEEK(file->f_pos + PAGE_SIZE);
-			}
-			else if (page == ZERO_PAGE(0)) {
-				page_cache_release(page);
-				DUMP_SEEK(file->f_pos + PAGE_SIZE);
-			}
-			else {
-				void *kaddr;
-
-				flush_cache_page(vma, addr, page_to_pfn(page));
-				kaddr = kmap(page);
-				if ((*size += PAGE_SIZE) > *limit ||
-				    !dump_write(file, kaddr, PAGE_SIZE)
-				    ) {
-					kunmap(page);
-					page_cache_release(page);
-					return -EIO;
-				}
+		for (addr = vma->vm_start; addr < vma->vm_end;
+							addr += PAGE_SIZE) {
+			struct page *page = get_dump_page(addr);
+			if (page) {
+				void *kaddr = kmap(page);
+				*size += PAGE_SIZE;
+				if (*size > *limit)
+					err = -EFBIG;
+				else if (!dump_write(file, kaddr, PAGE_SIZE))
+					err = -EIO;
 				kunmap(page);
 				page_cache_release(page);
-			}
+			} else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+				err = -EFBIG;
+			if (err)
+				goto out;
 		}
 	}
-
-	return 0;
-
-end_coredump:
-	return -EFBIG;
+out:
+	return err;
 }
 #endif
 
@@ -1802,7 +1783,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 				goto end_coredump;
 	}
 
-	DUMP_SEEK(dataoff);
+	if (!dump_seek(file, dataoff))
+		goto end_coredump;
 
 	if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
 		goto end_coredump;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 19ff81c49ba6..e41795bba95d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -817,6 +817,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			struct page **pages, struct vm_area_struct **vmas);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
+struct page *get_dump_page(unsigned long addr);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned long offset);
diff --git a/mm/memory.c b/mm/memory.c
index 4b5200f5f35a..a8430ff13837 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1423,9 +1423,40 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
-
 EXPORT_SYMBOL(get_user_pages);
 
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+
+	if (__get_user_pages(current, current->mm, addr, 1,
+				GUP_FLAGS_FORCE, &page, &vma) < 1)
+		return NULL;
+	if (page == ZERO_PAGE(0)) {
+		page_cache_release(page);
+		return NULL;
+	}
+	flush_cache_page(vma, addr, page_to_pfn(page));
+	return page;
+}
+#endif /* CONFIG_ELF_CORE */
+
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
 			spinlock_t **ptl)
 {
-- 
cgit v1.2.3-71-gd317


From 8e4b9a60718970bbc02dfd3abd0b956ab65af231 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:03:26 -0700
Subject: mm: FOLL_DUMP replace FOLL_ANON

The "FOLL_ANON optimization" and its use_zero_page() test have caused
confusion and bugs: why does it test VM_SHARED? for the very good but
unsatisfying reason that VMware crashed without.  As we look to maybe
reinstating anonymous use of the ZERO_PAGE, we need to sort this out.

Easily done: it's silly for __get_user_pages() and follow_page() to
be guessing whether it's safe to assume that they're being used for
a coredump (which can take a shortcut snapshot where other uses must
handle a fault) - just tell them with GUP_FLAGS_DUMP and FOLL_DUMP.

get_dump_page() doesn't even want a ZERO_PAGE: an error suits fine.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/internal.h      |  1 +
 mm/memory.c        | 43 ++++++++++++-------------------------------
 3 files changed, 14 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e41795bba95d..45ee5b5a343d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1231,7 +1231,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_WRITE	0x01	/* check pte is writable */
 #define FOLL_TOUCH	0x02	/* mark page accessed */
 #define FOLL_GET	0x04	/* do get_page on page */
-#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
+#define FOLL_DUMP	0x08	/* give error on hole if it would be zero */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/mm/internal.h b/mm/internal.h
index 166765cd58d6..d41475078b20 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -252,6 +252,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 
 #define GUP_FLAGS_WRITE		0x01
 #define GUP_FLAGS_FORCE		0x02
+#define GUP_FLAGS_DUMP		0x04
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
diff --git a/mm/memory.c b/mm/memory.c
index a8430ff13837..532a55bce6a4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1174,41 +1174,22 @@ no_page:
 	pte_unmap_unlock(ptep, ptl);
 	if (!pte_none(pte))
 		return page;
-	/* Fall through to ZERO_PAGE handling */
+
 no_page_table:
 	/*
 	 * When core dumping an enormous anonymous area that nobody
-	 * has touched so far, we don't want to allocate page tables.
+	 * has touched so far, we don't want to allocate unnecessary pages or
+	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
+	 * then get_dump_page() will return NULL to leave a hole in the dump.
+	 * But we can only make this optimization where a hole would surely
+	 * be zero-filled if handle_mm_fault() actually did handle it.
 	 */
-	if (flags & FOLL_ANON) {
-		page = ZERO_PAGE(0);
-		if (flags & FOLL_GET)
-			get_page(page);
-		BUG_ON(flags & FOLL_WRITE);
-	}
+	if ((flags & FOLL_DUMP) &&
+	    (!vma->vm_ops || !vma->vm_ops->fault))
+		return ERR_PTR(-EFAULT);
 	return page;
 }
 
-/* Can we do the FOLL_ANON optimization? */
-static inline int use_zero_page(struct vm_area_struct *vma)
-{
-	/*
-	 * We don't want to optimize FOLL_ANON for make_pages_present()
-	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
-	 * we want to get the page from the page tables to make sure
-	 * that we serialize and update with any other user of that
-	 * mapping.
-	 */
-	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
-		return 0;
-	/*
-	 * And if we have a fault routine, it's not an anonymous region.
-	 */
-	return !vma->vm_ops || !vma->vm_ops->fault;
-}
-
-
-
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int nr_pages, int flags,
 		     struct page **pages, struct vm_area_struct **vmas)
@@ -1288,8 +1269,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		foll_flags = FOLL_TOUCH;
 		if (pages)
 			foll_flags |= FOLL_GET;
-		if (!write && use_zero_page(vma))
-			foll_flags |= FOLL_ANON;
+		if (flags & GUP_FLAGS_DUMP)
+			foll_flags |= FOLL_DUMP;
 
 		do {
 			struct page *page;
@@ -1446,7 +1427,7 @@ struct page *get_dump_page(unsigned long addr)
 	struct page *page;
 
 	if (__get_user_pages(current, current->mm, addr, 1,
-				GUP_FLAGS_FORCE, &page, &vma) < 1)
+			GUP_FLAGS_FORCE | GUP_FLAGS_DUMP, &page, &vma) < 1)
 		return NULL;
 	if (page == ZERO_PAGE(0)) {
 		page_cache_release(page);
-- 
cgit v1.2.3-71-gd317


From 2a15efc953b26ad57d7d38b9e6782d57e53b4ab2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:03:27 -0700
Subject: mm: follow_hugetlb_page flags

follow_hugetlb_page() shouldn't be guessing about the coredump case
either: pass the foll_flags down to it, instead of just the write bit.

Remove that obscure huge_zeropage_ok() test.  The decision is easy,
though unlike the non-huge case - here vm_ops->fault is always set.
But we know that a fault would serve up zeroes, unless there's
already a hugetlbfs pagecache page to back the range.

(Alternatively, since hugetlb pages aren't swapped out under pressure,
you could save more dump space by arguing that a page not yet faulted
into this process cannot be relevant to the dump; but that would be
more surprising.)

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 +++-
 mm/hugetlb.c            | 62 +++++++++++++++++++++++++++++--------------------
 mm/memory.c             | 14 ++++++-----
 3 files changed, 48 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 16cdb75a543a..e7f0fabfa1c2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -24,7 +24,9 @@ int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *
 int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
-int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int, int);
+int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
+			struct page **, struct vm_area_struct **,
+			unsigned long *, int *, int, unsigned int flags);
 void unmap_hugepage_range(struct vm_area_struct *,
 			unsigned long, unsigned long, struct page *);
 void __unmap_hugepage_range(struct vm_area_struct *,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c001f846f17d..6b41f70bbc7f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2016,6 +2016,23 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 	return find_lock_page(mapping, idx);
 }
 
+/* Return whether there is a pagecache page to back given address within VMA */
+static bool hugetlbfs_backed(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct address_space *mapping;
+	pgoff_t idx;
+	struct page *page;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, address);
+
+	page = find_get_page(mapping, idx);
+	if (page)
+		put_page(page);
+	return page != NULL;
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, unsigned int flags)
 {
@@ -2211,54 +2228,52 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
-{
-	if (!ptep || write || shared)
-		return 0;
-	else
-		return huge_pte_none(huge_ptep_get(ptep));
-}
-
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
-			int write)
+			unsigned int flags)
 {
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
-	int zeropage_ok = 0;
-	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		int absent;
 		struct page *page;
 
 		/*
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
-		 * each hugepage.  We have to make * sure we get the
+		 * each hugepage.  We have to make sure we get the
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
-		if (huge_zeropage_ok(pte, write, shared))
-			zeropage_ok = 1;
+		absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+		/*
+		 * When coredumping, it suits get_dump_page if we just return
+		 * an error if there's a hole and no huge pagecache to back it.
+		 */
+		if (absent &&
+		    ((flags & FOLL_DUMP) && !hugetlbfs_backed(h, vma, vaddr))) {
+			remainder = 0;
+			break;
+		}
 
-		if (!pte ||
-		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
-		    (write && !pte_write(huge_ptep_get(pte)))) {
+		if (absent ||
+		    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
 			spin_unlock(&mm->page_table_lock);
-			ret = hugetlb_fault(mm, vma, vaddr, write);
+			ret = hugetlb_fault(mm, vma, vaddr,
+				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
 			spin_lock(&mm->page_table_lock);
 			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
 			remainder = 0;
-			if (!i)
-				i = -EFAULT;
 			break;
 		}
 
@@ -2266,10 +2281,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			if (zeropage_ok)
-				pages[i] = ZERO_PAGE(0);
-			else
-				pages[i] = mem_map_offset(page, pfn_offset);
+			pages[i] = mem_map_offset(page, pfn_offset);
 			get_page(pages[i]);
 		}
 
@@ -2293,7 +2305,7 @@ same_page:
 	*length = remainder;
 	*position = vaddr;
 
-	return i;
+	return i ? i : -EFAULT;
 }
 
 void hugetlb_change_protection(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 532a55bce6a4..6359a4f80c4a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1260,17 +1260,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		    !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
-		if (is_vm_hugetlb_page(vma)) {
-			i = follow_hugetlb_page(mm, vma, pages, vmas,
-						&start, &nr_pages, i, write);
-			continue;
-		}
-
 		foll_flags = FOLL_TOUCH;
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (flags & GUP_FLAGS_DUMP)
 			foll_flags |= FOLL_DUMP;
+		if (write)
+			foll_flags |= FOLL_WRITE;
+
+		if (is_vm_hugetlb_page(vma)) {
+			i = follow_hugetlb_page(mm, vma, pages, vmas,
+					&start, &nr_pages, i, foll_flags);
+			continue;
+		}
 
 		do {
 			struct page *page;
-- 
cgit v1.2.3-71-gd317


From 58fa879e1e640a1856f736b418984ebeccee1c95 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:03:31 -0700
Subject: mm: FOLL flags for GUP flags

__get_user_pages() has been taking its own GUP flags, then processing
them into FOLL flags for follow_page().  Though oddly named, the FOLL
flags are more widely used, so pass them to __get_user_pages() now.
Sorry, VM flags, VM_FAULT flags and FAULT_FLAGs are still distinct.

(The patch to __get_user_pages() looks peculiar, with both gup_flags
and foll_flags: the gup_flags remain constant; but as before there's
an exceptional case, out of scope of the patch, in which foll_flags
per page have FOLL_WRITE masked off.)

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  1 +
 mm/internal.h      |  6 +-----
 mm/memory.c        | 44 +++++++++++++++++++-------------------------
 mm/mlock.c         |  4 ++--
 mm/nommu.c         | 16 ++++++++--------
 5 files changed, 31 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 45ee5b5a343d..5409eced7aae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1232,6 +1232,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_TOUCH	0x02	/* mark page accessed */
 #define FOLL_GET	0x04	/* do get_page on page */
 #define FOLL_DUMP	0x08	/* give error on hole if it would be zero */
+#define FOLL_FORCE	0x10	/* get_user_pages read/write w/o permission */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/mm/internal.h b/mm/internal.h
index d41475078b20..75596574911e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -250,12 +250,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
-#define GUP_FLAGS_WRITE		0x01
-#define GUP_FLAGS_FORCE		0x02
-#define GUP_FLAGS_DUMP		0x04
-
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int len, int flags,
+		     unsigned long start, int len, unsigned int foll_flags,
 		     struct page **pages, struct vm_area_struct **vmas);
 
 #define ZONE_RECLAIM_NOSCAN	-2
diff --git a/mm/memory.c b/mm/memory.c
index c8b5b9435a92..5c694f2b9c12 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1209,27 +1209,29 @@ no_page_table:
 }
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int nr_pages, int flags,
+		     unsigned long start, int nr_pages, unsigned int gup_flags,
 		     struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int vm_flags = 0;
-	int write = !!(flags & GUP_FLAGS_WRITE);
-	int force = !!(flags & GUP_FLAGS_FORCE);
+	unsigned long vm_flags;
 
 	if (nr_pages <= 0)
 		return 0;
+
+	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
 	/* 
 	 * Require read or write permissions.
-	 * If 'force' is set, we only require the "MAY" flags.
+	 * If FOLL_FORCE is set, we only require the "MAY" flags.
 	 */
-	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags  = (gup_flags & FOLL_WRITE) ?
+			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= (gup_flags & FOLL_FORCE) ?
+			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
 		struct vm_area_struct *vma;
-		unsigned int foll_flags;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -1241,7 +1243,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pte_t *pte;
 
 			/* user gate pages are read-only */
-			if (write)
+			if (gup_flags & FOLL_WRITE)
 				return i ? : -EFAULT;
 			if (pg > TASK_SIZE)
 				pgd = pgd_offset_k(pg);
@@ -1278,22 +1280,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		    !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
-		foll_flags = FOLL_TOUCH;
-		if (pages)
-			foll_flags |= FOLL_GET;
-		if (flags & GUP_FLAGS_DUMP)
-			foll_flags |= FOLL_DUMP;
-		if (write)
-			foll_flags |= FOLL_WRITE;
-
 		if (is_vm_hugetlb_page(vma)) {
 			i = follow_hugetlb_page(mm, vma, pages, vmas,
-					&start, &nr_pages, i, foll_flags);
+					&start, &nr_pages, i, gup_flags);
 			continue;
 		}
 
 		do {
 			struct page *page;
+			unsigned int foll_flags = gup_flags;
 
 			/*
 			 * If we have a pending SIGKILL, don't keep faulting
@@ -1302,9 +1297,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (unlikely(fatal_signal_pending(current)))
 				return i ? i : -ERESTARTSYS;
 
-			if (write)
-				foll_flags |= FOLL_WRITE;
-
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
@@ -1415,12 +1407,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int nr_pages, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
 {
-	int flags = 0;
+	int flags = FOLL_TOUCH;
 
+	if (pages)
+		flags |= FOLL_GET;
 	if (write)
-		flags |= GUP_FLAGS_WRITE;
+		flags |= FOLL_WRITE;
 	if (force)
-		flags |= GUP_FLAGS_FORCE;
+		flags |= FOLL_FORCE;
 
 	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
@@ -1447,7 +1441,7 @@ struct page *get_dump_page(unsigned long addr)
 	struct page *page;
 
 	if (__get_user_pages(current, current->mm, addr, 1,
-			GUP_FLAGS_FORCE | GUP_FLAGS_DUMP, &page, &vma) < 1)
+			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
 		return NULL;
 	if (page == ZERO_PAGE(0)) {
 		page_cache_release(page);
diff --git a/mm/mlock.c b/mm/mlock.c
index e13918d4fc4f..22041aa9f5c1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -166,9 +166,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	VM_BUG_ON(end   > vma->vm_end);
 	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
-	gup_flags = 0;
+	gup_flags = FOLL_TOUCH | FOLL_GET;
 	if (vma->vm_flags & VM_WRITE)
-		gup_flags = GUP_FLAGS_WRITE;
+		gup_flags |= FOLL_WRITE;
 
 	while (nr_pages > 0) {
 		int i;
diff --git a/mm/nommu.c b/mm/nommu.c
index 386443e9d2c6..2d02ca17ce18 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -168,20 +168,20 @@ unsigned int kobjsize(const void *objp)
 }
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int nr_pages, int flags,
+		     unsigned long start, int nr_pages, int foll_flags,
 		     struct page **pages, struct vm_area_struct **vmas)
 {
 	struct vm_area_struct *vma;
 	unsigned long vm_flags;
 	int i;
-	int write = !!(flags & GUP_FLAGS_WRITE);
-	int force = !!(flags & GUP_FLAGS_FORCE);
 
 	/* calculate required read or write permissions.
-	 * - if 'force' is set, we only require the "MAY" flags.
+	 * If FOLL_FORCE is set, we only require the "MAY" flags.
 	 */
-	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags  = (foll_flags & FOLL_WRITE) ?
+			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= (foll_flags & FOLL_FORCE) ?
+			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 
 	for (i = 0; i < nr_pages; i++) {
 		vma = find_vma(mm, start);
@@ -223,9 +223,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int flags = 0;
 
 	if (write)
-		flags |= GUP_FLAGS_WRITE;
+		flags |= FOLL_WRITE;
 	if (force)
-		flags |= GUP_FLAGS_FORCE;
+		flags |= FOLL_FORCE;
 
 	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
-- 
cgit v1.2.3-71-gd317


From 3f96b79ad96263cc0ece7bb340cddf9b2ddfb1b3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 21 Sep 2009 17:03:37 -0700
Subject: tmpfs: depend on shmem

CONFIG_SHMEM off gives you (ramfs masquerading as) tmpfs, even when
CONFIG_TMPFS is off: that's a little anomalous, and I'd intended to make
more sense of it by removing CONFIG_TMPFS altogether, always enabling its
code when CONFIG_SHMEM; but so many defconfigs have CONFIG_SHMEM on
CONFIG_TMPFS off that we'd better leave that as is.

But there is no point in asking for CONFIG_TMPFS if CONFIG_SHMEM is off:
make TMPFS depend on SHMEM, which also prevents TMPFS_POSIX_ACL
shmem_acl.o being pointlessly built into the kernel when SHMEM is off.

And a selfish change, to prevent the world from being rebuilt when I
switch between CONFIG_SHMEM on and off: the only CONFIG_SHMEM in the
header files is mm.h shmem_lock() - give that a shmem.c stub instead.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig         |  1 +
 include/linux/mm.h | 11 +----------
 mm/shmem.c         |  5 +++++
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 455aa207e67e..d4bf8caad8d0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,6 +109,7 @@ source "fs/sysfs/Kconfig"
 
 config TMPFS
 	bool "Virtual memory file system support (former shm fs)"
+	depends on SHMEM
 	help
 	  Tmpfs is a file system which keeps all files in virtual memory.
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5409eced7aae..5946e2ff9fe8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -702,17 +702,8 @@ extern void pagefault_out_of_memory(void);
 
 extern void show_free_areas(void);
 
-#ifdef CONFIG_SHMEM
-extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
-#else
-static inline int shmem_lock(struct file *file, int lock,
-			    struct user_struct *user)
-{
-	return 0;
-}
-#endif
+int shmem_lock(struct file *file, int lock, struct user_struct *user);
 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
-
 int shmem_zero_setup(struct vm_area_struct *);
 
 #ifndef CONFIG_MMU
diff --git a/mm/shmem.c b/mm/shmem.c
index 25ba75d02580..b4b56fd1e772 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2593,6 +2593,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 	return 0;
 }
 
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+	return 0;
+}
+
 #define shmem_vm_ops				generic_file_vm_ops
 #define shmem_file_operations			ramfs_file_operations
 #define shmem_get_inode(sb, mode, dev, flags)	ramfs_get_inode(sb, mode, dev)
-- 
cgit v1.2.3-71-gd317


From 6bfde05bf5c9682e255c6a2c669dc80f91af6296 Mon Sep 17 00:00:00 2001
From: Eric B Munson <ebmunson@us.ibm.com>
Date: Mon, 21 Sep 2009 17:03:43 -0700
Subject: hugetlbfs: allow the creation of files suitable for MAP_PRIVATE on
 the vfs internal mount

This patchset adds a flag to mmap that allows the user to request that an
anonymous mapping be backed with huge pages.  This mapping will borrow
functionality from the huge page shm code to create a file on the kernel
internal mount and use it to approximate an anonymous mapping.  The
MAP_HUGETLB flag is a modifier to MAP_ANONYMOUS and will not work without
both flags being preset.

A new flag is necessary because there is no other way to hook into huge
pages without creating a file on a hugetlbfs mount which wouldn't be
MAP_ANONYMOUS.

To userspace, this mapping will behave just like an anonymous mapping
because the file is not accessible outside of the kernel.

This patchset is meant to simplify the programming model.  Presently there
is a large chunk of boiler platecode, contained in libhugetlbfs, required
to create private, hugepage backed mappings.  This patch set would allow
use of hugepages without linking to libhugetlbfs or having hugetblfs
mounted.

Unification of the VM code would provide these same benefits, but it has
been resisted each time that it has been suggested for several reasons: it
would break PAGE_SIZE assumptions across the kernel, it makes page-table
abstractions really expensive, and it does not provide any benefit on
architectures that do not support huge pages, incurring fast path
penalties without providing any benefit on these architectures.

This patch:

There are two means of creating mappings backed by huge pages:

        1. mmap() a file created on hugetlbfs
        2. Use shm which creates a file on an internal mount which essentially
           maps it MAP_SHARED

The internal mount is only used for shared mappings but there is very
little that stops it being used for private mappings. This patch extends
hugetlbfs_file_setup() to deal with the creation of files that will be
mapped MAP_PRIVATE on the internal hugetlbfs mount. This extended API is
used in a subsequent patch to implement the MAP_HUGETLB mmap() flag.

Signed-off-by: Eric Munson <ebmunson@us.ibm.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 21 +++++++++++++++++----
 include/linux/hugetlb.h | 12 ++++++++++--
 ipc/shm.c               |  2 +-
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a93b885311d8..06b7c2623f99 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -507,6 +507,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
+		/*
+		 * The policy is initialized here even if we are creating a
+		 * private inode because initialization simply creates an
+		 * an empty rb tree and calls spin_lock_init(), later when we
+		 * call mpol_free_shared_policy() it will just return because
+		 * the rb tree will still be empty.
+		 */
 		mpol_shared_policy_init(&info->policy, NULL);
 		switch (mode & S_IFMT) {
 		default:
@@ -931,13 +938,19 @@ static struct file_system_type hugetlbfs_fs_type = {
 
 static struct vfsmount *hugetlbfs_vfsmount;
 
-static int can_do_hugetlb_shm(void)
+static int can_do_hugetlb_shm(int creat_flags)
 {
-	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
+	if (creat_flags != HUGETLB_SHMFS_INODE)
+		return 0;
+	if (capable(CAP_IPC_LOCK))
+		return 1;
+	if (in_group_p(sysctl_hugetlb_shm_group))
+		return 1;
+	return 0;
 }
 
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
-						struct user_struct **user)
+				struct user_struct **user, int creat_flags)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -949,7 +962,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
-	if (!can_do_hugetlb_shm()) {
+	if (!can_do_hugetlb_shm(creat_flags)) {
 		*user = current_user();
 		if (user_shm_lock(size, *user)) {
 			WARN_ONCE(1,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e7f0fabfa1c2..f6505ad86657 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -112,6 +112,14 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 
 #endif /* !CONFIG_HUGETLB_PAGE */
 
+enum {
+	/*
+	 * The file will be used as an shm file so shmfs accounting rules
+	 * apply
+	 */
+	HUGETLB_SHMFS_INODE     = 1,
+};
+
 #ifdef CONFIG_HUGETLBFS
 struct hugetlbfs_config {
 	uid_t   uid;
@@ -150,7 +158,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, int acct,
-						struct user_struct **user);
+				struct user_struct **user, int creat_flags);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
 
@@ -172,7 +180,7 @@ static inline void set_file_hugepages(struct file *file)
 
 #define is_file_hugepages(file)			0
 #define set_file_hugepages(file)		BUG()
-#define hugetlb_file_setup(name,size,acct,user)	ERR_PTR(-ENOSYS)
+#define hugetlb_file_setup(name,size,acct,user,creat)	ERR_PTR(-ENOSYS)
 
 #endif /* !CONFIG_HUGETLBFS */
 
diff --git a/ipc/shm.c b/ipc/shm.c
index 30162a59621a..9eb1488b543b 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -370,7 +370,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 		if (shmflg & SHM_NORESERVE)
 			acctflag = VM_NORESERVE;
 		file = hugetlb_file_setup(name, size, acctflag,
-							&shp->mlock_user);
+					&shp->mlock_user, HUGETLB_SHMFS_INODE);
 	} else {
 		/*
 		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
-- 
cgit v1.2.3-71-gd317


From 4e52780d41a741fb4861ae1df2413dd816ec11b1 Mon Sep 17 00:00:00 2001
From: Eric B Munson <ebmunson@us.ibm.com>
Date: Mon, 21 Sep 2009 17:03:47 -0700
Subject: hugetlb: add MAP_HUGETLB for mmaping pseudo-anonymous huge page
 regions

Add a flag for mmap that will be used to request a huge page region that
will look like anonymous memory to userspace.  This is accomplished by
using a file on the internal vfsmount.  MAP_HUGETLB is a modifier of
MAP_ANONYMOUS and so must be specified with it.  The region will behave
the same as a MAP_ANONYMOUS region using small pages.

[akpm@linux-foundation.org: fix arch definitions of MAP_HUGETLB]
Signed-off-by: Eric B Munson <ebmunson@us.ibm.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  7 +++++++
 mm/mmap.c               | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f6505ad86657..176e7ee73eff 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -112,12 +112,19 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 
 #endif /* !CONFIG_HUGETLB_PAGE */
 
+#define HUGETLB_ANON_FILE "anon_hugepage"
+
 enum {
 	/*
 	 * The file will be used as an shm file so shmfs accounting rules
 	 * apply
 	 */
 	HUGETLB_SHMFS_INODE     = 1,
+	/*
+	 * The file is being created on the internal vfs mount and shmfs
+	 * accounting rules do not apply
+	 */
+	HUGETLB_ANONHUGE_INODE  = 2,
 };
 
 #ifdef CONFIG_HUGETLBFS
diff --git a/mm/mmap.c b/mm/mmap.c
index 1aeef6625e6a..21d4029a07b3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -949,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
+	if (flags & MAP_HUGETLB) {
+		struct user_struct *user = NULL;
+		if (file)
+			return -EINVAL;
+
+		/*
+		 * VM_NORESERVE is used because the reservations will be
+		 * taken when vm_ops->mmap() is called
+		 * A dummy user value is used because we are not locking
+		 * memory so no accounting is necessary
+		 */
+		len = ALIGN(len, huge_page_size(&default_hstate));
+		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+						&user, HUGETLB_ANONHUGE_INODE);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+	}
+
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
-- 
cgit v1.2.3-71-gd317


From 3d2d827f5ca5e32816194119d5c980c7e04474a6 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 21 Sep 2009 17:03:51 -0700
Subject: mm: move use_mm/unuse_mm from aio.c to mm/

Anyone who wants to do copy to/from user from a kernel thread, needs
use_mm (like what fs/aio has).  Move that into mm/, to make reusing and
exporting easier down the line, and make aio use it.  Next intended user,
besides aio, will be vhost-net.

Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c                    | 47 +-------------------------------------
 include/linux/mmu_context.h |  9 ++++++++
 mm/Makefile                 |  2 +-
 mm/mmu_context.c            | 55 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 47 deletions(-)
 create mode 100644 include/linux/mmu_context.h
 create mode 100644 mm/mmu_context.c

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..fc21c23b2387 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -24,6 +24,7 @@
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/mmu_context.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
@@ -34,7 +35,6 @@
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
-#include <asm/mmu_context.h>
 
 #if DEBUG > 1
 #define dprintk		printk
@@ -594,51 +594,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	return ret;
 }
 
-/*
- * use_mm
- *	Makes the calling kernel thread take on the specified
- *	mm context.
- *	Called by the retry thread execute retries within the
- *	iocb issuer's mm context, so that copy_from/to_user
- *	operations work seamlessly for aio.
- *	(Note: this routine is intended to be called only
- *	from a kernel thread context)
- */
-static void use_mm(struct mm_struct *mm)
-{
-	struct mm_struct *active_mm;
-	struct task_struct *tsk = current;
-
-	task_lock(tsk);
-	active_mm = tsk->active_mm;
-	atomic_inc(&mm->mm_count);
-	tsk->mm = mm;
-	tsk->active_mm = mm;
-	switch_mm(active_mm, mm, tsk);
-	task_unlock(tsk);
-
-	mmdrop(active_mm);
-}
-
-/*
- * unuse_mm
- *	Reverses the effect of use_mm, i.e. releases the
- *	specified mm context which was earlier taken on
- *	by the calling kernel thread
- *	(Note: this routine is intended to be called only
- *	from a kernel thread context)
- */
-static void unuse_mm(struct mm_struct *mm)
-{
-	struct task_struct *tsk = current;
-
-	task_lock(tsk);
-	tsk->mm = NULL;
-	/* active_mm is still 'mm' */
-	enter_lazy_tlb(mm, tsk);
-	task_unlock(tsk);
-}
-
 /*
  * Queue up a kiocb to be retried. Assumes that the kiocb
  * has already been marked as kicked, and places it on
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
new file mode 100644
index 000000000000..70fffeba7495
--- /dev/null
+++ b/include/linux/mmu_context.h
@@ -0,0 +1,9 @@
+#ifndef _LINUX_MMU_CONTEXT_H
+#define _LINUX_MMU_CONTEXT_H
+
+struct mm_struct;
+
+void use_mm(struct mm_struct *mm);
+void unuse_mm(struct mm_struct *mm);
+
+#endif
diff --git a/mm/Makefile b/mm/Makefile
index a63bf59a0c77..728a9fde49d1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o mm_init.o $(mmu-y)
+			   page_isolation.o mm_init.o mmu_context.o $(mmu-y)
 obj-y += init-mm.o
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 000000000000..fd473b51c903
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,55 @@
+/* Copyright (C) 2009 Red Hat, Inc.
+ *
+ * See ../COPYING for licensing terms.
+ */
+
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/sched.h>
+
+#include <asm/mmu_context.h>
+
+/*
+ * use_mm
+ *	Makes the calling kernel thread take on the specified
+ *	mm context.
+ *	Called by the retry thread execute retries within the
+ *	iocb issuer's mm context, so that copy_from/to_user
+ *	operations work seamlessly for aio.
+ *	(Note: this routine is intended to be called only
+ *	from a kernel thread context)
+ */
+void use_mm(struct mm_struct *mm)
+{
+	struct mm_struct *active_mm;
+	struct task_struct *tsk = current;
+
+	task_lock(tsk);
+	active_mm = tsk->active_mm;
+	atomic_inc(&mm->mm_count);
+	tsk->mm = mm;
+	tsk->active_mm = mm;
+	switch_mm(active_mm, mm, tsk);
+	task_unlock(tsk);
+
+	mmdrop(active_mm);
+}
+
+/*
+ * unuse_mm
+ *	Reverses the effect of use_mm, i.e. releases the
+ *	specified mm context which was earlier taken on
+ *	by the calling kernel thread
+ *	(Note: this routine is intended to be called only
+ *	from a kernel thread context)
+ */
+void unuse_mm(struct mm_struct *mm)
+{
+	struct task_struct *tsk = current;
+
+	task_lock(tsk);
+	tsk->mm = NULL;
+	/* active_mm is still 'mm' */
+	enter_lazy_tlb(mm, tsk);
+	task_unlock(tsk);
+}
-- 
cgit v1.2.3-71-gd317


From 69d25870f20c4b2563304f2b79c5300dd60a067e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Mon, 21 Sep 2009 17:04:08 -0700
Subject: cpuidle: fix the menu governor to boost IO performance

Fix the menu idle governor which balances power savings, energy efficiency
and performance impact.

The reason for a reworked governor is that there have been serious
performance issues reported with the existing code on Nehalem server
systems.

To show this I'm sure Andrew wants to see benchmark results:
(benchmark is "fio", "no cstates" is using "idle=poll")

		no cstates	current linux	new algorithm
1 disk		107 Mb/s	85 Mb/s		105 Mb/s
2 disks		215 Mb/s	123 Mb/s	209 Mb/s
12 disks	590 Mb/s	320 Mb/s	585 Mb/s

In various power benchmark measurements, no degredation was found by our
measurement&diagnostics team.  Obviously a small percentage more power was
used in the "fio" benchmark, due to the much higher performance.

While it would be a novel idea to describe the new algorithm in this
commit message, I cheaped out and described it in comments in the code
instead.

[changes since first post: spelling fixes from akpm, review feedback,
folded menu-tng into menu.c]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/cpuidle/governors/menu.c | 251 +++++++++++++++++++++++++++++++++------
 include/linux/sched.h            |   4 +
 kernel/sched.c                   |  13 ++
 3 files changed, 229 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f1df59f59a37..9f3d77532ab9 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -2,8 +2,12 @@
  * menu.c - the menu idle governor
  *
  * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
+ * Copyright (C) 2009 Intel Corporation
+ * Author:
+ *        Arjan van de Ven <arjan@linux.intel.com>
  *
- * This code is licenced under the GPL.
+ * This code is licenced under the GPL version 2 as described
+ * in the COPYING file that acompanies the Linux Kernel.
  */
 
 #include <linux/kernel.h>
@@ -13,20 +17,153 @@
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
+#include <linux/sched.h>
 
-#define BREAK_FUZZ	4	/* 4 us */
-#define PRED_HISTORY_PCT	50
+#define BUCKETS 12
+#define RESOLUTION 1024
+#define DECAY 4
+#define MAX_INTERESTING 50000
+
+/*
+ * Concepts and ideas behind the menu governor
+ *
+ * For the menu governor, there are 3 decision factors for picking a C
+ * state:
+ * 1) Energy break even point
+ * 2) Performance impact
+ * 3) Latency tolerance (from pmqos infrastructure)
+ * These these three factors are treated independently.
+ *
+ * Energy break even point
+ * -----------------------
+ * C state entry and exit have an energy cost, and a certain amount of time in
+ * the  C state is required to actually break even on this cost. CPUIDLE
+ * provides us this duration in the "target_residency" field. So all that we
+ * need is a good prediction of how long we'll be idle. Like the traditional
+ * menu governor, we start with the actual known "next timer event" time.
+ *
+ * Since there are other source of wakeups (interrupts for example) than
+ * the next timer event, this estimation is rather optimistic. To get a
+ * more realistic estimate, a correction factor is applied to the estimate,
+ * that is based on historic behavior. For example, if in the past the actual
+ * duration always was 50% of the next timer tick, the correction factor will
+ * be 0.5.
+ *
+ * menu uses a running average for this correction factor, however it uses a
+ * set of factors, not just a single factor. This stems from the realization
+ * that the ratio is dependent on the order of magnitude of the expected
+ * duration; if we expect 500 milliseconds of idle time the likelihood of
+ * getting an interrupt very early is much higher than if we expect 50 micro
+ * seconds of idle time. A second independent factor that has big impact on
+ * the actual factor is if there is (disk) IO outstanding or not.
+ * (as a special twist, we consider every sleep longer than 50 milliseconds
+ * as perfect; there are no power gains for sleeping longer than this)
+ *
+ * For these two reasons we keep an array of 12 independent factors, that gets
+ * indexed based on the magnitude of the expected duration as well as the
+ * "is IO outstanding" property.
+ *
+ * Limiting Performance Impact
+ * ---------------------------
+ * C states, especially those with large exit latencies, can have a real
+ * noticable impact on workloads, which is not acceptable for most sysadmins,
+ * and in addition, less performance has a power price of its own.
+ *
+ * As a general rule of thumb, menu assumes that the following heuristic
+ * holds:
+ *     The busier the system, the less impact of C states is acceptable
+ *
+ * This rule-of-thumb is implemented using a performance-multiplier:
+ * If the exit latency times the performance multiplier is longer than
+ * the predicted duration, the C state is not considered a candidate
+ * for selection due to a too high performance impact. So the higher
+ * this multiplier is, the longer we need to be idle to pick a deep C
+ * state, and thus the less likely a busy CPU will hit such a deep
+ * C state.
+ *
+ * Two factors are used in determing this multiplier:
+ * a value of 10 is added for each point of "per cpu load average" we have.
+ * a value of 5 points is added for each process that is waiting for
+ * IO on this CPU.
+ * (these values are experimentally determined)
+ *
+ * The load average factor gives a longer term (few seconds) input to the
+ * decision, while the iowait value gives a cpu local instantanious input.
+ * The iowait factor may look low, but realize that this is also already
+ * represented in the system load average.
+ *
+ */
 
 struct menu_device {
 	int		last_state_idx;
 
 	unsigned int	expected_us;
-	unsigned int	predicted_us;
-	unsigned int    current_predicted_us;
-	unsigned int	last_measured_us;
-	unsigned int	elapsed_us;
+	u64		predicted_us;
+	unsigned int	measured_us;
+	unsigned int	exit_us;
+	unsigned int	bucket;
+	u64		correction_factor[BUCKETS];
 };
 
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int get_loadavg(void)
+{
+	unsigned long this = this_cpu_load();
+
+
+	return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
+}
+
+static inline int which_bucket(unsigned int duration)
+{
+	int bucket = 0;
+
+	/*
+	 * We keep two groups of stats; one with no
+	 * IO pending, one without.
+	 * This allows us to calculate
+	 * E(duration)|iowait
+	 */
+	if (nr_iowait_cpu())
+		bucket = BUCKETS/2;
+
+	if (duration < 10)
+		return bucket;
+	if (duration < 100)
+		return bucket + 1;
+	if (duration < 1000)
+		return bucket + 2;
+	if (duration < 10000)
+		return bucket + 3;
+	if (duration < 100000)
+		return bucket + 4;
+	return bucket + 5;
+}
+
+/*
+ * Return a multiplier for the exit latency that is intended
+ * to take performance requirements into account.
+ * The more performance critical we estimate the system
+ * to be, the higher this multiplier, and thus the higher
+ * the barrier to go to an expensive C state.
+ */
+static inline int performance_multiplier(void)
+{
+	int mult = 1;
+
+	/* for higher loadavg, we are more reluctant */
+
+	mult += 2 * get_loadavg();
+
+	/* for IO wait tasks (per cpu!) we add 5x each */
+	mult += 10 * nr_iowait_cpu();
+
+	return mult;
+}
+
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
 
 /**
@@ -38,37 +175,59 @@ static int menu_select(struct cpuidle_device *dev)
 	struct menu_device *data = &__get_cpu_var(menu_devices);
 	int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
 	int i;
+	int multiplier;
+
+	data->last_state_idx = 0;
+	data->exit_us = 0;
 
 	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0)) {
-		data->last_state_idx = 0;
+	if (unlikely(latency_req == 0))
 		return 0;
-	}
 
-	/* determine the expected residency time */
+	/* determine the expected residency time, round up */
 	data->expected_us =
-		(u32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000;
+	    DIV_ROUND_UP((u32)ktime_to_ns(tick_nohz_get_sleep_length()), 1000);
+
+
+	data->bucket = which_bucket(data->expected_us);
+
+	multiplier = performance_multiplier();
+
+	/*
+	 * if the correction factor is 0 (eg first time init or cpu hotplug
+	 * etc), we actually want to start out with a unity factor.
+	 */
+	if (data->correction_factor[data->bucket] == 0)
+		data->correction_factor[data->bucket] = RESOLUTION * DECAY;
+
+	/* Make sure to round up for half microseconds */
+	data->predicted_us = DIV_ROUND_CLOSEST(
+		data->expected_us * data->correction_factor[data->bucket],
+		RESOLUTION * DECAY);
+
+	/*
+	 * We want to default to C1 (hlt), not to busy polling
+	 * unless the timer is happening really really soon.
+	 */
+	if (data->expected_us > 5)
+		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
 
-	/* Recalculate predicted_us based on prediction_history_pct */
-	data->predicted_us *= PRED_HISTORY_PCT;
-	data->predicted_us += (100 - PRED_HISTORY_PCT) *
-				data->current_predicted_us;
-	data->predicted_us /= 100;
 
 	/* find the deepest idle state that satisfies our constraints */
-	for (i = CPUIDLE_DRIVER_STATE_START + 1; i < dev->state_count; i++) {
+	for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) {
 		struct cpuidle_state *s = &dev->states[i];
 
-		if (s->target_residency > data->expected_us)
-			break;
 		if (s->target_residency > data->predicted_us)
 			break;
 		if (s->exit_latency > latency_req)
 			break;
+		if (s->exit_latency * multiplier > data->predicted_us)
+			break;
+		data->exit_us = s->exit_latency;
+		data->last_state_idx = i;
 	}
 
-	data->last_state_idx = i - 1;
-	return i - 1;
+	return data->last_state_idx;
 }
 
 /**
@@ -85,35 +244,49 @@ static void menu_reflect(struct cpuidle_device *dev)
 	unsigned int last_idle_us = cpuidle_get_last_residency(dev);
 	struct cpuidle_state *target = &dev->states[last_idx];
 	unsigned int measured_us;
+	u64 new_factor;
 
 	/*
 	 * Ugh, this idle state doesn't support residency measurements, so we
 	 * are basically lost in the dark.  As a compromise, assume we slept
-	 * for one full standard timer tick.  However, be aware that this
-	 * could potentially result in a suboptimal state transition.
+	 * for the whole expected time.
 	 */
 	if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID)))
-		last_idle_us = USEC_PER_SEC / HZ;
+		last_idle_us = data->expected_us;
+
+
+	measured_us = last_idle_us;
 
 	/*
-	 * measured_us and elapsed_us are the cumulative idle time, since the
-	 * last time we were woken out of idle by an interrupt.
+	 * We correct for the exit latency; we are assuming here that the
+	 * exit latency happens after the event that we're interested in.
 	 */
-	if (data->elapsed_us <= data->elapsed_us + last_idle_us)
-		measured_us = data->elapsed_us + last_idle_us;
+	if (measured_us > data->exit_us)
+		measured_us -= data->exit_us;
+
+
+	/* update our correction ratio */
+
+	new_factor = data->correction_factor[data->bucket]
+			* (DECAY - 1) / DECAY;
+
+	if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING)
+		new_factor += RESOLUTION * measured_us / data->expected_us;
 	else
-		measured_us = -1;
+		/*
+		 * we were idle so long that we count it as a perfect
+		 * prediction
+		 */
+		new_factor += RESOLUTION;
 
-	/* Predict time until next break event */
-	data->current_predicted_us = max(measured_us, data->last_measured_us);
+	/*
+	 * We don't want 0 as factor; we always want at least
+	 * a tiny bit of estimated time.
+	 */
+	if (new_factor == 0)
+		new_factor = 1;
 
-	if (last_idle_us + BREAK_FUZZ <
-	    data->expected_us - target->exit_latency) {
-		data->last_measured_us = measured_us;
-		data->elapsed_us = 0;
-	} else {
-		data->elapsed_us = measured_us;
-	}
+	data->correction_factor[data->bucket] = new_factor;
 }
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 17e9a8e9a51d..97b10da0a3ea 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -140,6 +140,10 @@ extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
+extern unsigned long nr_iowait_cpu(void);
+extern unsigned long this_cpu_load(void);
+
+
 extern void calc_global_load(void);
 extern u64 cpu_nr_migrations(int cpu);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 91843ba7f237..0ac9053c21d6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2904,6 +2904,19 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
+unsigned long nr_iowait_cpu(void)
+{
+	struct rq *this = this_rq();
+	return atomic_read(&this->nr_iowait);
+}
+
+unsigned long this_cpu_load(void)
+{
+	struct rq *this = this_rq();
+	return this->cpu_load[0];
+}
+
+
 /* Variables and functions for calc_load */
 static atomic_long_t calc_load_tasks;
 static unsigned long calc_load_update;
-- 
cgit v1.2.3-71-gd317


From e6de3988aa52debb25a427d085061f3bf1181d54 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 21 Sep 2009 17:04:30 -0700
Subject: flex_array: add flex_array_clear function

Add a new function to the flex_array API:

	int flex_array_clear(struct flex_array *fa,
				unsigned int element_nr)

This function will zero the element at element_nr in the flex_array.

Although this is equivalent to using flex_array_put() and passing a
pointer to zero'd memory, flex_array_clear() does not require such a
pointer to memory that would most likely need to be allocated on the
caller's stack which could be significantly large depending on
element_size.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/flex_array.h |  1 +
 lib/flex_array.c           | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index 45ff18491514..3887b21f883f 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -44,6 +44,7 @@ void flex_array_free(struct flex_array *fa);
 void flex_array_free_parts(struct flex_array *fa);
 int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
 		gfp_t flags);
+int flex_array_clear(struct flex_array *fa, unsigned int element_nr);
 void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
 
 #endif /* _FLEX_ARRAY_H */
diff --git a/lib/flex_array.c b/lib/flex_array.c
index 7baed2fc3bc8..b68f99be4080 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -206,6 +206,32 @@ int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
 	return 0;
 }
 
+/**
+ * flex_array_clear - clear element in array at @element_nr
+ * @element_nr:	index of the position to clear.
+ *
+ * Locking must be provided by the caller.
+ */
+int flex_array_clear(struct flex_array *fa, unsigned int element_nr)
+{
+	int part_nr = fa_element_to_part_nr(fa, element_nr);
+	struct flex_array_part *part;
+	void *dst;
+
+	if (element_nr >= fa->total_nr_elements)
+		return -ENOSPC;
+	if (elements_fit_in_base(fa))
+		part = (struct flex_array_part *)&fa->parts[0];
+	else {
+		part = fa->parts[part_nr];
+		if (!part)
+			return -EINVAL;
+	}
+	dst = &part->elements[index_inside_part(fa, element_nr)];
+	memset(dst, 0, fa->element_size);
+	return 0;
+}
+
 /**
  * flex_array_prealloc - guarantee that array space exists
  * @start:	index of first array element for which space is allocated
-- 
cgit v1.2.3-71-gd317


From 19da3dd157f8db6fe727ff268dab4791d55a6371 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 21 Sep 2009 17:04:31 -0700
Subject: flex_array: poison free elements

Newly initialized flex_array's and/or flex_array_part's are now poisoned
with a new poison value, FLEX_ARRAY_FREE.  It's value is similar to
POISON_FREE used in the various slab allocators, but is different to
distinguish between flex array's poisoned kmem and slab allocator poisoned
kmem.

This will allow us to identify flex_array_part's that only contain free
elements (and free them with an addition to the flex_array API).  This
could also be extended in the future to identify `get' uses on elements
that have not been `put'.

If __GFP_ZERO is passed for a part's gfp mask, the poisoning is avoided.
These elements are considered to be in-use since they have been
initialized.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poison.h |  3 +++
 lib/flex_array.c       | 15 +++++++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 6729f7dcd60e..7fc194aef8c2 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -65,6 +65,9 @@
 #define MUTEX_DEBUG_INIT	0x11
 #define MUTEX_DEBUG_FREE	0x22
 
+/********** lib/flex_array.c **********/
+#define FLEX_ARRAY_FREE	0x6c	/* for use-after-free poisoning */
+
 /********** security/ **********/
 #define KEY_DESTROY		0xbd
 
diff --git a/lib/flex_array.c b/lib/flex_array.c
index b68f99be4080..e22d0e9776aa 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -113,6 +113,8 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total,
 		return NULL;
 	ret->element_size = element_size;
 	ret->total_nr_elements = total;
+	if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO))
+		memset(ret->parts[0], FLEX_ARRAY_FREE, bytes_left_in_base());
 	return ret;
 }
 
@@ -159,15 +161,12 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags)
 {
 	struct flex_array_part *part = fa->parts[part_nr];
 	if (!part) {
-		/*
-		 * This leaves the part pages uninitialized
-		 * and with potentially random data, just
-		 * as if the user had kmalloc()'d the whole.
-		 * __GFP_ZERO can be used to zero it.
-		 */
-		part = kmalloc(FLEX_ARRAY_PART_SIZE, flags);
+		part = kmalloc(sizeof(struct flex_array_part), flags);
 		if (!part)
 			return NULL;
+		if (!(flags & __GFP_ZERO))
+			memset(part, FLEX_ARRAY_FREE,
+				sizeof(struct flex_array_part));
 		fa->parts[part_nr] = part;
 	}
 	return part;
@@ -228,7 +227,7 @@ int flex_array_clear(struct flex_array *fa, unsigned int element_nr)
 			return -EINVAL;
 	}
 	dst = &part->elements[index_inside_part(fa, element_nr)];
-	memset(dst, 0, fa->element_size);
+	memset(dst, FLEX_ARRAY_FREE, fa->element_size);
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From 4af5a2f770cc8575840ccb1514ec76ecb592985c Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 21 Sep 2009 17:04:31 -0700
Subject: flex_array: add flex_array_shrink function

Add a new function to the flex_array API:

	int flex_array_shrink(struct flex_array *fa)

This function will free all unused second-level pages.  Since elements are
now poisoned if they are not allocated with __GFP_ZERO, it's possible to
identify parts that consist solely of unused elements.

flex_array_shrink() returns the number of pages freed.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/flex_array.h |  1 +
 lib/flex_array.c           | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index 3887b21f883f..f12401e485fe 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -46,5 +46,6 @@ int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
 		gfp_t flags);
 int flex_array_clear(struct flex_array *fa, unsigned int element_nr);
 void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
+int flex_array_shrink(struct flex_array *fa);
 
 #endif /* _FLEX_ARRAY_H */
diff --git a/lib/flex_array.c b/lib/flex_array.c
index e22d0e9776aa..1b03bb553410 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -291,3 +291,43 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr)
 	}
 	return &part->elements[index_inside_part(fa, element_nr)];
 }
+
+static int part_is_free(struct flex_array_part *part)
+{
+	int i;
+
+	for (i = 0; i < sizeof(struct flex_array_part); i++)
+		if (part->elements[i] != FLEX_ARRAY_FREE)
+			return 0;
+	return 1;
+}
+
+/**
+ * flex_array_shrink - free unused second-level pages
+ *
+ * Frees all second-level pages that consist solely of unused
+ * elements.  Returns the number of pages freed.
+ *
+ * Locking must be provided by the caller.
+ */
+int flex_array_shrink(struct flex_array *fa)
+{
+	struct flex_array_part *part;
+	int max_part = nr_base_part_ptrs();
+	int part_nr;
+	int ret = 0;
+
+	if (elements_fit_in_base(fa))
+		return ret;
+	for (part_nr = 0; part_nr < max_part; part_nr++) {
+		part = fa->parts[part_nr];
+		if (!part)
+			continue;
+		if (part_is_free(part)) {
+			fa->parts[part_nr] = NULL;
+			kfree(part);
+			ret++;
+		}
+	}
+	return ret;
+}
-- 
cgit v1.2.3-71-gd317


From 45b588d6e5cc172704bac0c998ce54873b149b22 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 21 Sep 2009 17:04:33 -0700
Subject: flex_array: introduce DEFINE_FLEX_ARRAY

FLEX_ARRAY_INIT(element_size, total_nr_elements) cannot determine if
either parameter is valid, so flex arrays which are statically allocated
with this interface can easily become corrupted or reference beyond its
allocated memory.

This removes FLEX_ARRAY_INIT() as a struct flex_array initializer since no
initializer may perform the required checking.  Instead, the array is now
defined with a new interface:

	DEFINE_FLEX_ARRAY(name, element_size, total_nr_elements)

This may be prefixed with `static' for file scope.

This interface includes compile-time checking of the parameters to ensure
they are valid.  Since the validity of both element_size and
total_nr_elements depend on FLEX_ARRAY_BASE_SIZE and FLEX_ARRAY_PART_SIZE,
the kernel build will fail if either of these predefined values changes
such that the array parameters are no longer valid.

Since BUILD_BUG_ON() requires compile time constants, several of the
static inline functions that were once local to lib/flex_array.c had to be
moved to include/linux/flex_array.h.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/flex_array.h | 30 ++++++++++++++++++++++++++----
 lib/flex_array.c           | 36 ++++++++++--------------------------
 2 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index f12401e485fe..1d747f72298b 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -31,10 +31,32 @@ struct flex_array {
 	};
 };
 
-#define FLEX_ARRAY_INIT(size, total) { { {\
-	.element_size = (size),		\
-	.total_nr_elements = (total),	\
-} } }
+/* Number of bytes left in base struct flex_array, excluding metadata */
+#define FLEX_ARRAY_BASE_BYTES_LEFT					\
+	(FLEX_ARRAY_BASE_SIZE - offsetof(struct flex_array, parts))
+
+/* Number of pointers in base to struct flex_array_part pages */
+#define FLEX_ARRAY_NR_BASE_PTRS						\
+	(FLEX_ARRAY_BASE_BYTES_LEFT / sizeof(struct flex_array_part *))
+
+/* Number of elements of size that fit in struct flex_array_part */
+#define FLEX_ARRAY_ELEMENTS_PER_PART(size)				\
+	(FLEX_ARRAY_PART_SIZE / size)
+
+/*
+ * Defines a statically allocated flex array and ensures its parameters are
+ * valid.
+ */
+#define DEFINE_FLEX_ARRAY(__arrayname, __element_size, __total)		\
+	struct flex_array __arrayname = { { {				\
+		.element_size = (__element_size),			\
+		.total_nr_elements = (__total),				\
+	} } };								\
+	static inline void __arrayname##_invalid_parameter(void)	\
+	{								\
+		BUILD_BUG_ON((__total) > FLEX_ARRAY_NR_BASE_PTRS *	\
+			FLEX_ARRAY_ELEMENTS_PER_PART(__element_size));	\
+	}
 
 struct flex_array *flex_array_alloc(int element_size, unsigned int total,
 		gfp_t flags);
diff --git a/lib/flex_array.c b/lib/flex_array.c
index 1b03bb553410..b62ce6cffd0a 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -28,23 +28,6 @@ struct flex_array_part {
 	char elements[FLEX_ARRAY_PART_SIZE];
 };
 
-static inline int __elements_per_part(int element_size)
-{
-	return FLEX_ARRAY_PART_SIZE / element_size;
-}
-
-static inline int bytes_left_in_base(void)
-{
-	int element_offset = offsetof(struct flex_array, parts);
-	int bytes_left = FLEX_ARRAY_BASE_SIZE - element_offset;
-	return bytes_left;
-}
-
-static inline int nr_base_part_ptrs(void)
-{
-	return bytes_left_in_base() / sizeof(struct flex_array_part *);
-}
-
 /*
  * If a user requests an allocation which is small
  * enough, we may simply use the space in the
@@ -54,7 +37,7 @@ static inline int nr_base_part_ptrs(void)
 static inline int elements_fit_in_base(struct flex_array *fa)
 {
 	int data_size = fa->element_size * fa->total_nr_elements;
-	if (data_size <= bytes_left_in_base())
+	if (data_size <= FLEX_ARRAY_BASE_BYTES_LEFT)
 		return 1;
 	return 0;
 }
@@ -103,7 +86,8 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total,
 					gfp_t flags)
 {
 	struct flex_array *ret;
-	int max_size = nr_base_part_ptrs() * __elements_per_part(element_size);
+	int max_size = FLEX_ARRAY_NR_BASE_PTRS *
+				FLEX_ARRAY_ELEMENTS_PER_PART(element_size);
 
 	/* max_size will end up 0 if element_size > PAGE_SIZE */
 	if (total > max_size)
@@ -114,14 +98,15 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total,
 	ret->element_size = element_size;
 	ret->total_nr_elements = total;
 	if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO))
-		memset(ret->parts[0], FLEX_ARRAY_FREE, bytes_left_in_base());
+		memset(ret->parts[0], FLEX_ARRAY_FREE,
+						FLEX_ARRAY_BASE_BYTES_LEFT);
 	return ret;
 }
 
 static int fa_element_to_part_nr(struct flex_array *fa,
 					unsigned int element_nr)
 {
-	return element_nr / __elements_per_part(fa->element_size);
+	return element_nr / FLEX_ARRAY_ELEMENTS_PER_PART(fa->element_size);
 }
 
 /**
@@ -133,11 +118,10 @@ static int fa_element_to_part_nr(struct flex_array *fa,
 void flex_array_free_parts(struct flex_array *fa)
 {
 	int part_nr;
-	int max_part = nr_base_part_ptrs();
 
 	if (elements_fit_in_base(fa))
 		return;
-	for (part_nr = 0; part_nr < max_part; part_nr++)
+	for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++)
 		kfree(fa->parts[part_nr]);
 }
 
@@ -152,7 +136,8 @@ static unsigned int index_inside_part(struct flex_array *fa,
 {
 	unsigned int part_offset;
 
-	part_offset = element_nr % __elements_per_part(fa->element_size);
+	part_offset = element_nr %
+				FLEX_ARRAY_ELEMENTS_PER_PART(fa->element_size);
 	return part_offset * fa->element_size;
 }
 
@@ -313,13 +298,12 @@ static int part_is_free(struct flex_array_part *part)
 int flex_array_shrink(struct flex_array *fa)
 {
 	struct flex_array_part *part;
-	int max_part = nr_base_part_ptrs();
 	int part_nr;
 	int ret = 0;
 
 	if (elements_fit_in_base(fa))
 		return ret;
-	for (part_nr = 0; part_nr < max_part; part_nr++) {
+	for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) {
 		part = fa->parts[part_nr];
 		if (!part)
 			continue;
-- 
cgit v1.2.3-71-gd317


From 0ec48915e8bbb37dea3df85c41e4c3498b95664b Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Mon, 21 Sep 2009 17:04:42 -0700
Subject: lis3: fix typo

Bit 0x80 in CTRL_REG3 is an ACTIVE_LOW rather than an ACTIVE_HIGH
function, I got that wrong during my last change.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Eric Piel <eric.piel@tremplin-utc.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lis3lv02d.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h
index ad651f4e45ac..113778b5df36 100644
--- a/include/linux/lis3lv02d.h
+++ b/include/linux/lis3lv02d.h
@@ -32,7 +32,7 @@ struct lis3lv02d_platform_data {
 #define LIS3_IRQ2_DATA_READY	(4 << 3)
 #define LIS3_IRQ2_CLICK		(7 << 3)
 #define LIS3_IRQ_OPEN_DRAIN	(1 << 6)
-#define LIS3_IRQ_ACTIVE_HIGH	(1 << 7)
+#define LIS3_IRQ_ACTIVE_LOW	(1 << 7)
 	unsigned char irq_cfg;
 };
 
-- 
cgit v1.2.3-71-gd317


From 8873c33483e62988ed886230aab71ef4c678f710 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Mon, 21 Sep 2009 17:04:43 -0700
Subject: lis3: add free-fall/wakeup function via platform_data

This offers a way for platforms to define flags and thresholds for the
free-fall/wakeup functions of the lis302d chips.

More registers needed to be seperated as they are specific to the

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Eric Piel <eric.piel@tremplin-utc.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/lis3lv02d.c |  9 +++++++++
 drivers/hwmon/lis3lv02d.h | 24 ++++++++++++++++++------
 include/linux/lis3lv02d.h |  9 +++++++++
 3 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c
index 271338bdb6be..cf5afb9a10ab 100644
--- a/drivers/hwmon/lis3lv02d.c
+++ b/drivers/hwmon/lis3lv02d.c
@@ -454,6 +454,15 @@ int lis3lv02d_init_device(struct lis3lv02d *dev)
 					(p->click_thresh_y << 4));
 		}
 
+		if (p->wakeup_flags && (dev->whoami == LIS_SINGLE_ID)) {
+			dev->write(dev, FF_WU_CFG_1, p->wakeup_flags);
+			dev->write(dev, FF_WU_THS_1, p->wakeup_thresh & 0x7f);
+			/* default to 2.5ms for now */
+			dev->write(dev, FF_WU_DURATION_1, 1);
+			/* enable high pass filter for both free-fall units */
+			dev->write(dev, CTRL_REG2, HP_FF_WU1 | HP_FF_WU2);
+		}
+
 		if (p->irq_cfg)
 			dev->write(dev, CTRL_REG3, p->irq_cfg);
 	}
diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h
index e320e2f511f1..3e1ff46f72d3 100644
--- a/drivers/hwmon/lis3lv02d.h
+++ b/drivers/hwmon/lis3lv02d.h
@@ -58,15 +58,17 @@ enum lis3_reg {
 	OUTZ_L		= 0x2C,
 	OUTZ_H		= 0x2D,
 	OUTZ		= 0x2D,
-	FF_WU_CFG	= 0x30,
-	FF_WU_SRC	= 0x31,
-	FF_WU_ACK	= 0x32,
-	FF_WU_THS_L	= 0x34,
-	FF_WU_THS_H	= 0x35,
-	FF_WU_DURATION	= 0x36,
 };
 
 enum lis302d_reg {
+	FF_WU_CFG_1	= 0x30,
+	FF_WU_SRC_1	= 0x31,
+	FF_WU_THS_1	= 0x32,
+	FF_WU_DURATION_1 = 0x33,
+	FF_WU_CFG_2	= 0x34,
+	FF_WU_SRC_2	= 0x35,
+	FF_WU_THS_2	= 0x36,
+	FF_WU_DURATION_2 = 0x37,
 	CLICK_CFG	= 0x38,
 	CLICK_SRC	= 0x39,
 	CLICK_THSY_X	= 0x3B,
@@ -77,6 +79,12 @@ enum lis302d_reg {
 };
 
 enum lis3lv02d_reg {
+	FF_WU_CFG	= 0x30,
+	FF_WU_SRC	= 0x31,
+	FF_WU_ACK	= 0x32,
+	FF_WU_THS_L	= 0x34,
+	FF_WU_THS_H	= 0x35,
+	FF_WU_DURATION	= 0x36,
 	DD_CFG		= 0x38,
 	DD_SRC		= 0x39,
 	DD_ACK		= 0x3A,
@@ -107,6 +115,10 @@ enum lis3lv02d_ctrl2 {
 	CTRL2_FS	= 0x80, /* Full Scale selection */
 };
 
+enum lis302d_ctrl2 {
+	HP_FF_WU2	= 0x08,
+	HP_FF_WU1	= 0x04,
+};
 
 enum lis3lv02d_ctrl3 {
 	CTRL3_CFS0	= 0x01,
diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h
index 113778b5df36..3cc2f2c53e4c 100644
--- a/include/linux/lis3lv02d.h
+++ b/include/linux/lis3lv02d.h
@@ -34,6 +34,15 @@ struct lis3lv02d_platform_data {
 #define LIS3_IRQ_OPEN_DRAIN	(1 << 6)
 #define LIS3_IRQ_ACTIVE_LOW	(1 << 7)
 	unsigned char irq_cfg;
+
+#define LIS3_WAKEUP_X_LO	(1 << 0)
+#define LIS3_WAKEUP_X_HI	(1 << 1)
+#define LIS3_WAKEUP_Y_LO	(1 << 2)
+#define LIS3_WAKEUP_Y_HI	(1 << 3)
+#define LIS3_WAKEUP_Z_LO	(1 << 4)
+#define LIS3_WAKEUP_Z_HI	(1 << 5)
+	unsigned char wakeup_flags;
+	unsigned char wakeup_thresh;
 };
 
 #endif /* __LIS3LV02D_H_ */
-- 
cgit v1.2.3-71-gd317


From abd6633c67925f90775bb74755f9c547e30f1f20 Mon Sep 17 00:00:00 2001
From: David Härdeman <david@hardeman.nu>
Date: Mon, 21 Sep 2009 17:04:52 -0700
Subject: pnp: add a shutdown method to pnp drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The shutdown method is used by the winbond cir driver to setup the
hardware for wake-from-S5.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: David Härdeman <david@hardeman.nu>
Cc: Dmitry Torokhov <dtor@mail.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pnp/driver.c | 10 ++++++++++
 include/linux/pnp.h  |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index 527ee764c93f..cd11b113494f 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -135,6 +135,15 @@ static int pnp_device_remove(struct device *dev)
 	return 0;
 }
 
+static void pnp_device_shutdown(struct device *dev)
+{
+	struct pnp_dev *pnp_dev = to_pnp_dev(dev);
+	struct pnp_driver *drv = pnp_dev->driver;
+
+	if (drv && drv->shutdown)
+		drv->shutdown(pnp_dev);
+}
+
 static int pnp_bus_match(struct device *dev, struct device_driver *drv)
 {
 	struct pnp_dev *pnp_dev = to_pnp_dev(dev);
@@ -203,6 +212,7 @@ struct bus_type pnp_bus_type = {
 	.match   = pnp_bus_match,
 	.probe   = pnp_device_probe,
 	.remove  = pnp_device_remove,
+	.shutdown = pnp_device_shutdown,
 	.suspend = pnp_bus_suspend,
 	.resume  = pnp_bus_resume,
 	.dev_attrs = pnp_interface_attrs,
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index b063c7328ba5..fddfafaed024 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -360,6 +360,7 @@ struct pnp_driver {
 	unsigned int flags;
 	int (*probe) (struct pnp_dev *dev, const struct pnp_device_id *dev_id);
 	void (*remove) (struct pnp_dev *dev);
+	void (*shutdown) (struct pnp_dev *dev);
 	int (*suspend) (struct pnp_dev *dev, pm_message_t state);
 	int (*resume) (struct pnp_dev *dev);
 	struct device_driver driver;
-- 
cgit v1.2.3-71-gd317


From 7c329288d72e025db4feac65f0fed95fb3e3ef1c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 23 Sep 2009 09:52:18 +1000
Subject: vgaarb: make client interface config invariant.

Fixes build when VGA_ARB is off.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/vgaarb.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index e81c64af80c1..923f9040ea20 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -41,7 +41,7 @@
  *     interrupts at any time.
  */
 extern void vga_set_legacy_decoding(struct pci_dev *pdev,
-									unsigned int decodes);
+				    unsigned int decodes);
 
 /**
  *     vga_get         - acquire & locks VGA resources
@@ -193,8 +193,17 @@ static inline int vga_conflicts(struct pci_dev *p1, struct pci_dev *p2)
  * They driver will get a callback when VGA arbitration is first used
  * by userspace since we some older X servers have issues.
  */
+#if defined(CONFIG_VGA_ARB)
 int vga_client_register(struct pci_dev *pdev, void *cookie,
 			void (*irq_set_state)(void *cookie, bool state),
 			unsigned int (*set_vga_decode)(void *cookie, bool state));
+#else
+static inline int vga_client_register(struct pci_dev *pdev, void *cookie,
+				      void (*irq_set_state)(void *cookie, bool state),
+				      unsigned int (*set_vga_decode)(void *cookie, bool state))
+{
+	return 0;
+}
+#endif
 
 #endif /* LINUX_VGA_H */
-- 
cgit v1.2.3-71-gd317


From 3c1b27d5043086a485f8526353ae9fe37bfa1065 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 23 Sep 2009 22:26:31 -0600
Subject: virtio: make add_buf return capacity remaining

This API change means that virtio_net can tell how much capacity
remains for buffers.  It's necessarily fuzzy, since
VIRTIO_RING_F_INDIRECT_DESC means we can fit any number of descriptors
in one, *if* we can kmalloc.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Dinesh Subhraveti <dineshs@us.ibm.com>
---
 drivers/block/virtio_blk.c          |  2 +-
 drivers/char/hw_random/virtio-rng.c |  2 +-
 drivers/char/virtio_console.c       |  4 ++--
 drivers/net/virtio_net.c            | 14 +++++++-------
 drivers/virtio/virtio_balloon.c     |  2 +-
 drivers/virtio/virtio_ring.c        |  6 +++++-
 include/linux/virtio.h              |  2 +-
 net/9p/trans_virtio.c               |  2 +-
 8 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index aa1a3d5a3e2b..d739ee4201f9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -139,7 +139,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
+	if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 32216b623248..b6c24dcc987d 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -51,7 +51,7 @@ static void register_buffer(void)
 
 	sg_init_one(&sg, random_data+data_left, RANDOM_DATA_SIZE-data_left);
 	/* There should always be room for one buffer. */
-	if (vq->vq_ops->add_buf(vq, &sg, 0, 1, random_data) != 0)
+	if (vq->vq_ops->add_buf(vq, &sg, 0, 1, random_data) < 0)
 		BUG();
 	vq->vq_ops->kick(vq);
 }
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index c74dacfa6795..a035ae39a359 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -65,7 +65,7 @@ static int put_chars(u32 vtermno, const char *buf, int count)
 
 	/* add_buf wants a token to identify this buffer: we hand it any
 	 * non-NULL pointer, since there's only ever one buffer. */
-	if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) {
+	if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) >= 0) {
 		/* Tell Host to go! */
 		out_vq->vq_ops->kick(out_vq);
 		/* Chill out until it's done with the buffer. */
@@ -85,7 +85,7 @@ static void add_inbuf(void)
 	sg_init_one(sg, inbuf, PAGE_SIZE);
 
 	/* We should always be able to add one buffer to an empty queue. */
-	if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0)
+	if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) < 0)
 		BUG();
 	in_vq->vq_ops->kick(in_vq);
 }
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 32266fb89c20..fbf04a553f5a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -320,7 +320,7 @@ static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
 		skb_queue_head(&vi->recv, skb);
 
 		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
-		if (err) {
+		if (err < 0) {
 			skb_unlink(skb, &vi->recv);
 			trim_pages(vi, skb);
 			kfree_skb(skb);
@@ -373,7 +373,7 @@ static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 		skb_queue_head(&vi->recv, skb);
 
 		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
-		if (err) {
+		if (err < 0) {
 			skb_unlink(skb, &vi->recv);
 			kfree_skb(skb);
 			break;
@@ -527,7 +527,7 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 	num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
 
 	err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb);
-	if (!err && !vi->free_in_tasklet)
+	if (err >= 0 && !vi->free_in_tasklet)
 		mod_timer(&vi->xmit_free_timer, jiffies + (HZ/10));
 
 	return err;
@@ -538,7 +538,7 @@ static void xmit_tasklet(unsigned long data)
 	struct virtnet_info *vi = (void *)data;
 
 	netif_tx_lock_bh(vi->dev);
-	if (vi->last_xmit_skb && xmit_skb(vi, vi->last_xmit_skb) == 0) {
+	if (vi->last_xmit_skb && xmit_skb(vi, vi->last_xmit_skb) >= 0) {
 		vi->svq->vq_ops->kick(vi->svq);
 		vi->last_xmit_skb = NULL;
 	}
@@ -557,7 +557,7 @@ again:
 
 	/* If we has a buffer left over from last time, send it now. */
 	if (unlikely(vi->last_xmit_skb) &&
-	    xmit_skb(vi, vi->last_xmit_skb) != 0)
+	    xmit_skb(vi, vi->last_xmit_skb) < 0)
 		goto stop_queue;
 
 	vi->last_xmit_skb = NULL;
@@ -565,7 +565,7 @@ again:
 	/* Put new one in send queue and do transmit */
 	if (likely(skb)) {
 		__skb_queue_head(&vi->send, skb);
-		if (xmit_skb(vi, skb) != 0) {
+		if (xmit_skb(vi, skb) < 0) {
 			vi->last_xmit_skb = skb;
 			skb = NULL;
 			goto stop_queue;
@@ -668,7 +668,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 		sg_set_buf(&sg[i + 1], sg_virt(s), s->length);
 	sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
 
-	BUG_ON(vi->cvq->vq_ops->add_buf(vi->cvq, sg, out, in, vi));
+	BUG_ON(vi->cvq->vq_ops->add_buf(vi->cvq, sg, out, in, vi) < 0);
 
 	vi->cvq->vq_ops->kick(vi->cvq);
 
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 26b278264796..39789232646d 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -84,7 +84,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 	init_completion(&vb->acked);
 
 	/* We should always be able to add one buffer to an empty queue. */
-	if (vq->vq_ops->add_buf(vq, &sg, 1, 0, vb) != 0)
+	if (vq->vq_ops->add_buf(vq, &sg, 1, 0, vb) < 0)
 		BUG();
 	vq->vq_ops->kick(vq);
 
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a882f2606515..f53600580726 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -208,7 +208,11 @@ add_head:
 
 	pr_debug("Added buffer head %i to %p\n", head, vq);
 	END_USE(vq);
-	return 0;
+
+	/* If we're indirect, we can fit many (assuming not OOM). */
+	if (vq->indirect)
+		return vq->num_free ? vq->vring.num : 0;
+	return vq->num_free;
 }
 
 static void vring_kick(struct virtqueue *_vq)
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 4fca4f5440ba..057a2e010758 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -34,7 +34,7 @@ struct virtqueue {
  *	out_num: the number of sg readable by other side
  *	in_num: the number of sg which are writable (after readable ones)
  *	data: the token identifying the buffer.
- *      Returns 0 or an error.
+ *      Returns remaining capacity of queue (sg segments) or a negative error.
  * @kick: update after add_buf
  *	vq: the struct virtqueue
  *	After one or more add_buf calls, invoke this to kick the other side.
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 9bf0b737aa51..53c139d31a21 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -200,7 +200,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
 
 	req->status = REQ_STATUS_SENT;
 
-	if (chan->vq->vq_ops->add_buf(chan->vq, chan->sg, out, in, req->tc)) {
+	if (chan->vq->vq_ops->add_buf(chan->vq, chan->sg, out, in, req->tc) < 0) {
 		P9_DPRINTK(P9_DEBUG_TRANS,
 			"9p debug: virtio rpc add_buf returned failure");
 		return -EIO;
-- 
cgit v1.2.3-71-gd317


From 3ca4f5ca73057a617f9444a91022d7127041970a Mon Sep 17 00:00:00 2001
From: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Date: Fri, 31 Jul 2009 15:25:56 +0900
Subject: virtio: add virtio IDs file

Virtio IDs are spread all over the tree which makes assigning new IDs
bothersome. Putting them together should make the process less error-prone.

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c       |  1 +
 drivers/block/virtio_blk.c          |  1 +
 drivers/char/hw_random/virtio-rng.c |  1 +
 drivers/char/virtio_console.c       |  1 +
 drivers/net/virtio_net.c            |  1 +
 drivers/virtio/virtio_balloon.c     |  1 +
 include/linux/virtio_9p.h           |  2 --
 include/linux/virtio_balloon.h      |  3 ---
 include/linux/virtio_blk.h          |  3 ---
 include/linux/virtio_console.h      |  3 ---
 include/linux/virtio_ids.h          | 17 +++++++++++++++++
 include/linux/virtio_net.h          |  3 ---
 include/linux/virtio_rng.h          |  3 ---
 net/9p/trans_virtio.c               |  1 +
 14 files changed, 24 insertions(+), 17 deletions(-)
 create mode 100644 include/linux/virtio_ids.h

(limited to 'include/linux')

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 950cde6d6e58..84bbb190bf7b 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -42,6 +42,7 @@
 #include <signal.h>
 #include "linux/lguest_launcher.h"
 #include "linux/virtio_config.h"
+#include <linux/virtio_ids.h>
 #include "linux/virtio_net.h"
 #include "linux/virtio_blk.h"
 #include "linux/virtio_console.h"
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index d739ee4201f9..73de7532fcf5 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -3,6 +3,7 @@
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
 #include <linux/virtio.h>
+#include <linux/virtio_ids.h>
 #include <linux/virtio_blk.h>
 #include <linux/scatterlist.h>
 
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index b6c24dcc987d..962968f05b94 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -21,6 +21,7 @@
 #include <linux/scatterlist.h>
 #include <linux/spinlock.h>
 #include <linux/virtio.h>
+#include <linux/virtio_ids.h>
 #include <linux/virtio_rng.h>
 
 /* The host will fill any buffer we give it with sweet, sweet randomness.  We
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index a035ae39a359..0d328b59568d 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -31,6 +31,7 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/virtio.h>
+#include <linux/virtio_ids.h>
 #include <linux/virtio_console.h>
 #include "hvc_console.h"
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fbf04a553f5a..5c498d2b043f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include <linux/ethtool.h>
 #include <linux/module.h>
 #include <linux/virtio.h>
+#include <linux/virtio_ids.h>
 #include <linux/virtio_net.h>
 #include <linux/scatterlist.h>
 #include <linux/if_vlan.h>
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 39789232646d..200c22f55130 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -19,6 +19,7 @@
  */
 //#define DEBUG
 #include <linux/virtio.h>
+#include <linux/virtio_ids.h>
 #include <linux/virtio_balloon.h>
 #include <linux/swap.h>
 #include <linux/kthread.h>
diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h
index b3c4a60ceeb3..ea7226a45acb 100644
--- a/include/linux/virtio_9p.h
+++ b/include/linux/virtio_9p.h
@@ -4,8 +4,6 @@
  * compatible drivers/servers. */
 #include <linux/virtio_config.h>
 
-/* The ID for virtio console */
-#define VIRTIO_ID_9P	9
 /* Maximum number of virtio channels per partition (1 for now) */
 #define MAX_9P_CHAN	1
 
diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h
index 8726ff77763e..09d730085060 100644
--- a/include/linux/virtio_balloon.h
+++ b/include/linux/virtio_balloon.h
@@ -4,9 +4,6 @@
  * compatible drivers/servers. */
 #include <linux/virtio_config.h>
 
-/* The ID for virtio_balloon */
-#define VIRTIO_ID_BALLOON	5
-
 /* The feature bitmap for virtio balloon */
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 8dab9f2b8832..25fbabfa90d4 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -5,9 +5,6 @@
 #include <linux/types.h>
 #include <linux/virtio_config.h>
 
-/* The ID for virtio_block */
-#define VIRTIO_ID_BLOCK	2
-
 /* Feature bits */
 #define VIRTIO_BLK_F_BARRIER	0	/* Does host support barriers? */
 #define VIRTIO_BLK_F_SIZE_MAX	1	/* Indicates maximum segment size */
diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h
index dc161115ae35..b5f519806014 100644
--- a/include/linux/virtio_console.h
+++ b/include/linux/virtio_console.h
@@ -5,9 +5,6 @@
 /* This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
  * anyone can use the definitions to implement compatible drivers/servers. */
 
-/* The ID for virtio console */
-#define VIRTIO_ID_CONSOLE	3
-
 /* Feature bits */
 #define VIRTIO_CONSOLE_F_SIZE	0	/* Does host provide console size? */
 
diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h
new file mode 100644
index 000000000000..06660c0a78d7
--- /dev/null
+++ b/include/linux/virtio_ids.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_VIRTIO_IDS_H
+#define _LINUX_VIRTIO_IDS_H
+/*
+ * Virtio IDs
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ */
+
+#define VIRTIO_ID_NET		1 /* virtio net */
+#define VIRTIO_ID_BLOCK		2 /* virtio block */
+#define VIRTIO_ID_CONSOLE	3 /* virtio console */
+#define VIRTIO_ID_RNG		4 /* virtio ring */
+#define VIRTIO_ID_BALLOON	5 /* virtio balloon */
+#define VIRTIO_ID_9P		9 /* 9p virtio console */
+
+#endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index d8dd539c9f48..1f41734bbb77 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -6,9 +6,6 @@
 #include <linux/virtio_config.h>
 #include <linux/if_ether.h>
 
-/* The ID for virtio_net */
-#define VIRTIO_ID_NET	1
-
 /* The feature bitmap for virtio net */
 #define VIRTIO_NET_F_CSUM	0	/* Host handles pkts w/ partial csum */
 #define VIRTIO_NET_F_GUEST_CSUM	1	/* Guest handles pkts w/ partial csum */
diff --git a/include/linux/virtio_rng.h b/include/linux/virtio_rng.h
index 1a85dab8a940..48121c3c434b 100644
--- a/include/linux/virtio_rng.h
+++ b/include/linux/virtio_rng.h
@@ -4,7 +4,4 @@
  * compatible drivers/servers. */
 #include <linux/virtio_config.h>
 
-/* The ID for virtio_rng */
-#define VIRTIO_ID_RNG	4
-
 #endif /* _LINUX_VIRTIO_RNG_H */
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index ea1e3daabefe..b2e07f0dd298 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -43,6 +43,7 @@
 #include <net/9p/transport.h>
 #include <linux/scatterlist.h>
 #include <linux/virtio.h>
+#include <linux/virtio_ids.h>
 #include <linux/virtio_9p.h>
 
 #define VIRTQUEUE_NUM	128
-- 
cgit v1.2.3-71-gd317


From f1b0ef062602713c2c7cfa12362d5d90ed01c5f6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 17 Sep 2009 19:57:42 +0200
Subject: virtio_blk: add support for cache flush

Recent qemu has added a VIRTIO_BLK_F_FLUSH flag to advertise that the
virtual disk has a volatile write cache that needs to be flushed.  In case
we see this feature implement tell the Linux block layer about the fact
and use the new VIRTIO_BLK_T_FLUSH to flush the cache when required.  This
allows for an correct and simple implementation of write barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/block/virtio_blk.c | 30 +++++++++++++++++++++++++-----
 include/linux/virtio_blk.h | 15 +++++++++++++++
 2 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 73de7532fcf5..3d5fe97569c7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -92,15 +92,26 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		return false;
 
 	vbr->req = req;
-	if (blk_fs_request(vbr->req)) {
+	switch (req->cmd_type) {
+	case REQ_TYPE_FS:
 		vbr->out_hdr.type = 0;
 		vbr->out_hdr.sector = blk_rq_pos(vbr->req);
 		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
-	} else if (blk_pc_request(vbr->req)) {
+		break;
+	case REQ_TYPE_BLOCK_PC:
 		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
 		vbr->out_hdr.sector = 0;
 		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
-	} else {
+		break;
+	case REQ_TYPE_LINUX_BLOCK:
+		if (req->cmd[0] == REQ_LB_OP_FLUSH) {
+			vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
+			vbr->out_hdr.sector = 0;
+			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+			break;
+		}
+		/*FALLTHRU*/
+	default:
 		/* We don't put anything else in the queue. */
 		BUG();
 	}
@@ -200,6 +211,12 @@ out:
 	return err;
 }
 
+static void virtblk_prepare_flush(struct request_queue *q, struct request *req)
+{
+	req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+	req->cmd[0] = REQ_LB_OP_FLUSH;
+}
+
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 			 unsigned cmd, unsigned long data)
 {
@@ -338,7 +355,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	index++;
 
 	/* If barriers are supported, tell block layer that queue is ordered */
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+		blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_DRAIN_FLUSH,
+				  virtblk_prepare_flush);
+	else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
 		blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
 
 	/* If disk is read-only in the host, the guest should obey */
@@ -425,7 +445,7 @@ static struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
 	VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-	VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY
+	VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY, VIRTIO_BLK_F_FLUSH
 };
 
 /*
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 25fbabfa90d4..15cb666581d7 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -14,6 +14,7 @@
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
 #define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
 #define VIRTIO_BLK_F_IDENTIFY	8	/* ATA IDENTIFY supported */
+#define VIRTIO_BLK_F_FLUSH	9	/* Cache flush command support */
 
 #define VIRTIO_BLK_ID_BYTES	(sizeof(__u16[256]))	/* IDENTIFY DATA */
 
@@ -35,6 +36,17 @@ struct virtio_blk_config {
 	__u8 identify[VIRTIO_BLK_ID_BYTES];
 } __attribute__((packed));
 
+/*
+ * Command types
+ *
+ * Usage is a bit tricky as some bits are used as flags and some are not.
+ *
+ * Rules:
+ *   VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or
+ *   VIRTIO_BLK_T_BARRIER.  VIRTIO_BLK_T_FLUSH is a command of its own
+ *   and may not be combined with any of the other flags.
+ */
+
 /* These two define direction. */
 #define VIRTIO_BLK_T_IN		0
 #define VIRTIO_BLK_T_OUT	1
@@ -42,6 +54,9 @@ struct virtio_blk_config {
 /* This bit says it's a scsi command, not an actual read or write. */
 #define VIRTIO_BLK_T_SCSI_CMD	2
 
+/* Cache flush command */
+#define VIRTIO_BLK_T_FLUSH	4
+
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
 
-- 
cgit v1.2.3-71-gd317


From 4e9e92003529e5c7bb11281f7c2c9b3fe8858403 Mon Sep 17 00:00:00 2001
From: Pete Zaitcev <zaitcev@redhat.com>
Date: Thu, 11 Jun 2009 08:53:20 -0600
Subject: USB: usbmon: end ugly tricks with DMA peeking

This patch fixes crashes when usbmon attempts to access GART aperture.
The old code attempted to take a bus address and convert it into a
virtual address, which clearly was impossible on systems with actual
IOMMUs. Let us not persist in this foolishness, and use transfer_buffer
in all cases instead.

I think downsides are negligible. The ones I see are:
 - A driver may pass an address of one buffer down as transfer_buffer,
   and entirely different entity mapped for DMA, resulting in misleading
   output of usbmon. Note, however, that PIO based controllers would
   do transfer the same data that usbmon sees here.
 - Out of tree drivers may crash usbmon if they store garbage in
   transfer_buffer. I inspected the in-tree drivers, and clarified
   the documentation in comments.
 - Drivers that use get_user_pages will not be possible to monitor.
   I only found one driver with this problem (drivers/staging/rspiusb).
 - Same happens with with usb_storage transferring from highmem, but
   it works fine on 64-bit systems, so I think it's not a concern.
   At least we don't crash anymore.

Why didn't we do this in 2.6.10? That's because back in those days
it was popular not to fill in transfer_buffer, so almost all
traffic would be invisible (e.g. all of HID was like that).
But now, the tree is almost 100% PIO friendly, so we can do the
right thing at last.

Signed-off-by: Pete Zaitcev <zaitcev@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/mon/Makefile   |  2 +-
 drivers/usb/mon/mon_bin.c  | 12 +-----
 drivers/usb/mon/mon_dma.c  | 95 ----------------------------------------------
 drivers/usb/mon/mon_main.c |  1 -
 drivers/usb/mon/mon_text.c | 14 -------
 drivers/usb/mon/usb_mon.h  | 14 -------
 include/linux/usb.h        | 19 +++++++---
 7 files changed, 15 insertions(+), 142 deletions(-)
 delete mode 100644 drivers/usb/mon/mon_dma.c

(limited to 'include/linux')

diff --git a/drivers/usb/mon/Makefile b/drivers/usb/mon/Makefile
index c6516b566731..384b198faa7c 100644
--- a/drivers/usb/mon/Makefile
+++ b/drivers/usb/mon/Makefile
@@ -2,6 +2,6 @@
 # Makefile for USB monitor
 #
 
-usbmon-objs	:= mon_main.o mon_stat.o mon_text.o mon_bin.o mon_dma.o
+usbmon-objs	:= mon_main.o mon_stat.o mon_text.o mon_bin.o
 
 obj-$(CONFIG_USB_MON)	+= usbmon.o
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index 0f7a30b7d2d1..dfdc43e2e00d 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -220,9 +220,8 @@ static void mon_free_buff(struct mon_pgmap *map, int npages);
 
 /*
  * This is a "chunked memcpy". It does not manipulate any counters.
- * But it returns the new offset for repeated application.
  */
-unsigned int mon_copy_to_buff(const struct mon_reader_bin *this,
+static void mon_copy_to_buff(const struct mon_reader_bin *this,
     unsigned int off, const unsigned char *from, unsigned int length)
 {
 	unsigned int step_len;
@@ -247,7 +246,6 @@ unsigned int mon_copy_to_buff(const struct mon_reader_bin *this,
 		from += step_len;
 		length -= step_len;
 	}
-	return off;
 }
 
 /*
@@ -400,15 +398,8 @@ static char mon_bin_get_data(const struct mon_reader_bin *rp,
     unsigned int offset, struct urb *urb, unsigned int length)
 {
 
-	if (urb->dev->bus->uses_dma &&
-	    (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)) {
-		mon_dmapeek_vec(rp, offset, urb->transfer_dma, length);
-		return 0;
-	}
-
 	if (urb->transfer_buffer == NULL)
 		return 'Z';
-
 	mon_copy_to_buff(rp, offset, urb->transfer_buffer, length);
 	return 0;
 }
@@ -635,7 +626,6 @@ static int mon_bin_open(struct inode *inode, struct file *file)
 	spin_lock_init(&rp->b_lock);
 	init_waitqueue_head(&rp->b_wait);
 	mutex_init(&rp->fetch_lock);
-
 	rp->b_size = BUFF_DFL;
 
 	size = sizeof(struct mon_pgmap) * (rp->b_size/CHUNK_SIZE);
diff --git a/drivers/usb/mon/mon_dma.c b/drivers/usb/mon/mon_dma.c
deleted file mode 100644
index 140cc80bd2b1..000000000000
--- a/drivers/usb/mon/mon_dma.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * The USB Monitor, inspired by Dave Harding's USBMon.
- *
- * mon_dma.c: Library which snoops on DMA areas.
- *
- * Copyright (C) 2005 Pete Zaitcev (zaitcev@redhat.com)
- */
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/highmem.h>
-#include <asm/page.h>
-
-#include <linux/usb.h>	/* Only needed for declarations in usb_mon.h */
-#include "usb_mon.h"
-
-/*
- * PC-compatibles, are, fortunately, sufficiently cache-coherent for this.
- */
-#if defined(__i386__) || defined(__x86_64__) /* CONFIG_ARCH_I386 doesn't exit */
-#define MON_HAS_UNMAP 1
-
-#define phys_to_page(phys)	pfn_to_page((phys) >> PAGE_SHIFT)
-
-char mon_dmapeek(unsigned char *dst, dma_addr_t dma_addr, int len)
-{
-	struct page *pg;
-	unsigned long flags;
-	unsigned char *map;
-	unsigned char *ptr;
-
-	/*
-	 * On i386, a DMA handle is the "physical" address of a page.
-	 * In other words, the bus address is equal to physical address.
-	 * There is no IOMMU.
-	 */
-	pg = phys_to_page(dma_addr);
-
-	/*
-	 * We are called from hardware IRQs in case of callbacks.
-	 * But we can be called from softirq or process context in case
-	 * of submissions. In such case, we need to protect KM_IRQ0.
-	 */
-	local_irq_save(flags);
-	map = kmap_atomic(pg, KM_IRQ0);
-	ptr = map + (dma_addr & (PAGE_SIZE-1));
-	memcpy(dst, ptr, len);
-	kunmap_atomic(map, KM_IRQ0);
-	local_irq_restore(flags);
-	return 0;
-}
-
-void mon_dmapeek_vec(const struct mon_reader_bin *rp,
-    unsigned int offset, dma_addr_t dma_addr, unsigned int length)
-{
-	unsigned long flags;
-	unsigned int step_len;
-	struct page *pg;
-	unsigned char *map;
-	unsigned long page_off, page_len;
-
-	local_irq_save(flags);
-	while (length) {
-		/* compute number of bytes we are going to copy in this page */
-		step_len = length;
-		page_off = dma_addr & (PAGE_SIZE-1);
-		page_len = PAGE_SIZE - page_off;
-		if (page_len < step_len)
-			step_len = page_len;
-
-		/* copy data and advance pointers */
-		pg = phys_to_page(dma_addr);
-		map = kmap_atomic(pg, KM_IRQ0);
-		offset = mon_copy_to_buff(rp, offset, map + page_off, step_len);
-		kunmap_atomic(map, KM_IRQ0);
-		dma_addr += step_len;
-		length -= step_len;
-	}
-	local_irq_restore(flags);
-}
-
-#endif /* __i386__ */
-
-#ifndef MON_HAS_UNMAP
-char mon_dmapeek(unsigned char *dst, dma_addr_t dma_addr, int len)
-{
-	return 'D';
-}
-
-void mon_dmapeek_vec(const struct mon_reader_bin *rp,
-    unsigned int offset, dma_addr_t dma_addr, unsigned int length)
-{
-	;
-}
-
-#endif /* MON_HAS_UNMAP */
diff --git a/drivers/usb/mon/mon_main.c b/drivers/usb/mon/mon_main.c
index 5e0ab4201c00..e0c2db3b767b 100644
--- a/drivers/usb/mon/mon_main.c
+++ b/drivers/usb/mon/mon_main.c
@@ -361,7 +361,6 @@ static int __init mon_init(void)
 	}
 	// MOD_INC_USE_COUNT(which_module?);
 
-
 	mutex_lock(&usb_bus_list_lock);
 	list_for_each_entry (ubus, &usb_bus_list, bus_list) {
 		mon_bus_init(ubus);
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index a7eb4c99342c..9f1a9227ebe6 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -150,20 +150,6 @@ static inline char mon_text_get_data(struct mon_event_text *ep, struct urb *urb,
 			return '>';
 	}
 
-	/*
-	 * The check to see if it's safe to poke at data has an enormous
-	 * number of corner cases, but it seems that the following is
-	 * more or less safe.
-	 *
-	 * We do not even try to look at transfer_buffer, because it can
-	 * contain non-NULL garbage in case the upper level promised to
-	 * set DMA for the HCD.
-	 */
-	if (urb->dev->bus->uses_dma &&
-	    (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)) {
-		return mon_dmapeek(ep->data, urb->transfer_dma, len);
-	}
-
 	if (urb->transfer_buffer == NULL)
 		return 'Z';	/* '0' would be not as pretty. */
 
diff --git a/drivers/usb/mon/usb_mon.h b/drivers/usb/mon/usb_mon.h
index f5d84ff8c101..df9a4df342c7 100644
--- a/drivers/usb/mon/usb_mon.h
+++ b/drivers/usb/mon/usb_mon.h
@@ -64,20 +64,6 @@ void mon_text_exit(void);
 int __init mon_bin_init(void);
 void mon_bin_exit(void);
 
-/*
- * DMA interface.
- *
- * XXX The vectored side needs a serious re-thinking. Abstracting vectors,
- * like in Paolo's original patch, produces a double pkmap. We need an idea.
-*/
-extern char mon_dmapeek(unsigned char *dst, dma_addr_t dma_addr, int len);
-
-struct mon_reader_bin;
-extern void mon_dmapeek_vec(const struct mon_reader_bin *rp,
-    unsigned int offset, dma_addr_t dma_addr, unsigned int len);
-extern unsigned int mon_copy_to_buff(const struct mon_reader_bin *rp,
-    unsigned int offset, const unsigned char *from, unsigned int len);
-
 /*
  */
 extern struct mutex mon_lock;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 19fabc487beb..3b45a0d27b80 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1036,9 +1036,10 @@ typedef void (*usb_complete_t)(struct urb *);
  * @transfer_flags: A variety of flags may be used to affect how URB
  *	submission, unlinking, or operation are handled.  Different
  *	kinds of URB can use different flags.
- * @transfer_buffer:  This identifies the buffer to (or from) which
- * 	the I/O request will be performed (unless URB_NO_TRANSFER_DMA_MAP
- *	is set).  This buffer must be suitable for DMA; allocate it with
+ * @transfer_buffer:  This identifies the buffer to (or from) which the I/O
+ *	request will be performed unless URB_NO_TRANSFER_DMA_MAP is set
+ *	(however, do not leave garbage in transfer_buffer even then).
+ *	This buffer must be suitable for DMA; allocate it with
  *	kmalloc() or equivalent.  For transfers to "in" endpoints, contents
  *	of this buffer will be modified.  This buffer is used for the data
  *	stage of control transfers.
@@ -1104,9 +1105,15 @@ typedef void (*usb_complete_t)(struct urb *);
  * allocate a DMA buffer with usb_buffer_alloc() or call usb_buffer_map().
  * When these transfer flags are provided, host controller drivers will
  * attempt to use the dma addresses found in the transfer_dma and/or
- * setup_dma fields rather than determining a dma address themselves.  (Note
- * that transfer_buffer and setup_packet must still be set because not all
- * host controllers use DMA, nor do virtual root hubs).
+ * setup_dma fields rather than determining a dma address themselves.
+ *
+ * Note that transfer_buffer must still be set if the controller
+ * does not support DMA (as indicated by bus.uses_dma) and when talking
+ * to root hub. If you have to trasfer between highmem zone and the device
+ * on such controller, create a bounce buffer or bail out with an error.
+ * If transfer_buffer cannot be set (is in highmem) and the controller is DMA
+ * capable, assign NULL to it, so that usbmon knows not to use the value.
+ * The setup_packet must always be set, so it cannot be located in highmem.
  *
  * Initialization:
  *
-- 
cgit v1.2.3-71-gd317


From 85e08ca54c5c203cd2638f0fc8fa899a539f6254 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@skynet.be>
Date: Sun, 21 Jun 2009 23:19:23 +0200
Subject: USB: Move endpoint sync type definitions from usb/audio.h to
 usb/ch9.h

And use the new definitions in the USB Audio Class gadget driver.

Signed-off-by: Laurent Pinchart <laurent.pinchart@skynet.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/f_audio.c |  2 +-
 include/linux/usb/audio.h    | 10 ----------
 include/linux/usb/ch9.h      |  6 ++++++
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c
index 66527ba2d2ea..76afbd1b515f 100644
--- a/drivers/usb/gadget/f_audio.c
+++ b/drivers/usb/gadget/f_audio.c
@@ -174,7 +174,7 @@ static struct usb_endpoint_descriptor as_out_ep_desc __initdata = {
 	.bLength =		USB_DT_ENDPOINT_AUDIO_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 	.bEndpointAddress =	USB_DIR_OUT,
-	.bmAttributes =		USB_AS_ENDPOINT_ADAPTIVE
+	.bmAttributes =		USB_ENDPOINT_SYNC_ADAPTIVE
 				| USB_ENDPOINT_XFER_ISOC,
 	.wMaxPacketSize =	__constant_cpu_to_le16(OUT_EP_MAX_PACKET_SIZE),
 	.bInterval =		4,
diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h
index b5744bc218ab..bcf4fb4f5e82 100644
--- a/include/linux/usb/audio.h
+++ b/include/linux/usb/audio.h
@@ -46,12 +46,6 @@
 #define MIDI_IN_JACK			0x02
 #define MIDI_OUT_JACK			0x03
 
-/* endpoint attributes */
-#define EP_ATTR_MASK			0x0c
-#define EP_ATTR_ASYNC			0x04
-#define EP_ATTR_ADAPTIVE		0x08
-#define EP_ATTR_SYNC			0x0c
-
 /* cs endpoint attributes */
 #define EP_CS_ATTR_SAMPLE_RATE		0x01
 #define EP_CS_ATTR_PITCH_CONTROL	0x02
@@ -244,10 +238,6 @@ struct usb_as_formate_type_i_discrete_descriptor_##n {		\
 #define USB_AS_FORMAT_TYPE_II		0x2
 #define USB_AS_FORMAT_TYPE_III		0x3
 
-#define USB_AS_ENDPOINT_ASYNC		(1 << 2)
-#define USB_AS_ENDPOINT_ADAPTIVE	(2 << 2)
-#define USB_AS_ENDPOINT_SYNC		(3 << 2)
-
 struct usb_as_iso_endpoint_descriptor {
 	__u8  bLength;			/* in bytes: 7 */
 	__u8  bDescriptorType;		/* USB_DT_CS_ENDPOINT */
diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index 93223638f702..8f8b7411b87b 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -348,6 +348,12 @@ struct usb_endpoint_descriptor {
 #define USB_ENDPOINT_NUMBER_MASK	0x0f	/* in bEndpointAddress */
 #define USB_ENDPOINT_DIR_MASK		0x80
 
+#define USB_ENDPOINT_SYNCTYPE		0x0c
+#define USB_ENDPOINT_SYNC_NONE		(0 << 2)
+#define USB_ENDPOINT_SYNC_ASYNC		(1 << 2)
+#define USB_ENDPOINT_SYNC_ADAPTIVE	(2 << 2)
+#define USB_ENDPOINT_SYNC_SYNC		(3 << 2)
+
 #define USB_ENDPOINT_XFERTYPE_MASK	0x03	/* in bmAttributes */
 #define USB_ENDPOINT_XFER_CONTROL	0
 #define USB_ENDPOINT_XFER_ISOC		1
-- 
cgit v1.2.3-71-gd317


From 315ad3028c8aae14891797040f855fc3291a076b Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@skynet.be>
Date: Sun, 21 Jun 2009 23:20:39 +0200
Subject: USB: Move vendor subclass definition from usb/audio.h to usb/ch9.h

USB_SUBCLASS_VENDOR_SPEC is common to several USB classes and as such belongs
to usb/ch9.h.

Signed-off-by: Laurent Pinchart <laurent.pinchart@skynet.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/audio.h | 1 -
 include/linux/usb/ch9.h   | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h
index bcf4fb4f5e82..f75092aba02f 100644
--- a/include/linux/usb/audio.h
+++ b/include/linux/usb/audio.h
@@ -24,7 +24,6 @@
 #define USB_SUBCLASS_AUDIOCONTROL	0x01
 #define USB_SUBCLASS_AUDIOSTREAMING	0x02
 #define USB_SUBCLASS_MIDISTREAMING	0x03
-#define USB_SUBCLASS_VENDOR_SPEC	0xff
 
 /* A.5 Audio Class-Specific AC interface Descriptor Subtypes*/
 #define HEADER				0x01
diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index 8f8b7411b87b..94012e649d86 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -258,6 +258,8 @@ struct usb_device_descriptor {
 #define USB_CLASS_APP_SPEC		0xfe
 #define USB_CLASS_VENDOR_SPEC		0xff
 
+#define USB_SUBCLASS_VENDOR_SPEC	0xff
+
 /*-------------------------------------------------------------------------*/
 
 /* USB_DT_CONFIG: Configuration descriptor information.
-- 
cgit v1.2.3-71-gd317


From 512ad27d8667158747de2e8da8a23e8f50e91856 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@skynet.be>
Date: Sun, 21 Jun 2009 23:23:05 +0200
Subject: USB audio gadget: Prefix all macro definitions with UAC_ in
 linux/usb/audio.h

linux/usb/audio.h is a public header file that includes definitions
exported to userspace. To avoid namespace clashes, prefix all macro
definitions with UAC_. Existing macros and structures prefixed with
USB_AC_ and USB_AS_ are renamed for consistency.

Signed-off-by: Laurent Pinchart <laurent.pinchart@skynet.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/audio.c   |  24 ++--
 drivers/usb/gadget/f_audio.c |  80 +++++++-------
 drivers/usb/gadget/gmidi.c   |   8 +-
 include/linux/usb/audio.h    | 258 ++++++++++++++++++++++---------------------
 4 files changed, 188 insertions(+), 182 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/audio.c b/drivers/usb/gadget/audio.c
index 9f80f4e970bd..a3a0f4a27ef0 100644
--- a/drivers/usb/gadget/audio.c
+++ b/drivers/usb/gadget/audio.c
@@ -106,20 +106,20 @@ static int audio_set_endpoint_req(struct usb_configuration *c,
 			ctrl->bRequest, w_value, len, ep);
 
 	switch (ctrl->bRequest) {
-	case SET_CUR:
+	case UAC_SET_CUR:
 		value = 0;
 		break;
 
-	case SET_MIN:
+	case UAC_SET_MIN:
 		break;
 
-	case SET_MAX:
+	case UAC_SET_MAX:
 		break;
 
-	case SET_RES:
+	case UAC_SET_RES:
 		break;
 
-	case SET_MEM:
+	case UAC_SET_MEM:
 		break;
 
 	default:
@@ -142,13 +142,13 @@ static int audio_get_endpoint_req(struct usb_configuration *c,
 			ctrl->bRequest, w_value, len, ep);
 
 	switch (ctrl->bRequest) {
-	case GET_CUR:
-	case GET_MIN:
-	case GET_MAX:
-	case GET_RES:
+	case UAC_GET_CUR:
+	case UAC_GET_MIN:
+	case UAC_GET_MAX:
+	case UAC_GET_RES:
 		value = 3;
 		break;
-	case GET_MEM:
+	case UAC_GET_MEM:
 		break;
 	default:
 		break;
@@ -171,11 +171,11 @@ audio_setup(struct usb_configuration *c, const struct usb_ctrlrequest *ctrl)
 	 * Audio class messages; interface activation uses set_alt().
 	 */
 	switch (ctrl->bRequestType) {
-	case USB_AUDIO_SET_ENDPOINT:
+	case USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
 		value = audio_set_endpoint_req(c, ctrl);
 		break;
 
-	case USB_AUDIO_GET_ENDPOINT:
+	case USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
 		value = audio_get_endpoint_req(c, ctrl);
 		break;
 
diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c
index 76afbd1b515f..7b05b3c8c0b1 100644
--- a/drivers/usb/gadget/f_audio.c
+++ b/drivers/usb/gadget/f_audio.c
@@ -50,16 +50,16 @@ static struct usb_interface_descriptor ac_interface_desc __initdata = {
 	.bInterfaceSubClass =	USB_SUBCLASS_AUDIOCONTROL,
 };
 
-DECLARE_USB_AC_HEADER_DESCRIPTOR(2);
+DECLARE_UAC_AC_HEADER_DESCRIPTOR(2);
 
-#define USB_DT_AC_HEADER_LENGH	USB_DT_AC_HEADER_SIZE(F_AUDIO_NUM_INTERFACES)
+#define UAC_DT_AC_HEADER_LENGTH	UAC_DT_AC_HEADER_SIZE(F_AUDIO_NUM_INTERFACES)
 /* B.3.2  Class-Specific AC Interface Descriptor */
-static struct usb_ac_header_descriptor_2 ac_header_desc = {
-	.bLength =		USB_DT_AC_HEADER_LENGH,
+static struct uac_ac_header_descriptor_2 ac_header_desc = {
+	.bLength =		UAC_DT_AC_HEADER_LENGTH,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
-	.bDescriptorSubtype =	HEADER,
+	.bDescriptorSubtype =	UAC_HEADER,
 	.bcdADC =		__constant_cpu_to_le16(0x0100),
-	.wTotalLength =		__constant_cpu_to_le16(USB_DT_AC_HEADER_LENGH),
+	.wTotalLength =		__constant_cpu_to_le16(UAC_DT_AC_HEADER_LENGTH),
 	.bInCollection =	F_AUDIO_NUM_INTERFACES,
 	.baInterfaceNr = {
 		[0] =		F_AUDIO_AC_INTERFACE,
@@ -68,33 +68,33 @@ static struct usb_ac_header_descriptor_2 ac_header_desc = {
 };
 
 #define INPUT_TERMINAL_ID	1
-static struct usb_input_terminal_descriptor input_terminal_desc = {
-	.bLength =		USB_DT_AC_INPUT_TERMINAL_SIZE,
+static struct uac_input_terminal_descriptor input_terminal_desc = {
+	.bLength =		UAC_DT_INPUT_TERMINAL_SIZE,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
-	.bDescriptorSubtype =	INPUT_TERMINAL,
+	.bDescriptorSubtype =	UAC_INPUT_TERMINAL,
 	.bTerminalID =		INPUT_TERMINAL_ID,
-	.wTerminalType =	USB_AC_TERMINAL_STREAMING,
+	.wTerminalType =	UAC_TERMINAL_STREAMING,
 	.bAssocTerminal =	0,
 	.wChannelConfig =	0x3,
 };
 
-DECLARE_USB_AC_FEATURE_UNIT_DESCRIPTOR(0);
+DECLARE_UAC_FEATURE_UNIT_DESCRIPTOR(0);
 
 #define FEATURE_UNIT_ID		2
-static struct usb_ac_feature_unit_descriptor_0 feature_unit_desc = {
-	.bLength		= USB_DT_AC_FEATURE_UNIT_SIZE(0),
+static struct uac_feature_unit_descriptor_0 feature_unit_desc = {
+	.bLength		= UAC_DT_FEATURE_UNIT_SIZE(0),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubtype	= FEATURE_UNIT,
+	.bDescriptorSubtype	= UAC_FEATURE_UNIT,
 	.bUnitID		= FEATURE_UNIT_ID,
 	.bSourceID		= INPUT_TERMINAL_ID,
 	.bControlSize		= 2,
-	.bmaControls[0]		= (FU_MUTE | FU_VOLUME),
+	.bmaControls[0]		= (UAC_FU_MUTE | UAC_FU_VOLUME),
 };
 
 static struct usb_audio_control mute_control = {
 	.list = LIST_HEAD_INIT(mute_control.list),
 	.name = "Mute Control",
-	.type = MUTE_CONTROL,
+	.type = UAC_MUTE_CONTROL,
 	/* Todo: add real Mute control code */
 	.set = generic_set_cmd,
 	.get = generic_get_cmd,
@@ -103,7 +103,7 @@ static struct usb_audio_control mute_control = {
 static struct usb_audio_control volume_control = {
 	.list = LIST_HEAD_INIT(volume_control.list),
 	.name = "Volume Control",
-	.type = VOLUME_CONTROL,
+	.type = UAC_VOLUME_CONTROL,
 	/* Todo: add real Volume control code */
 	.set = generic_set_cmd,
 	.get = generic_get_cmd,
@@ -113,17 +113,17 @@ static struct usb_audio_control_selector feature_unit = {
 	.list = LIST_HEAD_INIT(feature_unit.list),
 	.id = FEATURE_UNIT_ID,
 	.name = "Mute & Volume Control",
-	.type = FEATURE_UNIT,
+	.type = UAC_FEATURE_UNIT,
 	.desc = (struct usb_descriptor_header *)&feature_unit_desc,
 };
 
 #define OUTPUT_TERMINAL_ID	3
-static struct usb_output_terminal_descriptor output_terminal_desc = {
-	.bLength		= USB_DT_AC_OUTPUT_TERMINAL_SIZE,
+static struct uac_output_terminal_descriptor output_terminal_desc = {
+	.bLength		= UAC_DT_OUTPUT_TERMINAL_SIZE,
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubtype	= OUTPUT_TERMINAL,
+	.bDescriptorSubtype	= UAC_OUTPUT_TERMINAL,
 	.bTerminalID		= OUTPUT_TERMINAL_ID,
-	.wTerminalType		= USB_AC_OUTPUT_TERMINAL_SPEAKER,
+	.wTerminalType		= UAC_OUTPUT_TERMINAL_SPEAKER,
 	.bAssocTerminal		= FEATURE_UNIT_ID,
 	.bSourceID		= FEATURE_UNIT_ID,
 };
@@ -148,22 +148,22 @@ static struct usb_interface_descriptor as_interface_alt_1_desc = {
 };
 
 /* B.4.2  Class-Specific AS Interface Descriptor */
-static struct usb_as_header_descriptor as_header_desc = {
-	.bLength =		USB_DT_AS_HEADER_SIZE,
+static struct uac_as_header_descriptor as_header_desc = {
+	.bLength =		UAC_DT_AS_HEADER_SIZE,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
-	.bDescriptorSubtype =	AS_GENERAL,
+	.bDescriptorSubtype =	UAC_AS_GENERAL,
 	.bTerminalLink =	INPUT_TERMINAL_ID,
 	.bDelay =		1,
-	.wFormatTag =		USB_AS_AUDIO_FORMAT_TYPE_I_PCM,
+	.wFormatTag =		UAC_FORMAT_TYPE_I_PCM,
 };
 
-DECLARE_USB_AS_FORMAT_TYPE_I_DISCRETE_DESC(1);
+DECLARE_UAC_FORMAT_TYPE_I_DISCRETE_DESC(1);
 
-static struct usb_as_formate_type_i_discrete_descriptor_1 as_type_i_desc = {
-	.bLength =		USB_AS_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(1),
+static struct uac_format_type_i_discrete_descriptor_1 as_type_i_desc = {
+	.bLength =		UAC_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(1),
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
-	.bDescriptorSubtype =	FORMAT_TYPE,
-	.bFormatType =		USB_AS_FORMAT_TYPE_I,
+	.bDescriptorSubtype =	UAC_FORMAT_TYPE,
+	.bFormatType =		UAC_FORMAT_TYPE_I,
 	.bSubframeSize =	2,
 	.bBitResolution =	16,
 	.bSamFreqType =		1,
@@ -181,10 +181,10 @@ static struct usb_endpoint_descriptor as_out_ep_desc __initdata = {
 };
 
 /* Class-specific AS ISO OUT Endpoint Descriptor */
-static struct usb_as_iso_endpoint_descriptor as_iso_out_desc __initdata = {
-	.bLength =		USB_AS_ISO_ENDPOINT_DESC_SIZE,
+static struct uac_iso_endpoint_descriptor as_iso_out_desc __initdata = {
+	.bLength =		UAC_ISO_ENDPOINT_DESC_SIZE,
 	.bDescriptorType =	USB_DT_CS_ENDPOINT,
-	.bDescriptorSubtype =	EP_GENERAL,
+	.bDescriptorSubtype =	UAC_EP_GENERAL,
 	.bmAttributes = 	1,
 	.bLockDelayUnits =	1,
 	.wLockDelay =		__constant_cpu_to_le16(1),
@@ -456,11 +456,11 @@ f_audio_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl)
 	 * Audio class messages; interface activation uses set_alt().
 	 */
 	switch (ctrl->bRequestType) {
-	case USB_AUDIO_SET_INTF:
+	case USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE:
 		value = audio_set_intf_req(f, ctrl);
 		break;
 
-	case USB_AUDIO_GET_INTF:
+	case USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE:
 		value = audio_get_intf_req(f, ctrl);
 		break;
 
@@ -642,10 +642,10 @@ int __init control_selector_init(struct f_audio *audio)
 	list_add(&mute_control.list, &feature_unit.control);
 	list_add(&volume_control.list, &feature_unit.control);
 
-	volume_control.data[_CUR] = 0xffc0;
-	volume_control.data[_MIN] = 0xe3a0;
-	volume_control.data[_MAX] = 0xfff0;
-	volume_control.data[_RES] = 0x0030;
+	volume_control.data[UAC__CUR] = 0xffc0;
+	volume_control.data[UAC__MIN] = 0xe3a0;
+	volume_control.data[UAC__MAX] = 0xfff0;
+	volume_control.data[UAC__RES] = 0x0030;
 
 	return 0;
 }
diff --git a/drivers/usb/gadget/gmidi.c b/drivers/usb/gadget/gmidi.c
index b9312dc6e041..d0b1e836f0e0 100644
--- a/drivers/usb/gadget/gmidi.c
+++ b/drivers/usb/gadget/gmidi.c
@@ -191,7 +191,7 @@ module_param(qlen, uint, S_IRUGO);
 #define GMIDI_MS_INTERFACE	1
 #define GMIDI_NUM_INTERFACES	2
 
-DECLARE_USB_AC_HEADER_DESCRIPTOR(1);
+DECLARE_UAC_AC_HEADER_DESCRIPTOR(1);
 DECLARE_USB_MIDI_OUT_JACK_DESCRIPTOR(1);
 DECLARE_USB_MS_ENDPOINT_DESCRIPTOR(1);
 
@@ -237,12 +237,12 @@ static const struct usb_interface_descriptor ac_interface_desc = {
 };
 
 /* B.3.2  Class-Specific AC Interface Descriptor */
-static const struct usb_ac_header_descriptor_1 ac_header_desc = {
-	.bLength =		USB_DT_AC_HEADER_SIZE(1),
+static const struct uac_ac_header_descriptor_1 ac_header_desc = {
+	.bLength =		UAC_DT_AC_HEADER_SIZE(1),
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubtype =	USB_MS_HEADER,
 	.bcdADC =		cpu_to_le16(0x0100),
-	.wTotalLength =		cpu_to_le16(USB_DT_AC_HEADER_SIZE(1)),
+	.wTotalLength =		cpu_to_le16(UAC_DT_AC_HEADER_SIZE(1)),
 	.bInCollection =	1,
 	.baInterfaceNr = {
 		[0] =		GMIDI_MS_INTERFACE,
diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h
index f75092aba02f..c3edfcb72c34 100644
--- a/include/linux/usb/audio.h
+++ b/include/linux/usb/audio.h
@@ -25,80 +25,77 @@
 #define USB_SUBCLASS_AUDIOSTREAMING	0x02
 #define USB_SUBCLASS_MIDISTREAMING	0x03
 
-/* A.5 Audio Class-Specific AC interface Descriptor Subtypes*/
-#define HEADER				0x01
-#define INPUT_TERMINAL			0x02
-#define OUTPUT_TERMINAL			0x03
-#define MIXER_UNIT			0x04
-#define SELECTOR_UNIT			0x05
-#define FEATURE_UNIT			0x06
-#define PROCESSING_UNIT			0x07
-#define EXTENSION_UNIT			0x08
-
-#define AS_GENERAL			0x01
-#define FORMAT_TYPE			0x02
-#define FORMAT_SPECIFIC			0x03
-
-#define EP_GENERAL			0x01
-
-#define MS_GENERAL			0x01
-#define MIDI_IN_JACK			0x02
-#define MIDI_OUT_JACK			0x03
-
-/* cs endpoint attributes */
-#define EP_CS_ATTR_SAMPLE_RATE		0x01
-#define EP_CS_ATTR_PITCH_CONTROL	0x02
-#define EP_CS_ATTR_FILL_MAX		0x80
-
-/* Audio Class specific Request Codes */
-#define USB_AUDIO_SET_INTF		0x21
-#define USB_AUDIO_SET_ENDPOINT		0x22
-#define USB_AUDIO_GET_INTF		0xa1
-#define USB_AUDIO_GET_ENDPOINT		0xa2
-
-#define SET_	0x00
-#define GET_	0x80
-
-#define _CUR	0x1
-#define _MIN	0x2
-#define _MAX	0x3
-#define _RES	0x4
-#define _MEM	0x5
-
-#define SET_CUR		(SET_ | _CUR)
-#define GET_CUR		(GET_ | _CUR)
-#define SET_MIN		(SET_ | _MIN)
-#define GET_MIN		(GET_ | _MIN)
-#define SET_MAX		(SET_ | _MAX)
-#define GET_MAX		(GET_ | _MAX)
-#define SET_RES		(SET_ | _RES)
-#define GET_RES		(GET_ | _RES)
-#define SET_MEM		(SET_ | _MEM)
-#define GET_MEM		(GET_ | _MEM)
-
-#define GET_STAT	0xff
-
-#define USB_AC_TERMINAL_UNDEFINED	0x100
-#define USB_AC_TERMINAL_STREAMING	0x101
-#define USB_AC_TERMINAL_VENDOR_SPEC	0x1FF
+/* A.5 Audio Class-Specific AC Interface Descriptor Subtypes */
+#define UAC_HEADER			0x01
+#define UAC_INPUT_TERMINAL		0x02
+#define UAC_OUTPUT_TERMINAL		0x03
+#define UAC_MIXER_UNIT			0x04
+#define UAC_SELECTOR_UNIT		0x05
+#define UAC_FEATURE_UNIT		0x06
+#define UAC_PROCESSING_UNIT		0x07
+#define UAC_EXTENSION_UNIT		0x08
+
+/* A.6 Audio Class-Specific AS Interface Descriptor Subtypes */
+#define UAC_AS_GENERAL			0x01
+#define UAC_FORMAT_TYPE			0x02
+#define UAC_FORMAT_SPECIFIC		0x03
+
+/* A.8 Audio Class-Specific Endpoint Descriptor Subtypes */
+#define UAC_EP_GENERAL			0x01
+
+/* A.9 Audio Class-Specific Request Codes */
+#define UAC_SET_			0x00
+#define UAC_GET_			0x80
+
+#define UAC__CUR			0x1
+#define UAC__MIN			0x2
+#define UAC__MAX			0x3
+#define UAC__RES			0x4
+#define UAC__MEM			0x5
+
+#define UAC_SET_CUR			(UAC_SET_ | UAC__CUR)
+#define UAC_GET_CUR			(UAC_GET_ | UAC__CUR)
+#define UAC_SET_MIN			(UAC_SET_ | UAC__MIN)
+#define UAC_GET_MIN			(UAC_GET_ | UAC__MIN)
+#define UAC_SET_MAX			(UAC_SET_ | UAC__MAX)
+#define UAC_GET_MAX			(UAC_GET_ | UAC__MAX)
+#define UAC_SET_RES			(UAC_SET_ | UAC__RES)
+#define UAC_GET_RES			(UAC_GET_ | UAC__RES)
+#define UAC_SET_MEM			(UAC_SET_ | UAC__MEM)
+#define UAC_GET_MEM			(UAC_GET_ | UAC__MEM)
+
+#define UAC_GET_STAT			0xff
+
+/* MIDI - A.1 MS Class-Specific Interface Descriptor Subtypes */
+#define UAC_MS_HEADER			0x01
+#define UAC_MIDI_IN_JACK		0x02
+#define UAC_MIDI_OUT_JACK		0x03
+
+/* MIDI - A.1 MS Class-Specific Endpoint Descriptor Subtypes */
+#define UAC_MS_GENERAL			0x01
+
+/* Terminals - 2.1 USB Terminal Types */
+#define UAC_TERMINAL_UNDEFINED		0x100
+#define UAC_TERMINAL_STREAMING		0x101
+#define UAC_TERMINAL_VENDOR_SPEC	0x1FF
 
 /* Terminal Control Selectors */
 /* 4.3.2  Class-Specific AC Interface Descriptor */
-struct usb_ac_header_descriptor {
+struct uac_ac_header_descriptor {
 	__u8  bLength;			/* 8 + n */
 	__u8  bDescriptorType;		/* USB_DT_CS_INTERFACE */
-	__u8  bDescriptorSubtype;	/* USB_MS_HEADER */
+	__u8  bDescriptorSubtype;	/* UAC_MS_HEADER */
 	__le16 bcdADC;			/* 0x0100 */
 	__le16 wTotalLength;		/* includes Unit and Terminal desc. */
 	__u8  bInCollection;		/* n */
 	__u8  baInterfaceNr[];		/* [n] */
 } __attribute__ ((packed));
 
-#define USB_DT_AC_HEADER_SIZE(n)	(8 + (n))
+#define UAC_DT_AC_HEADER_SIZE(n)	(8 + (n))
 
 /* As above, but more useful for defining your own descriptors: */
-#define DECLARE_USB_AC_HEADER_DESCRIPTOR(n) 			\
-struct usb_ac_header_descriptor_##n {				\
+#define DECLARE_UAC_AC_HEADER_DESCRIPTOR(n) 			\
+struct uac_ac_header_descriptor_##n {				\
 	__u8  bLength;						\
 	__u8  bDescriptorType;					\
 	__u8  bDescriptorSubtype;				\
@@ -109,7 +106,7 @@ struct usb_ac_header_descriptor_##n {				\
 } __attribute__ ((packed))
 
 /* 4.3.2.1 Input Terminal Descriptor */
-struct usb_input_terminal_descriptor {
+struct uac_input_terminal_descriptor {
 	__u8  bLength;			/* in bytes: 12 */
 	__u8  bDescriptorType;		/* CS_INTERFACE descriptor type */
 	__u8  bDescriptorSubtype;	/* INPUT_TERMINAL descriptor subtype */
@@ -122,18 +119,19 @@ struct usb_input_terminal_descriptor {
 	__u8  iTerminal;
 } __attribute__ ((packed));
 
-#define USB_DT_AC_INPUT_TERMINAL_SIZE			12
+#define UAC_DT_INPUT_TERMINAL_SIZE			12
 
-#define USB_AC_INPUT_TERMINAL_UNDEFINED			0x200
-#define USB_AC_INPUT_TERMINAL_MICROPHONE		0x201
-#define USB_AC_INPUT_TERMINAL_DESKTOP_MICROPHONE	0x202
-#define USB_AC_INPUT_TERMINAL_PERSONAL_MICROPHONE	0x203
-#define USB_AC_INPUT_TERMINAL_OMNI_DIR_MICROPHONE	0x204
-#define USB_AC_INPUT_TERMINAL_MICROPHONE_ARRAY		0x205
-#define USB_AC_INPUT_TERMINAL_PROC_MICROPHONE_ARRAY	0x206
+/* Terminals - 2.2 Input Terminal Types */
+#define UAC_INPUT_TERMINAL_UNDEFINED			0x200
+#define UAC_INPUT_TERMINAL_MICROPHONE			0x201
+#define UAC_INPUT_TERMINAL_DESKTOP_MICROPHONE		0x202
+#define UAC_INPUT_TERMINAL_PERSONAL_MICROPHONE		0x203
+#define UAC_INPUT_TERMINAL_OMNI_DIR_MICROPHONE		0x204
+#define UAC_INPUT_TERMINAL_MICROPHONE_ARRAY		0x205
+#define UAC_INPUT_TERMINAL_PROC_MICROPHONE_ARRAY	0x206
 
 /* 4.3.2.2 Output Terminal Descriptor */
-struct usb_output_terminal_descriptor {
+struct uac_output_terminal_descriptor {
 	__u8  bLength;			/* in bytes: 9 */
 	__u8  bDescriptorType;		/* CS_INTERFACE descriptor type */
 	__u8  bDescriptorSubtype;	/* OUTPUT_TERMINAL descriptor subtype */
@@ -144,23 +142,24 @@ struct usb_output_terminal_descriptor {
 	__u8  iTerminal;
 } __attribute__ ((packed));
 
-#define USB_DT_AC_OUTPUT_TERMINAL_SIZE				9
+#define UAC_DT_OUTPUT_TERMINAL_SIZE			9
 
-#define USB_AC_OUTPUT_TERMINAL_UNDEFINED			0x300
-#define USB_AC_OUTPUT_TERMINAL_SPEAKER				0x301
-#define USB_AC_OUTPUT_TERMINAL_HEADPHONES			0x302
-#define USB_AC_OUTPUT_TERMINAL_HEAD_MOUNTED_DISPLAY_AUDIO	0x303
-#define USB_AC_OUTPUT_TERMINAL_DESKTOP_SPEAKER			0x304
-#define USB_AC_OUTPUT_TERMINAL_ROOM_SPEAKER			0x305
-#define USB_AC_OUTPUT_TERMINAL_COMMUNICATION_SPEAKER		0x306
-#define USB_AC_OUTPUT_TERMINAL_LOW_FREQ_EFFECTS_SPEAKER		0x307
+/* Terminals - 2.3 Output Terminal Types */
+#define UAC_OUTPUT_TERMINAL_UNDEFINED			0x300
+#define UAC_OUTPUT_TERMINAL_SPEAKER			0x301
+#define UAC_OUTPUT_TERMINAL_HEADPHONES			0x302
+#define UAC_OUTPUT_TERMINAL_HEAD_MOUNTED_DISPLAY_AUDIO	0x303
+#define UAC_OUTPUT_TERMINAL_DESKTOP_SPEAKER		0x304
+#define UAC_OUTPUT_TERMINAL_ROOM_SPEAKER		0x305
+#define UAC_OUTPUT_TERMINAL_COMMUNICATION_SPEAKER	0x306
+#define UAC_OUTPUT_TERMINAL_LOW_FREQ_EFFECTS_SPEAKER	0x307
 
 /* Set bControlSize = 2 as default setting */
-#define USB_DT_AC_FEATURE_UNIT_SIZE(ch)		(7 + ((ch) + 1) * 2)
+#define UAC_DT_FEATURE_UNIT_SIZE(ch)		(7 + ((ch) + 1) * 2)
 
 /* As above, but more useful for defining your own descriptors: */
-#define DECLARE_USB_AC_FEATURE_UNIT_DESCRIPTOR(ch) 		\
-struct usb_ac_feature_unit_descriptor_##ch {			\
+#define DECLARE_UAC_FEATURE_UNIT_DESCRIPTOR(ch) 		\
+struct uac_feature_unit_descriptor_##ch {			\
 	__u8  bLength;						\
 	__u8  bDescriptorType;					\
 	__u8  bDescriptorSubtype;				\
@@ -172,7 +171,7 @@ struct usb_ac_feature_unit_descriptor_##ch {			\
 } __attribute__ ((packed))
 
 /* 4.5.2 Class-Specific AS Interface Descriptor */
-struct usb_as_header_descriptor {
+struct uac_as_header_descriptor {
 	__u8  bLength;			/* in bytes: 7 */
 	__u8  bDescriptorType;		/* USB_DT_CS_INTERFACE */
 	__u8  bDescriptorSubtype;	/* AS_GENERAL */
@@ -181,16 +180,17 @@ struct usb_as_header_descriptor {
 	__le16 wFormatTag;		/* The Audio Data Format */
 } __attribute__ ((packed));
 
-#define USB_DT_AS_HEADER_SIZE		7
+#define UAC_DT_AS_HEADER_SIZE		7
 
-#define USB_AS_AUDIO_FORMAT_TYPE_I_UNDEFINED	0x0
-#define USB_AS_AUDIO_FORMAT_TYPE_I_PCM		0x1
-#define USB_AS_AUDIO_FORMAT_TYPE_I_PCM8		0x2
-#define USB_AS_AUDIO_FORMAT_TYPE_I_IEEE_FLOAT	0x3
-#define USB_AS_AUDIO_FORMAT_TYPE_I_ALAW		0x4
-#define USB_AS_AUDIO_FORMAT_TYPE_I_MULAW	0x5
+/* Formats - A.1.1 Audio Data Format Type I Codes */
+#define UAC_FORMAT_TYPE_I_UNDEFINED	0x0
+#define UAC_FORMAT_TYPE_I_PCM		0x1
+#define UAC_FORMAT_TYPE_I_PCM8		0x2
+#define UAC_FORMAT_TYPE_I_IEEE_FLOAT	0x3
+#define UAC_FORMAT_TYPE_I_ALAW		0x4
+#define UAC_FORMAT_TYPE_I_MULAW		0x5
 
-struct usb_as_format_type_i_continuous_descriptor {
+struct uac_format_type_i_continuous_descriptor {
 	__u8  bLength;			/* in bytes: 8 + (ns * 3) */
 	__u8  bDescriptorType;		/* USB_DT_CS_INTERFACE */
 	__u8  bDescriptorSubtype;	/* FORMAT_TYPE */
@@ -203,9 +203,9 @@ struct usb_as_format_type_i_continuous_descriptor {
 	__u8  tUpperSamFreq[3];
 } __attribute__ ((packed));
 
-#define USB_AS_FORMAT_TYPE_I_CONTINUOUS_DESC_SIZE	14
+#define UAC_FORMAT_TYPE_I_CONTINUOUS_DESC_SIZE	14
 
-struct usb_as_formate_type_i_discrete_descriptor {
+struct uac_format_type_i_discrete_descriptor {
 	__u8  bLength;			/* in bytes: 8 + (ns * 3) */
 	__u8  bDescriptorType;		/* USB_DT_CS_INTERFACE */
 	__u8  bDescriptorSubtype;	/* FORMAT_TYPE */
@@ -217,8 +217,8 @@ struct usb_as_formate_type_i_discrete_descriptor {
 	__u8  tSamFreq[][3];
 } __attribute__ ((packed));
 
-#define DECLARE_USB_AS_FORMAT_TYPE_I_DISCRETE_DESC(n) 		\
-struct usb_as_formate_type_i_discrete_descriptor_##n {		\
+#define DECLARE_UAC_FORMAT_TYPE_I_DISCRETE_DESC(n) 		\
+struct uac_format_type_i_discrete_descriptor_##n {		\
 	__u8  bLength;						\
 	__u8  bDescriptorType;					\
 	__u8  bDescriptorSubtype;				\
@@ -230,14 +230,15 @@ struct usb_as_formate_type_i_discrete_descriptor_##n {		\
 	__u8  tSamFreq[n][3];					\
 } __attribute__ ((packed))
 
-#define USB_AS_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(n)	(8 + (n * 3))
+#define UAC_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(n)	(8 + (n * 3))
 
-#define USB_AS_FORMAT_TYPE_UNDEFINED	0x0
-#define USB_AS_FORMAT_TYPE_I		0x1
-#define USB_AS_FORMAT_TYPE_II		0x2
-#define USB_AS_FORMAT_TYPE_III		0x3
+/* Formats - A.2 Format Type Codes */
+#define UAC_FORMAT_TYPE_UNDEFINED	0x0
+#define UAC_FORMAT_TYPE_I		0x1
+#define UAC_FORMAT_TYPE_II		0x2
+#define UAC_FORMAT_TYPE_III		0x3
 
-struct usb_as_iso_endpoint_descriptor {
+struct uac_iso_endpoint_descriptor {
 	__u8  bLength;			/* in bytes: 7 */
 	__u8  bDescriptorType;		/* USB_DT_CS_ENDPOINT */
 	__u8  bDescriptorSubtype;	/* EP_GENERAL */
@@ -245,30 +246,35 @@ struct usb_as_iso_endpoint_descriptor {
 	__u8  bLockDelayUnits;
 	__le16 wLockDelay;
 };
-#define USB_AS_ISO_ENDPOINT_DESC_SIZE	7
-
-#define FU_CONTROL_UNDEFINED		0x00
-#define MUTE_CONTROL			0x01
-#define VOLUME_CONTROL			0x02
-#define BASS_CONTROL			0x03
-#define MID_CONTROL			0x04
-#define TREBLE_CONTROL			0x05
-#define GRAPHIC_EQUALIZER_CONTROL	0x06
-#define AUTOMATIC_GAIN_CONTROL		0x07
-#define DELAY_CONTROL			0x08
-#define BASS_BOOST_CONTROL		0x09
-#define LOUDNESS_CONTROL		0x0a
-
-#define FU_MUTE		(1 << (MUTE_CONTROL - 1))
-#define FU_VOLUME	(1 << (VOLUME_CONTROL - 1))
-#define FU_BASS		(1 << (BASS_CONTROL - 1))
-#define FU_MID		(1 << (MID_CONTROL - 1))
-#define FU_TREBLE	(1 << (TREBLE_CONTROL - 1))
-#define FU_GRAPHIC_EQ	(1 << (GRAPHIC_EQUALIZER_CONTROL - 1))
-#define FU_AUTO_GAIN	(1 << (AUTOMATIC_GAIN_CONTROL - 1))
-#define FU_DELAY	(1 << (DELAY_CONTROL - 1))
-#define FU_BASS_BOOST	(1 << (BASS_BOOST_CONTROL - 1))
-#define FU_LOUDNESS	(1 << (LOUDNESS_CONTROL - 1))
+#define UAC_ISO_ENDPOINT_DESC_SIZE	7
+
+#define UAC_EP_CS_ATTR_SAMPLE_RATE	0x01
+#define UAC_EP_CS_ATTR_PITCH_CONTROL	0x02
+#define UAC_EP_CS_ATTR_FILL_MAX		0x80
+
+/* A.10.2 Feature Unit Control Selectors */
+#define UAC_FU_CONTROL_UNDEFINED	0x00
+#define UAC_MUTE_CONTROL		0x01
+#define UAC_VOLUME_CONTROL		0x02
+#define UAC_BASS_CONTROL		0x03
+#define UAC_MID_CONTROL			0x04
+#define UAC_TREBLE_CONTROL		0x05
+#define UAC_GRAPHIC_EQUALIZER_CONTROL	0x06
+#define UAC_AUTOMATIC_GAIN_CONTROL	0x07
+#define UAC_DELAY_CONTROL		0x08
+#define UAC_BASS_BOOST_CONTROL		0x09
+#define UAC_LOUDNESS_CONTROL		0x0a
+
+#define UAC_FU_MUTE		(1 << (UAC_MUTE_CONTROL - 1))
+#define UAC_FU_VOLUME		(1 << (UAC_VOLUME_CONTROL - 1))
+#define UAC_FU_BASS		(1 << (UAC_BASS_CONTROL - 1))
+#define UAC_FU_MID		(1 << (UAC_MID_CONTROL - 1))
+#define UAC_FU_TREBLE		(1 << (UAC_TREBLE_CONTROL - 1))
+#define UAC_FU_GRAPHIC_EQ	(1 << (UAC_GRAPHIC_EQUALIZER_CONTROL - 1))
+#define UAC_FU_AUTO_GAIN	(1 << (UAC_AUTOMATIC_GAIN_CONTROL - 1))
+#define UAC_FU_DELAY		(1 << (UAC_DELAY_CONTROL - 1))
+#define UAC_FU_BASS_BOOST	(1 << (UAC_BASS_BOOST_CONTROL - 1))
+#define UAC_FU_LOUDNESS		(1 << (UAC_LOUDNESS_CONTROL - 1))
 
 struct usb_audio_control {
 	struct list_head list;
-- 
cgit v1.2.3-71-gd317


From b95cd7ec3e93bae199e820bd65b21b23e4538acc Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@skynet.be>
Date: Sun, 21 Jun 2009 23:21:55 +0200
Subject: USB audio gadget: Un-inline generic_[gs]et_cmd

Those functions are used only used to fill the set/get members of
usb_audio_control. It doesn't make much sense to inline them.

Signed-off-by: Laurent Pinchart <laurent.pinchart@skynet.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/f_audio.c | 15 +++++++++++++++
 include/linux/usb/audio.h    | 12 ------------
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c
index 7b05b3c8c0b1..98e9bb977291 100644
--- a/drivers/usb/gadget/f_audio.c
+++ b/drivers/usb/gadget/f_audio.c
@@ -28,6 +28,9 @@ static int audio_buf_size = 48000;
 module_param(audio_buf_size, int, S_IRUGO);
 MODULE_PARM_DESC(audio_buf_size, "Audio buffer size");
 
+static int generic_set_cmd(struct usb_audio_control *con, u8 cmd, int value);
+static int generic_get_cmd(struct usb_audio_control *con, u8 cmd);
+
 /*
  * DESCRIPTORS ... most are static, but strings and full
  * configuration descriptors are built on demand.
@@ -632,6 +635,18 @@ f_audio_unbind(struct usb_configuration *c, struct usb_function *f)
 
 /*-------------------------------------------------------------------------*/
 
+static int generic_set_cmd(struct usb_audio_control *con, u8 cmd, int value)
+{
+	con->data[cmd] = value;
+
+	return 0;
+}
+
+static int generic_get_cmd(struct usb_audio_control *con, u8 cmd)
+{
+	return con->data[cmd];
+}
+
 /* Todo: add more control selecotor dynamically */
 int __init control_selector_init(struct f_audio *audio)
 {
diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h
index c3edfcb72c34..7b33c493917f 100644
--- a/include/linux/usb/audio.h
+++ b/include/linux/usb/audio.h
@@ -285,18 +285,6 @@ struct usb_audio_control {
 	int (*get)(struct usb_audio_control *con, u8 cmd);
 };
 
-static inline int generic_set_cmd(struct usb_audio_control *con, u8 cmd, int value)
-{
-	con->data[cmd] = value;
-
-	return 0;
-}
-
-static inline int generic_get_cmd(struct usb_audio_control *con, u8 cmd)
-{
-	return con->data[cmd];
-}
-
 struct usb_audio_control_selector {
 	struct list_head list;
 	struct list_head control;
-- 
cgit v1.2.3-71-gd317


From 7cbe5dca399a50ce8aa74314b1d276e2fb904e1b Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 29 Jun 2009 10:56:54 -0400
Subject: USB: add API for userspace drivers to "claim" ports

This patch (as1258) implements a feature that users have been asking
for: It gives programs the ability to "claim" a port on a hub, via a
new usbfs ioctl.  A device plugged into a "claimed" port will not be
touched by the kernel beyond the immediate necessities of
initialization and enumeration.

In particular, when a device is plugged into a "claimed" port, the
kernel will not select and install a configuration.  And when a config
is installed by usbfs or sysfs, the kernel will not probe any drivers
for any of the interfaces.  (However the kernel will fetch various
string descriptors during enumeration.  One could argue that this
isn't really necessary, but the strings are exported in sysfs.)

The patch does not guarantee exclusive access to these devices; it is
still possible for more than one program to open the device file
concurrently.  Programs are responsible for coordinating access among
themselves.

A demonstration program showing how to use the new interface can be
found in an attachment to

	http://marc.info/?l=linux-usb&m=124345857431452&w=2

The patch also makes a small simplification to the hub driver,
replacing a bunch of more-or-less useless variants of "out of memory"
with a single message.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/devio.c     | 35 ++++++++++++++++++
 drivers/usb/core/driver.c    |  3 ++
 drivers/usb/core/generic.c   |  4 +-
 drivers/usb/core/hub.c       | 88 +++++++++++++++++++++++++++++++++++++++++---
 drivers/usb/core/usb.h       |  7 ++++
 include/linux/usbdevice_fs.h |  2 +
 6 files changed, 133 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 4247eccf858c..165de5d59005 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -52,6 +52,7 @@
 
 #include "hcd.h"	/* for usbcore internals */
 #include "usb.h"
+#include "hub.h"
 
 #define USB_MAXBUS			64
 #define USB_DEVICE_MAX			USB_MAXBUS * 128
@@ -655,6 +656,7 @@ static int usbdev_release(struct inode *inode, struct file *file)
 	struct async *as;
 
 	usb_lock_device(dev);
+	usb_hub_release_all_ports(dev, ps);
 
 	/* Protect against simultaneous open */
 	mutex_lock(&usbfs_mutex);
@@ -1548,6 +1550,29 @@ static int proc_ioctl_compat(struct dev_state *ps, compat_uptr_t arg)
 }
 #endif
 
+static int proc_claim_port(struct dev_state *ps, void __user *arg)
+{
+	unsigned portnum;
+	int rc;
+
+	if (get_user(portnum, (unsigned __user *) arg))
+		return -EFAULT;
+	rc = usb_hub_claim_port(ps->dev, portnum, ps);
+	if (rc == 0)
+		snoop(&ps->dev->dev, "port %d claimed by process %d: %s\n",
+			portnum, task_pid_nr(current), current->comm);
+	return rc;
+}
+
+static int proc_release_port(struct dev_state *ps, void __user *arg)
+{
+	unsigned portnum;
+
+	if (get_user(portnum, (unsigned __user *) arg))
+		return -EFAULT;
+	return usb_hub_release_port(ps->dev, portnum, ps);
+}
+
 /*
  * NOTE:  All requests here that have interface numbers as parameters
  * are assuming that somehow the configuration has been prevented from
@@ -1689,6 +1714,16 @@ static int usbdev_ioctl(struct inode *inode, struct file *file,
 		snoop(&dev->dev, "%s: IOCTL\n", __func__);
 		ret = proc_ioctl_default(ps, p);
 		break;
+
+	case USBDEVFS_CLAIM_PORT:
+		snoop(&dev->dev, "%s: CLAIM_PORT\n", __func__);
+		ret = proc_claim_port(ps, p);
+		break;
+
+	case USBDEVFS_RELEASE_PORT:
+		snoop(&dev->dev, "%s: RELEASE_PORT\n", __func__);
+		ret = proc_release_port(ps, p);
+		break;
 	}
 	usb_unlock_device(dev);
 	if (ret >= 0)
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 69e5773abfce..1bad4e5a6abb 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -207,6 +207,9 @@ static int usb_probe_interface(struct device *dev)
 
 	intf->needs_binding = 0;
 
+	if (usb_device_is_owned(udev))
+		return -ENODEV;
+
 	if (udev->authorized == 0) {
 		dev_err(&intf->dev, "Device is not authorized for usage\n");
 		return -ENODEV;
diff --git a/drivers/usb/core/generic.c b/drivers/usb/core/generic.c
index 30ecac3af15a..05e6d313961e 100644
--- a/drivers/usb/core/generic.c
+++ b/drivers/usb/core/generic.c
@@ -158,7 +158,9 @@ static int generic_probe(struct usb_device *udev)
 	/* Choose and set the configuration.  This registers the interfaces
 	 * with the driver core and lets interface drivers bind to them.
 	 */
-	if (udev->authorized == 0)
+	if (usb_device_is_owned(udev))
+		;		/* Don't configure if the device is owned */
+	else if (udev->authorized == 0)
 		dev_err(&udev->dev, "Device is not authorized for usage\n");
 	else {
 		c = usb_choose_configuration(udev);
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 71f86c60d83c..cb54420ed583 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -78,6 +78,7 @@ struct usb_hub {
 	u8			indicator[USB_MAXCHILDREN];
 	struct delayed_work	leds;
 	struct delayed_work	init_work;
+	void			**port_owners;
 };
 
 
@@ -860,19 +861,17 @@ static int hub_configure(struct usb_hub *hub,
 	u16 wHubCharacteristics;
 	unsigned int pipe;
 	int maxp, ret;
-	char *message;
+	char *message = "out of memory";
 
 	hub->buffer = usb_buffer_alloc(hdev, sizeof(*hub->buffer), GFP_KERNEL,
 			&hub->buffer_dma);
 	if (!hub->buffer) {
-		message = "can't allocate hub irq buffer";
 		ret = -ENOMEM;
 		goto fail;
 	}
 
 	hub->status = kmalloc(sizeof(*hub->status), GFP_KERNEL);
 	if (!hub->status) {
-		message = "can't kmalloc hub status buffer";
 		ret = -ENOMEM;
 		goto fail;
 	}
@@ -880,7 +879,6 @@ static int hub_configure(struct usb_hub *hub,
 
 	hub->descriptor = kmalloc(sizeof(*hub->descriptor), GFP_KERNEL);
 	if (!hub->descriptor) {
-		message = "can't kmalloc hub descriptor";
 		ret = -ENOMEM;
 		goto fail;
 	}
@@ -904,6 +902,12 @@ static int hub_configure(struct usb_hub *hub,
 	dev_info (hub_dev, "%d port%s detected\n", hdev->maxchild,
 		(hdev->maxchild == 1) ? "" : "s");
 
+	hub->port_owners = kzalloc(hdev->maxchild * sizeof(void *), GFP_KERNEL);
+	if (!hub->port_owners) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
 	wHubCharacteristics = le16_to_cpu(hub->descriptor->wHubCharacteristics);
 
 	if (wHubCharacteristics & HUB_CHAR_COMPOUND) {
@@ -1082,7 +1086,6 @@ static int hub_configure(struct usb_hub *hub,
 
 	hub->urb = usb_alloc_urb(0, GFP_KERNEL);
 	if (!hub->urb) {
-		message = "couldn't allocate interrupt urb";
 		ret = -ENOMEM;
 		goto fail;
 	}
@@ -1131,11 +1134,13 @@ static void hub_disconnect(struct usb_interface *intf)
 	hub_quiesce(hub, HUB_DISCONNECT);
 
 	usb_set_intfdata (intf, NULL);
+	hub->hdev->maxchild = 0;
 
 	if (hub->hdev->speed == USB_SPEED_HIGH)
 		highspeed_hubs--;
 
 	usb_free_urb(hub->urb);
+	kfree(hub->port_owners);
 	kfree(hub->descriptor);
 	kfree(hub->status);
 	usb_buffer_free(hub->hdev, sizeof(*hub->buffer), hub->buffer,
@@ -1250,6 +1255,79 @@ hub_ioctl(struct usb_interface *intf, unsigned int code, void *user_data)
 	}
 }
 
+/*
+ * Allow user programs to claim ports on a hub.  When a device is attached
+ * to one of these "claimed" ports, the program will "own" the device.
+ */
+static int find_port_owner(struct usb_device *hdev, unsigned port1,
+		void ***ppowner)
+{
+	if (hdev->state == USB_STATE_NOTATTACHED)
+		return -ENODEV;
+	if (port1 == 0 || port1 > hdev->maxchild)
+		return -EINVAL;
+
+	/* This assumes that devices not managed by the hub driver
+	 * will always have maxchild equal to 0.
+	 */
+	*ppowner = &(hdev_to_hub(hdev)->port_owners[port1 - 1]);
+	return 0;
+}
+
+/* In the following three functions, the caller must hold hdev's lock */
+int usb_hub_claim_port(struct usb_device *hdev, unsigned port1, void *owner)
+{
+	int rc;
+	void **powner;
+
+	rc = find_port_owner(hdev, port1, &powner);
+	if (rc)
+		return rc;
+	if (*powner)
+		return -EBUSY;
+	*powner = owner;
+	return rc;
+}
+
+int usb_hub_release_port(struct usb_device *hdev, unsigned port1, void *owner)
+{
+	int rc;
+	void **powner;
+
+	rc = find_port_owner(hdev, port1, &powner);
+	if (rc)
+		return rc;
+	if (*powner != owner)
+		return -ENOENT;
+	*powner = NULL;
+	return rc;
+}
+
+void usb_hub_release_all_ports(struct usb_device *hdev, void *owner)
+{
+	int n;
+	void **powner;
+
+	n = find_port_owner(hdev, 1, &powner);
+	if (n == 0) {
+		for (; n < hdev->maxchild; (++n, ++powner)) {
+			if (*powner == owner)
+				*powner = NULL;
+		}
+	}
+}
+
+/* The caller must hold udev's lock */
+bool usb_device_is_owned(struct usb_device *udev)
+{
+	struct usb_hub *hub;
+
+	if (udev->state == USB_STATE_NOTATTACHED || !udev->parent)
+		return false;
+	hub = hdev_to_hub(udev->parent);
+	return !!hub->port_owners[udev->portnum - 1];
+}
+
 
 static void recursively_mark_NOTATTACHED(struct usb_device *udev)
 {
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index c0e0ae2bb8e7..9a8b15e6377a 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -37,6 +37,13 @@ extern int usb_match_device(struct usb_device *dev,
 extern void usb_forced_unbind_intf(struct usb_interface *intf);
 extern void usb_rebind_intf(struct usb_interface *intf);
 
+extern int usb_hub_claim_port(struct usb_device *hdev, unsigned port,
+		void *owner);
+extern int usb_hub_release_port(struct usb_device *hdev, unsigned port,
+		void *owner);
+extern void usb_hub_release_all_ports(struct usb_device *hdev, void *owner);
+extern bool usb_device_is_owned(struct usb_device *udev);
+
 extern int  usb_hub_init(void);
 extern void usb_hub_cleanup(void);
 extern int usb_major_init(void);
diff --git a/include/linux/usbdevice_fs.h b/include/linux/usbdevice_fs.h
index 0044d9b4cb85..00ceebeb9e5c 100644
--- a/include/linux/usbdevice_fs.h
+++ b/include/linux/usbdevice_fs.h
@@ -175,4 +175,6 @@ struct usbdevfs_ioctl32 {
 #define USBDEVFS_CLEAR_HALT        _IOR('U', 21, unsigned int)
 #define USBDEVFS_DISCONNECT        _IO('U', 22)
 #define USBDEVFS_CONNECT           _IO('U', 23)
+#define USBDEVFS_CLAIM_PORT        _IOR('U', 24, unsigned int)
+#define USBDEVFS_RELEASE_PORT      _IOR('U', 25, unsigned int)
 #endif /* _LINUX_USBDEVICE_FS_H */
-- 
cgit v1.2.3-71-gd317


From ccf5b801cef4f9e2d708d3b87e91e2bc6abd5206 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 29 Jun 2009 11:00:01 -0400
Subject: USB: make intf.pm_usage an atomic_t

This patch (as1260) changes the pm_usage_cnt field in struct
usb_interface from an int to an atomic_t.  This is so that drivers can
invoke the usb_autopm_get_interface_async() and
usb_autopm_put_interface_async() routines without locking and without
fear of corrupting the pm_usage_cnt value.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/driver.c | 38 ++++++++++++++++++++++----------------
 drivers/usb/core/hub.c    |  5 +++--
 include/linux/usb.h       |  6 +++---
 3 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 1bad4e5a6abb..1c976c141f33 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -235,7 +235,7 @@ static int usb_probe_interface(struct device *dev)
 		/* The interface should always appear to be in use
 		 * unless the driver suports autosuspend.
 		 */
-		intf->pm_usage_cnt = !(driver->supports_autosuspend);
+		atomic_set(&intf->pm_usage_cnt, !driver->supports_autosuspend);
 
 		/* Carry out a deferred switch to altsetting 0 */
 		if (intf->needs_altsetting0) {
@@ -347,7 +347,7 @@ int usb_driver_claim_interface(struct usb_driver *driver,
 	usb_pm_lock(udev);
 	iface->condition = USB_INTERFACE_BOUND;
 	mark_active(iface);
-	iface->pm_usage_cnt = !(driver->supports_autosuspend);
+	atomic_set(&iface->pm_usage_cnt, !driver->supports_autosuspend);
 	usb_pm_unlock(udev);
 
 	/* if interface was already added, bind now; else let
@@ -1068,7 +1068,7 @@ static int autosuspend_check(struct usb_device *udev, int reschedule)
 			intf = udev->actconfig->interface[i];
 			if (!is_active(intf))
 				continue;
-			if (intf->pm_usage_cnt > 0)
+			if (atomic_read(&intf->pm_usage_cnt) > 0)
 				return -EBUSY;
 			if (intf->needs_remote_wakeup &&
 					!udev->do_remote_wakeup) {
@@ -1464,17 +1464,19 @@ static int usb_autopm_do_interface(struct usb_interface *intf,
 		status = -ENODEV;
 	else {
 		udev->auto_pm = 1;
-		intf->pm_usage_cnt += inc_usage_cnt;
+		atomic_add(inc_usage_cnt, &intf->pm_usage_cnt);
 		udev->last_busy = jiffies;
-		if (inc_usage_cnt >= 0 && intf->pm_usage_cnt > 0) {
+		if (inc_usage_cnt >= 0 &&
+				atomic_read(&intf->pm_usage_cnt) > 0) {
 			if (udev->state == USB_STATE_SUSPENDED)
 				status = usb_resume_both(udev,
 						PMSG_AUTO_RESUME);
 			if (status != 0)
-				intf->pm_usage_cnt -= inc_usage_cnt;
+				atomic_sub(inc_usage_cnt, &intf->pm_usage_cnt);
 			else
 				udev->last_busy = jiffies;
-		} else if (inc_usage_cnt <= 0 && intf->pm_usage_cnt <= 0) {
+		} else if (inc_usage_cnt <= 0 &&
+				atomic_read(&intf->pm_usage_cnt) <= 0) {
 			status = usb_suspend_both(udev, PMSG_AUTO_SUSPEND);
 		}
 	}
@@ -1519,7 +1521,7 @@ void usb_autopm_put_interface(struct usb_interface *intf)
 
 	status = usb_autopm_do_interface(intf, -1);
 	dev_vdbg(&intf->dev, "%s: status %d cnt %d\n",
-			__func__, status, intf->pm_usage_cnt);
+			__func__, status, atomic_read(&intf->pm_usage_cnt));
 }
 EXPORT_SYMBOL_GPL(usb_autopm_put_interface);
 
@@ -1547,10 +1549,10 @@ void usb_autopm_put_interface_async(struct usb_interface *intf)
 		status = -ENODEV;
 	} else {
 		udev->last_busy = jiffies;
-		--intf->pm_usage_cnt;
+		atomic_dec(&intf->pm_usage_cnt);
 		if (udev->autosuspend_disabled || udev->autosuspend_delay < 0)
 			status = -EPERM;
-		else if (intf->pm_usage_cnt <= 0 &&
+		else if (atomic_read(&intf->pm_usage_cnt) <= 0 &&
 				!timer_pending(&udev->autosuspend.timer)) {
 			queue_delayed_work(ksuspend_usb_wq, &udev->autosuspend,
 					round_jiffies_up_relative(
@@ -1558,7 +1560,7 @@ void usb_autopm_put_interface_async(struct usb_interface *intf)
 		}
 	}
 	dev_vdbg(&intf->dev, "%s: status %d cnt %d\n",
-			__func__, status, intf->pm_usage_cnt);
+			__func__, status, atomic_read(&intf->pm_usage_cnt));
 }
 EXPORT_SYMBOL_GPL(usb_autopm_put_interface_async);
 
@@ -1602,7 +1604,7 @@ int usb_autopm_get_interface(struct usb_interface *intf)
 
 	status = usb_autopm_do_interface(intf, 1);
 	dev_vdbg(&intf->dev, "%s: status %d cnt %d\n",
-			__func__, status, intf->pm_usage_cnt);
+			__func__, status, atomic_read(&intf->pm_usage_cnt));
 	return status;
 }
 EXPORT_SYMBOL_GPL(usb_autopm_get_interface);
@@ -1630,10 +1632,14 @@ int usb_autopm_get_interface_async(struct usb_interface *intf)
 		status = -ENODEV;
 	else if (udev->autoresume_disabled)
 		status = -EPERM;
-	else if (++intf->pm_usage_cnt > 0 && udev->state == USB_STATE_SUSPENDED)
-		queue_work(ksuspend_usb_wq, &udev->autoresume);
+	else {
+		atomic_inc(&intf->pm_usage_cnt);
+		if (atomic_read(&intf->pm_usage_cnt) > 0 &&
+				udev->state == USB_STATE_SUSPENDED)
+			queue_work(ksuspend_usb_wq, &udev->autoresume);
+	}
 	dev_vdbg(&intf->dev, "%s: status %d cnt %d\n",
-			__func__, status, intf->pm_usage_cnt);
+			__func__, status, atomic_read(&intf->pm_usage_cnt));
 	return status;
 }
 EXPORT_SYMBOL_GPL(usb_autopm_get_interface_async);
@@ -1655,7 +1661,7 @@ int usb_autopm_set_interface(struct usb_interface *intf)
 
 	status = usb_autopm_do_interface(intf, 0);
 	dev_vdbg(&intf->dev, "%s: status %d cnt %d\n",
-			__func__, status, intf->pm_usage_cnt);
+			__func__, status, atomic_read(&intf->pm_usage_cnt));
 	return status;
 }
 EXPORT_SYMBOL_GPL(usb_autopm_set_interface);
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index cb54420ed583..69e3a966a4b7 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -373,7 +373,7 @@ static void kick_khubd(struct usb_hub *hub)
 	unsigned long	flags;
 
 	/* Suppress autosuspend until khubd runs */
-	to_usb_interface(hub->intfdev)->pm_usage_cnt = 1;
+	atomic_set(&to_usb_interface(hub->intfdev)->pm_usage_cnt, 1);
 
 	spin_lock_irqsave(&hub_event_lock, flags);
 	if (!hub->disconnected && list_empty(&hub->event_list)) {
@@ -678,7 +678,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
 					msecs_to_jiffies(delay));
 
 			/* Suppress autosuspend until init is done */
-			to_usb_interface(hub->intfdev)->pm_usage_cnt = 1;
+			atomic_set(&to_usb_interface(hub->intfdev)->
+					pm_usage_cnt, 1);
 			return;		/* Continues at init2: below */
 		} else {
 			hub_power_on(hub, true);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 3b45a0d27b80..a34fa89f1474 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -195,7 +195,7 @@ struct usb_interface {
 
 	struct device dev;		/* interface specific device info */
 	struct device *usb_dev;
-	int pm_usage_cnt;		/* usage counter for autosuspend */
+	atomic_t pm_usage_cnt;		/* usage counter for autosuspend */
 	struct work_struct reset_ws;	/* for resets in atomic context */
 };
 #define	to_usb_interface(d) container_of(d, struct usb_interface, dev)
@@ -551,13 +551,13 @@ extern void usb_autopm_put_interface_async(struct usb_interface *intf);
 
 static inline void usb_autopm_enable(struct usb_interface *intf)
 {
-	intf->pm_usage_cnt = 0;
+	atomic_set(&intf->pm_usage_cnt, 0);
 	usb_autopm_set_interface(intf);
 }
 
 static inline void usb_autopm_disable(struct usb_interface *intf)
 {
-	intf->pm_usage_cnt = 1;
+	atomic_set(&intf->pm_usage_cnt, 1);
 	usb_autopm_set_interface(intf);
 }
 
-- 
cgit v1.2.3-71-gd317


From 331ac6b288d9f3689514ced1878041fb0df7e13c Mon Sep 17 00:00:00 2001
From: Alek Du <alek.du@intel.com>
Date: Mon, 13 Jul 2009 12:41:20 +0800
Subject: USB: EHCI: Add Intel Moorestown EHCI controller HOSTPCx extensions
 and support phy low power mode

The Intel Moorestown EHCI controller supports non-standard HOSTPCx register
extension. This register controls the LPM behaviour and controls the behaviour
of each USB port.

Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com>
Signed-off-by: Alek Du <alek.du@intel.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/host/ehci-hcd.c  |  6 +++++
 drivers/usb/host/ehci-hub.c  | 62 ++++++++++++++++++++++++++++++++++++++++----
 drivers/usb/host/ehci.h      |  3 ++-
 include/linux/usb/ehci_def.h | 13 ++++++++++
 4 files changed, 78 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 7bee1638dfd7..5fbc67c5a913 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -249,6 +249,12 @@ static int ehci_reset (struct ehci_hcd *ehci)
 	retval = handshake (ehci, &ehci->regs->command,
 			    CMD_RESET, 0, 250 * 1000);
 
+	if (ehci->has_hostpc) {
+		ehci_writel(ehci, USBMODE_EX_HC | USBMODE_EX_VBPS,
+			(u32 __iomem *)(((u8 *)ehci->regs) + USBMODE_EX));
+		ehci_writel(ehci, TXFIFO_DEFAULT,
+			(u32 __iomem *)(((u8 *)ehci->regs) + TXFILLTUNING));
+	}
 	if (retval)
 		return retval;
 
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index f46ad27c9a90..6fef1ee7d101 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -111,6 +111,7 @@ static int ehci_bus_suspend (struct usb_hcd *hcd)
 	struct ehci_hcd		*ehci = hcd_to_ehci (hcd);
 	int			port;
 	int			mask;
+	u32 __iomem		*hostpc_reg = NULL;
 
 	ehci_dbg(ehci, "suspend root hub\n");
 
@@ -142,6 +143,9 @@ static int ehci_bus_suspend (struct usb_hcd *hcd)
 		u32		t1 = ehci_readl(ehci, reg) & ~PORT_RWC_BITS;
 		u32		t2 = t1;
 
+		if (ehci->has_hostpc)
+			hostpc_reg = (u32 __iomem *)((u8 *)ehci->regs
+				+ HOSTPC0 + 4 * (port & 0xff));
 		/* keep track of which ports we suspend */
 		if (t1 & PORT_OWNER)
 			set_bit(port, &ehci->owned_ports);
@@ -151,15 +155,37 @@ static int ehci_bus_suspend (struct usb_hcd *hcd)
 		}
 
 		/* enable remote wakeup on all ports */
-		if (hcd->self.root_hub->do_remote_wakeup)
-			t2 |= PORT_WAKE_BITS;
-		else
+		if (hcd->self.root_hub->do_remote_wakeup) {
+			/* only enable appropriate wake bits, otherwise the
+			 * hardware can not go phy low power mode. If a race
+			 * condition happens here(connection change during bits
+			 * set), the port change detection will finally fix it.
+			 */
+			if (t1 & PORT_CONNECT) {
+				t2 |= PORT_WKOC_E | PORT_WKDISC_E;
+				t2 &= ~PORT_WKCONN_E;
+			} else {
+				t2 |= PORT_WKOC_E | PORT_WKCONN_E;
+				t2 &= ~PORT_WKDISC_E;
+			}
+		} else
 			t2 &= ~PORT_WAKE_BITS;
 
 		if (t1 != t2) {
 			ehci_vdbg (ehci, "port %d, %08x -> %08x\n",
 				port + 1, t1, t2);
 			ehci_writel(ehci, t2, reg);
+			if (hostpc_reg) {
+				u32	t3;
+
+				msleep(5);/* 5ms for HCD enter low pwr mode */
+				t3 = ehci_readl(ehci, hostpc_reg);
+				ehci_writel(ehci, t3 | HOSTPC_PHCD, hostpc_reg);
+				t3 = ehci_readl(ehci, hostpc_reg);
+				ehci_dbg(ehci, "Port%d phy low pwr mode %s\n",
+					port, (t3 & HOSTPC_PHCD) ?
+					"succeeded" : "failed");
+			}
 		}
 	}
 
@@ -563,7 +589,8 @@ static int ehci_hub_control (
 	int		ports = HCS_N_PORTS (ehci->hcs_params);
 	u32 __iomem	*status_reg = &ehci->regs->port_status[
 				(wIndex & 0xff) - 1];
-	u32		temp, status;
+	u32 __iomem	*hostpc_reg = NULL;
+	u32		temp, temp1, status;
 	unsigned long	flags;
 	int		retval = 0;
 	unsigned	selector;
@@ -575,6 +602,9 @@ static int ehci_hub_control (
 	 * power, "this is the one", etc.  EHCI spec supports this.
 	 */
 
+	if (ehci->has_hostpc)
+		hostpc_reg = (u32 __iomem *)((u8 *)ehci->regs
+				+ HOSTPC0 + 4 * ((wIndex & 0xff) - 1));
 	spin_lock_irqsave (&ehci->lock, flags);
 	switch (typeReq) {
 	case ClearHubFeature:
@@ -773,7 +803,11 @@ static int ehci_hub_control (
 		if (temp & PORT_CONNECT) {
 			status |= 1 << USB_PORT_FEAT_CONNECTION;
 			// status may be from integrated TT
-			status |= ehci_port_speed(ehci, temp);
+			if (ehci->has_hostpc) {
+				temp1 = ehci_readl(ehci, hostpc_reg);
+				status |= ehci_port_speed(ehci, temp1);
+			} else
+				status |= ehci_port_speed(ehci, temp);
 		}
 		if (temp & PORT_PE)
 			status |= 1 << USB_PORT_FEAT_ENABLE;
@@ -832,6 +866,24 @@ static int ehci_hub_control (
 					|| (temp & PORT_RESET) != 0)
 				goto error;
 			ehci_writel(ehci, temp | PORT_SUSPEND, status_reg);
+			/* After above check the port must be connected.
+			 * Set appropriate bit thus could put phy into low power
+			 * mode if we have hostpc feature
+			 */
+			if (hostpc_reg) {
+				temp &= ~PORT_WKCONN_E;
+				temp |= (PORT_WKDISC_E | PORT_WKOC_E);
+				ehci_writel(ehci, temp | PORT_SUSPEND,
+							status_reg);
+				msleep(5);/* 5ms for HCD enter low pwr mode */
+				temp1 = ehci_readl(ehci, hostpc_reg);
+				ehci_writel(ehci, temp1 | HOSTPC_PHCD,
+					hostpc_reg);
+				temp1 = ehci_readl(ehci, hostpc_reg);
+				ehci_dbg(ehci, "Port%d phy low pwr mode %s\n",
+					wIndex, (temp1 & HOSTPC_PHCD) ?
+					"succeeded" : "failed");
+			}
 			set_bit(wIndex, &ehci->suspended_ports);
 			break;
 		case USB_PORT_FEAT_POWER:
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index fa12f20fbfe2..ec3dba6b8e48 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -136,6 +136,7 @@ struct ehci_hcd {			/* one per controller */
 	#define OHCI_HCCTRL_OFFSET      0x4
 	#define OHCI_HCCTRL_LEN         0x4
 	__hc32			*ohci_hcctrl_reg;
+	unsigned		has_hostpc:1;
 
 	u8			sbrn;		/* packed release number */
 
@@ -548,7 +549,7 @@ static inline unsigned int
 ehci_port_speed(struct ehci_hcd *ehci, unsigned int portsc)
 {
 	if (ehci_is_TDI(ehci)) {
-		switch ((portsc>>26)&3) {
+		switch ((portsc >> (ehci->has_hostpc ? 25 : 26)) & 3) {
 		case 0:
 			return 0;
 		case 1:
diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h
index 5b88e36c9103..4c4b701ff265 100644
--- a/include/linux/usb/ehci_def.h
+++ b/include/linux/usb/ehci_def.h
@@ -132,6 +132,19 @@ struct ehci_regs {
 #define USBMODE_CM_HC	(3<<0)		/* host controller mode */
 #define USBMODE_CM_IDLE	(0<<0)		/* idle state */
 
+/* Moorestown has some non-standard registers, partially due to the fact that
+ * its EHCI controller has both TT and LPM support. HOSTPCx are extentions to
+ * PORTSCx
+ */
+#define HOSTPC0		0x84		/* HOSTPC extension */
+#define HOSTPC_PHCD	(1<<22)		/* Phy clock disable */
+#define HOSTPC_PSPD	(3<<25)		/* Port speed detection */
+#define USBMODE_EX	0xc8		/* USB Device mode extension */
+#define USBMODE_EX_VBPS	(1<<5)		/* VBus Power Select On */
+#define USBMODE_EX_HC	(3<<0)		/* host controller mode */
+#define TXFILLTUNING	0x24		/* TX FIFO Tuning register */
+#define TXFIFO_DEFAULT	(8<<16)		/* FIFO burst threshold 8 */
+
 /* Appendix C, Debug port ... intended for use with special "debug devices"
  * that can help if there's no serial console.  (nonstandard enumeration.)
  */
-- 
cgit v1.2.3-71-gd317


From 9da69c604d87afea37b5411867bb76e3c624cc92 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Wed, 15 Jul 2009 23:22:54 -0400
Subject: USB: isp1760: allow platform devices to customize devflags

Platform device support was merged earlier, but support for boards to
customize the devflags aspect of the controller was not.  We want this on
Blackfin systems to control the bus width, but might as well expose all of
the fields while we're at it.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/host/isp1760-hcd.c |  4 ++++
 drivers/usb/host/isp1760-hcd.h |  2 ++
 drivers/usb/host/isp1760-if.c  | 21 ++++++++++++++++++++-
 include/linux/usb/isp1760.h    | 18 ++++++++++++++++++
 4 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/usb/isp1760.h

(limited to 'include/linux')

diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c
index 15438469f21a..9600a58299db 100644
--- a/drivers/usb/host/isp1760-hcd.c
+++ b/drivers/usb/host/isp1760-hcd.c
@@ -386,6 +386,10 @@ static int isp1760_hc_setup(struct usb_hcd *hcd)
 		hwmode |= HW_DACK_POL_HIGH;
 	if (priv->devflags & ISP1760_FLAG_DREQ_POL_HIGH)
 		hwmode |= HW_DREQ_POL_HIGH;
+	if (priv->devflags & ISP1760_FLAG_INTR_POL_HIGH)
+		hwmode |= HW_INTR_HIGH_ACT;
+	if (priv->devflags & ISP1760_FLAG_INTR_EDGE_TRIG)
+		hwmode |= HW_INTR_EDGE_TRIG;
 
 	/*
 	 * We have to set this first in case we're in 16-bit mode.
diff --git a/drivers/usb/host/isp1760-hcd.h b/drivers/usb/host/isp1760-hcd.h
index 462f4943cb1b..6931ef5c9650 100644
--- a/drivers/usb/host/isp1760-hcd.h
+++ b/drivers/usb/host/isp1760-hcd.h
@@ -142,6 +142,8 @@ typedef void (packet_enqueue)(struct usb_hcd *hcd, struct isp1760_qh *qh,
 #define ISP1760_FLAG_DACK_POL_HIGH	0x00000010 /* DACK active high */
 #define ISP1760_FLAG_DREQ_POL_HIGH	0x00000020 /* DREQ active high */
 #define ISP1760_FLAG_ISP1761		0x00000040 /* Chip is ISP1761 */
+#define ISP1760_FLAG_INTR_POL_HIGH	0x00000080 /* Interrupt polarity active high */
+#define ISP1760_FLAG_INTR_EDGE_TRIG	0x00000100 /* Interrupt edge triggered */
 
 /* chip memory management */
 struct memory_chunk {
diff --git a/drivers/usb/host/isp1760-if.c b/drivers/usb/host/isp1760-if.c
index d4feebfc63bd..1c9f977a5c9c 100644
--- a/drivers/usb/host/isp1760-if.c
+++ b/drivers/usb/host/isp1760-if.c
@@ -3,6 +3,7 @@
  * Currently there is support for
  * - OpenFirmware
  * - PCI
+ * - PDEV (generic platform device centralized driver model)
  *
  * (c) 2007 Sebastian Siewior <bigeasy@linutronix.de>
  *
@@ -11,6 +12,7 @@
 #include <linux/usb.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
+#include <linux/usb/isp1760.h>
 
 #include "../core/hcd.h"
 #include "isp1760-hcd.h"
@@ -308,6 +310,8 @@ static int __devinit isp1760_plat_probe(struct platform_device *pdev)
 	struct resource *mem_res;
 	struct resource *irq_res;
 	resource_size_t mem_size;
+	struct isp1760_platform_data *priv = pdev->dev.platform_data;
+	unsigned int devflags = 0;
 	unsigned long irqflags = IRQF_SHARED | IRQF_DISABLED;
 
 	mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -330,8 +334,23 @@ static int __devinit isp1760_plat_probe(struct platform_device *pdev)
 	}
 	irqflags |= irq_res->flags & IRQF_TRIGGER_MASK;
 
+	if (priv) {
+		if (priv->is_isp1761)
+			devflags |= ISP1760_FLAG_ISP1761;
+		if (priv->bus_width_16)
+			devflags |= ISP1760_FLAG_BUS_WIDTH_16;
+		if (priv->port1_otg)
+			devflags |= ISP1760_FLAG_OTG_EN;
+		if (priv->analog_oc)
+			devflags |= ISP1760_FLAG_ANALOG_OC;
+		if (priv->dack_polarity_high)
+			devflags |= ISP1760_FLAG_DACK_POL_HIGH;
+		if (priv->dreq_polarity_high)
+			devflags |= ISP1760_FLAG_DREQ_POL_HIGH;
+	}
+
 	hcd = isp1760_register(mem_res->start, mem_size, irq_res->start,
-			       irqflags, &pdev->dev, dev_name(&pdev->dev), 0);
+			       irqflags, &pdev->dev, dev_name(&pdev->dev), devflags);
 	if (IS_ERR(hcd)) {
 		pr_warning("isp1760: Failed to register the HCD device\n");
 		ret = -ENODEV;
diff --git a/include/linux/usb/isp1760.h b/include/linux/usb/isp1760.h
new file mode 100644
index 000000000000..de7de53c5531
--- /dev/null
+++ b/include/linux/usb/isp1760.h
@@ -0,0 +1,18 @@
+/*
+ * board initialization should put one of these into dev->platform_data
+ * and place the isp1760 onto platform_bus named "isp1760-hcd".
+ */
+
+#ifndef __LINUX_USB_ISP1760_H
+#define __LINUX_USB_ISP1760_H
+
+struct isp1760_platform_data {
+	unsigned is_isp1761:1;			/* Chip is ISP1761 */
+	unsigned bus_width_16:1;		/* 16/32-bit data bus width */
+	unsigned port1_otg:1;			/* Port 1 supports OTG */
+	unsigned analog_oc:1;			/* Analog overcurrent */
+	unsigned dack_polarity_high:1;		/* DACK active high */
+	unsigned dreq_polarity_high:1;		/* DREQ active high */
+};
+
+#endif /* __LINUX_USB_ISP1760_H */
-- 
cgit v1.2.3-71-gd317


From a9d43091c5be1e7a60d5abe84be4f3050236b26a Mon Sep 17 00:00:00 2001
From: Lothar Wassmann <LW@KARO-electronics.de>
Date: Thu, 16 Jul 2009 20:51:21 -0400
Subject: USB: NXP ISP1362 USB host driver

Signed-off-by: Lothar Wassmann <LW@KARO-electronics.de>
Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/Makefile           |    1 +
 drivers/usb/host/Kconfig       |   12 +
 drivers/usb/host/Makefile      |    1 +
 drivers/usb/host/isp1362-hcd.c | 2905 ++++++++++++++++++++++++++++++++++++++++
 drivers/usb/host/isp1362.h     | 1079 +++++++++++++++
 include/linux/usb/isp1362.h    |   46 +
 6 files changed, 4044 insertions(+)
 create mode 100644 drivers/usb/host/isp1362-hcd.c
 create mode 100644 drivers/usb/host/isp1362.h
 create mode 100644 include/linux/usb/isp1362.h

(limited to 'include/linux')

diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index 19cb7d5480d7..dbe4fb694ad2 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_USB_UHCI_HCD)	+= host/
 obj-$(CONFIG_USB_FHCI_HCD)	+= host/
 obj-$(CONFIG_USB_XHCI_HCD)	+= host/
 obj-$(CONFIG_USB_SL811_HCD)	+= host/
+obj-$(CONFIG_USB_ISP1362_HCD)	+= host/
 obj-$(CONFIG_USB_U132_HCD)	+= host/
 obj-$(CONFIG_USB_R8A66597_HCD)	+= host/
 obj-$(CONFIG_USB_HWA_HCD)	+= host/
diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
index 41b8d7de2e07..9b43b226817f 100644
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig
@@ -159,6 +159,18 @@ config USB_ISP1760_HCD
 	  To compile this driver as a module, choose M here: the
 	  module will be called isp1760.
 
+config USB_ISP1362_HCD
+	tristate "ISP1362 HCD support"
+	depends on USB
+	default N
+	---help---
+	  Supports the Philips ISP1362 chip as a host controller
+
+	  This driver does not support isochronous transfers.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called isp1362-hcd.
+
 config USB_OHCI_HCD
 	tristate "OHCI HCD support"
 	depends on USB && USB_ARCH_HAS_OHCI
diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile
index 289d748bb414..f58b2494c44a 100644
--- a/drivers/usb/host/Makefile
+++ b/drivers/usb/host/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_PCI)		+= pci-quirks.o
 obj-$(CONFIG_USB_EHCI_HCD)	+= ehci-hcd.o
 obj-$(CONFIG_USB_OXU210HP_HCD)	+= oxu210hp-hcd.o
 obj-$(CONFIG_USB_ISP116X_HCD)	+= isp116x-hcd.o
+obj-$(CONFIG_USB_ISP1362_HCD)	+= isp1362-hcd.o
 obj-$(CONFIG_USB_OHCI_HCD)	+= ohci-hcd.o
 obj-$(CONFIG_USB_UHCI_HCD)	+= uhci-hcd.o
 obj-$(CONFIG_USB_FHCI_HCD)	+= fhci.o
diff --git a/drivers/usb/host/isp1362-hcd.c b/drivers/usb/host/isp1362-hcd.c
new file mode 100644
index 000000000000..5819e10a146c
--- /dev/null
+++ b/drivers/usb/host/isp1362-hcd.c
@@ -0,0 +1,2905 @@
+/*
+ * ISP1362 HCD (Host Controller Driver) for USB.
+ *
+ * Copyright (C) 2005 Lothar Wassmann <LW@KARO-electronics.de>
+ *
+ * Derived from the SL811 HCD, rewritten for ISP116x.
+ * Copyright (C) 2005 Olav Kongas <ok@artecdesign.ee>
+ *
+ * Portions:
+ * Copyright (C) 2004 Psion Teklogix (for NetBook PRO)
+ * Copyright (C) 2004 David Brownell
+ */
+
+/*
+ * The ISP1362 chip requires a large delay (300ns and 462ns) between
+ * accesses to the address and data register.
+ * The following timing options exist:
+ *
+ * 1. Configure your memory controller to add such delays if it can (the best)
+ * 2. Implement platform-specific delay function possibly
+ *    combined with configuring the memory controller; see
+ *    include/linux/usb_isp1362.h for more info.
+ * 3. Use ndelay (easiest, poorest).
+ *
+ * Use the corresponding macros USE_PLATFORM_DELAY and USE_NDELAY in the
+ * platform specific section of isp1362.h to select the appropriate variant.
+ *
+ * Also note that according to the Philips "ISP1362 Errata" document
+ * Rev 1.00 from 27 May data corruption may occur when the #WR signal
+ * is reasserted (even with #CS deasserted) within 132ns after a
+ * write cycle to any controller register. If the hardware doesn't
+ * implement the recommended fix (gating the #WR with #CS) software
+ * must ensure that no further write cycle (not necessarily to the chip!)
+ * is issued by the CPU within this interval.
+
+ * For PXA25x this can be ensured by using VLIO with the maximum
+ * recovery time (MSCx = 0x7f8c) with a memory clock of 99.53 MHz.
+ */
+
+#ifdef CONFIG_USB_DEBUG
+# define ISP1362_DEBUG
+#else
+# undef ISP1362_DEBUG
+#endif
+
+/*
+ * The PXA255 UDC apparently doesn't handle GET_STATUS, GET_CONFIG and
+ * GET_INTERFACE requests correctly when the SETUP and DATA stages of the
+ * requests are carried out in separate frames. This will delay any SETUP
+ * packets until the start of the next frame so that this situation is
+ * unlikely to occur (and makes usbtest happy running with a PXA255 target
+ * device).
+ */
+#undef BUGGY_PXA2XX_UDC_USBTEST
+
+#undef PTD_TRACE
+#undef URB_TRACE
+#undef VERBOSE
+#undef REGISTERS
+
+/* This enables a memory test on the ISP1362 chip memory to make sure the
+ * chip access timing is correct.
+ */
+#undef CHIP_BUFFER_TEST
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/interrupt.h>
+#include <linux/usb.h>
+#include <linux/usb/isp1362.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/io.h>
+#include <linux/bitops.h>
+
+#include <asm/irq.h>
+#include <asm/system.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+static int dbg_level;
+#ifdef ISP1362_DEBUG
+module_param(dbg_level, int, 0644);
+#else
+module_param(dbg_level, int, 0);
+#define	STUB_DEBUG_FILE
+#endif
+
+#include "../core/hcd.h"
+#include "../core/usb.h"
+#include "isp1362.h"
+
+
+#define DRIVER_VERSION	"2005-04-04"
+#define DRIVER_DESC	"ISP1362 USB Host Controller Driver"
+
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
+
+static const char hcd_name[] = "isp1362-hcd";
+
+static void isp1362_hc_stop(struct usb_hcd *hcd);
+static int isp1362_hc_start(struct usb_hcd *hcd);
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * When called from the interrupthandler only isp1362_hcd->irqenb is modified,
+ * since the interrupt handler will write isp1362_hcd->irqenb to HCuPINT upon
+ * completion.
+ * We don't need a 'disable' counterpart, since interrupts will be disabled
+ * only by the interrupt handler.
+ */
+static inline void isp1362_enable_int(struct isp1362_hcd *isp1362_hcd, u16 mask)
+{
+	if ((isp1362_hcd->irqenb | mask) == isp1362_hcd->irqenb)
+		return;
+	if (mask & ~isp1362_hcd->irqenb)
+		isp1362_write_reg16(isp1362_hcd, HCuPINT, mask & ~isp1362_hcd->irqenb);
+	isp1362_hcd->irqenb |= mask;
+	if (isp1362_hcd->irq_active)
+		return;
+	isp1362_write_reg16(isp1362_hcd, HCuPINTENB, isp1362_hcd->irqenb);
+}
+
+/*-------------------------------------------------------------------------*/
+
+static inline struct isp1362_ep_queue *get_ptd_queue(struct isp1362_hcd *isp1362_hcd,
+						     u16 offset)
+{
+	struct isp1362_ep_queue *epq = NULL;
+
+	if (offset < isp1362_hcd->istl_queue[1].buf_start)
+		epq = &isp1362_hcd->istl_queue[0];
+	else if (offset < isp1362_hcd->intl_queue.buf_start)
+		epq = &isp1362_hcd->istl_queue[1];
+	else if (offset < isp1362_hcd->atl_queue.buf_start)
+		epq = &isp1362_hcd->intl_queue;
+	else if (offset < isp1362_hcd->atl_queue.buf_start +
+		   isp1362_hcd->atl_queue.buf_size)
+		epq = &isp1362_hcd->atl_queue;
+
+	if (epq)
+		DBG(1, "%s: PTD $%04x is on %s queue\n", __func__, offset, epq->name);
+	else
+		pr_warning("%s: invalid PTD $%04x\n", __func__, offset);
+
+	return epq;
+}
+
+static inline int get_ptd_offset(struct isp1362_ep_queue *epq, u8 index)
+{
+	int offset;
+
+	if (index * epq->blk_size > epq->buf_size) {
+		pr_warning("%s: Bad %s index %d(%d)\n", __func__, epq->name, index,
+		     epq->buf_size / epq->blk_size);
+		return -EINVAL;
+	}
+	offset = epq->buf_start + index * epq->blk_size;
+	DBG(3, "%s: %s PTD[%02x] # %04x\n", __func__, epq->name, index, offset);
+
+	return offset;
+}
+
+/*-------------------------------------------------------------------------*/
+
+static inline u16 max_transfer_size(struct isp1362_ep_queue *epq, size_t size,
+				    int mps)
+{
+	u16 xfer_size = min_t(size_t, MAX_XFER_SIZE, size);
+
+	xfer_size = min_t(size_t, xfer_size, epq->buf_avail * epq->blk_size - PTD_HEADER_SIZE);
+	if (xfer_size < size && xfer_size % mps)
+		xfer_size -= xfer_size % mps;
+
+	return xfer_size;
+}
+
+static int claim_ptd_buffers(struct isp1362_ep_queue *epq,
+			     struct isp1362_ep *ep, u16 len)
+{
+	int ptd_offset = -EINVAL;
+	int index;
+	int num_ptds = ((len + PTD_HEADER_SIZE - 1) / epq->blk_size) + 1;
+	int found = -1;
+	int last = -1;
+
+	BUG_ON(len > epq->buf_size);
+
+	if (!epq->buf_avail)
+		return -ENOMEM;
+
+	if (ep->num_ptds)
+		pr_err("%s: %s len %d/%d num_ptds %d buf_map %08lx skip_map %08lx\n", __func__,
+		    epq->name, len, epq->blk_size, num_ptds, epq->buf_map, epq->skip_map);
+	BUG_ON(ep->num_ptds != 0);
+
+	for (index = 0; index <= epq->buf_count - num_ptds; index++) {
+		if (test_bit(index, &epq->buf_map))
+			continue;
+		found = index;
+		for (last = index + 1; last < index + num_ptds; last++) {
+			if (test_bit(last, &epq->buf_map)) {
+				found = -1;
+				break;
+			}
+		}
+		if (found >= 0)
+			break;
+	}
+	if (found < 0)
+		return -EOVERFLOW;
+
+	DBG(1, "%s: Found %d PTDs[%d] for %d/%d byte\n", __func__,
+	    num_ptds, found, len, (int)(epq->blk_size - PTD_HEADER_SIZE));
+	ptd_offset = get_ptd_offset(epq, found);
+	WARN_ON(ptd_offset < 0);
+	ep->ptd_offset = ptd_offset;
+	ep->num_ptds += num_ptds;
+	epq->buf_avail -= num_ptds;
+	BUG_ON(epq->buf_avail > epq->buf_count);
+	ep->ptd_index = found;
+	for (index = found; index < last; index++)
+		__set_bit(index, &epq->buf_map);
+	DBG(1, "%s: Done %s PTD[%d] $%04x, avail %d count %d claimed %d %08lx:%08lx\n",
+	    __func__, epq->name, ep->ptd_index, ep->ptd_offset,
+	    epq->buf_avail, epq->buf_count, num_ptds, epq->buf_map, epq->skip_map);
+
+	return found;
+}
+
+static inline void release_ptd_buffers(struct isp1362_ep_queue *epq, struct isp1362_ep *ep)
+{
+	int index = ep->ptd_index;
+	int last = ep->ptd_index + ep->num_ptds;
+
+	if (last > epq->buf_count)
+		pr_err("%s: ep %p req %d len %d %s PTD[%d] $%04x num_ptds %d buf_count %d buf_avail %d buf_map %08lx skip_map %08lx\n",
+		    __func__, ep, ep->num_req, ep->length, epq->name, ep->ptd_index,
+		    ep->ptd_offset, ep->num_ptds, epq->buf_count, epq->buf_avail,
+		    epq->buf_map, epq->skip_map);
+	BUG_ON(last > epq->buf_count);
+
+	for (; index < last; index++) {
+		__clear_bit(index, &epq->buf_map);
+		__set_bit(index, &epq->skip_map);
+	}
+	epq->buf_avail += ep->num_ptds;
+	epq->ptd_count--;
+
+	BUG_ON(epq->buf_avail > epq->buf_count);
+	BUG_ON(epq->ptd_count > epq->buf_count);
+
+	DBG(1, "%s: Done %s PTDs $%04x released %d avail %d count %d\n",
+	    __func__, epq->name,
+	    ep->ptd_offset, ep->num_ptds, epq->buf_avail, epq->buf_count);
+	DBG(1, "%s: buf_map %08lx skip_map %08lx\n", __func__,
+	    epq->buf_map, epq->skip_map);
+
+	ep->num_ptds = 0;
+	ep->ptd_offset = -EINVAL;
+	ep->ptd_index = -EINVAL;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+  Set up PTD's.
+*/
+static void prepare_ptd(struct isp1362_hcd *isp1362_hcd, struct urb *urb,
+			struct isp1362_ep *ep, struct isp1362_ep_queue *epq,
+			u16 fno)
+{
+	struct ptd *ptd;
+	int toggle;
+	int dir;
+	u16 len;
+	size_t buf_len = urb->transfer_buffer_length - urb->actual_length;
+
+	DBG(3, "%s: %s ep %p\n", __func__, epq->name, ep);
+
+	ptd = &ep->ptd;
+
+	ep->data = (unsigned char *)urb->transfer_buffer + urb->actual_length;
+
+	switch (ep->nextpid) {
+	case USB_PID_IN:
+		toggle = usb_gettoggle(urb->dev, ep->epnum, 0);
+		dir = PTD_DIR_IN;
+		if (usb_pipecontrol(urb->pipe)) {
+			len = min_t(size_t, ep->maxpacket, buf_len);
+		} else if (usb_pipeisoc(urb->pipe)) {
+			len = min_t(size_t, urb->iso_frame_desc[fno].length, MAX_XFER_SIZE);
+			ep->data = urb->transfer_buffer + urb->iso_frame_desc[fno].offset;
+		} else
+			len = max_transfer_size(epq, buf_len, ep->maxpacket);
+		DBG(1, "%s: IN    len %d/%d/%d from URB\n", __func__, len, ep->maxpacket,
+		    (int)buf_len);
+		break;
+	case USB_PID_OUT:
+		toggle = usb_gettoggle(urb->dev, ep->epnum, 1);
+		dir = PTD_DIR_OUT;
+		if (usb_pipecontrol(urb->pipe))
+			len = min_t(size_t, ep->maxpacket, buf_len);
+		else if (usb_pipeisoc(urb->pipe))
+			len = min_t(size_t, urb->iso_frame_desc[0].length, MAX_XFER_SIZE);
+		else
+			len = max_transfer_size(epq, buf_len, ep->maxpacket);
+		if (len == 0)
+			pr_info("%s: Sending ZERO packet: %d\n", __func__,
+			     urb->transfer_flags & URB_ZERO_PACKET);
+		DBG(1, "%s: OUT   len %d/%d/%d from URB\n", __func__, len, ep->maxpacket,
+		    (int)buf_len);
+		break;
+	case USB_PID_SETUP:
+		toggle = 0;
+		dir = PTD_DIR_SETUP;
+		len = sizeof(struct usb_ctrlrequest);
+		DBG(1, "%s: SETUP len %d\n", __func__, len);
+		ep->data = urb->setup_packet;
+		break;
+	case USB_PID_ACK:
+		toggle = 1;
+		len = 0;
+		dir = (urb->transfer_buffer_length && usb_pipein(urb->pipe)) ?
+			PTD_DIR_OUT : PTD_DIR_IN;
+		DBG(1, "%s: ACK   len %d\n", __func__, len);
+		break;
+	default:
+		toggle = dir = len = 0;
+		pr_err("%s@%d: ep->nextpid %02x\n", __func__, __LINE__, ep->nextpid);
+		BUG_ON(1);
+	}
+
+	ep->length = len;
+	if (!len)
+		ep->data = NULL;
+
+	ptd->count = PTD_CC_MSK | PTD_ACTIVE_MSK | PTD_TOGGLE(toggle);
+	ptd->mps = PTD_MPS(ep->maxpacket) | PTD_SPD(urb->dev->speed == USB_SPEED_LOW) |
+		PTD_EP(ep->epnum);
+	ptd->len = PTD_LEN(len) | PTD_DIR(dir);
+	ptd->faddr = PTD_FA(usb_pipedevice(urb->pipe));
+
+	if (usb_pipeint(urb->pipe)) {
+		ptd->faddr |= PTD_SF_INT(ep->branch);
+		ptd->faddr |= PTD_PR(ep->interval ? __ffs(ep->interval) : 0);
+	}
+	if (usb_pipeisoc(urb->pipe))
+		ptd->faddr |= PTD_SF_ISO(fno);
+
+	DBG(1, "%s: Finished\n", __func__);
+}
+
+static void isp1362_write_ptd(struct isp1362_hcd *isp1362_hcd, struct isp1362_ep *ep,
+			      struct isp1362_ep_queue *epq)
+{
+	struct ptd *ptd = &ep->ptd;
+	int len = PTD_GET_DIR(ptd) == PTD_DIR_IN ? 0 : ep->length;
+
+	_BUG_ON(ep->ptd_offset < 0);
+
+	prefetch(ptd);
+	isp1362_write_buffer(isp1362_hcd, ptd, ep->ptd_offset, PTD_HEADER_SIZE);
+	if (len)
+		isp1362_write_buffer(isp1362_hcd, ep->data,
+				     ep->ptd_offset + PTD_HEADER_SIZE, len);
+
+	dump_ptd(ptd);
+	dump_ptd_out_data(ptd, ep->data);
+}
+
+static void isp1362_read_ptd(struct isp1362_hcd *isp1362_hcd, struct isp1362_ep *ep,
+			     struct isp1362_ep_queue *epq)
+{
+	struct ptd *ptd = &ep->ptd;
+	int act_len;
+
+	WARN_ON(list_empty(&ep->active));
+	BUG_ON(ep->ptd_offset < 0);
+
+	list_del_init(&ep->active);
+	DBG(1, "%s: ep %p removed from active list %p\n", __func__, ep, &epq->active);
+
+	prefetchw(ptd);
+	isp1362_read_buffer(isp1362_hcd, ptd, ep->ptd_offset, PTD_HEADER_SIZE);
+	dump_ptd(ptd);
+	act_len = PTD_GET_COUNT(ptd);
+	if (PTD_GET_DIR(ptd) != PTD_DIR_IN || act_len == 0)
+		return;
+	if (act_len > ep->length)
+		pr_err("%s: ep %p PTD $%04x act_len %d ep->length %d\n", __func__, ep,
+			 ep->ptd_offset, act_len, ep->length);
+	BUG_ON(act_len > ep->length);
+	/* Only transfer the amount of data that has actually been overwritten
+	 * in the chip buffer. We don't want any data that doesn't belong to the
+	 * transfer to leak out of the chip to the callers transfer buffer!
+	 */
+	prefetchw(ep->data);
+	isp1362_read_buffer(isp1362_hcd, ep->data,
+			    ep->ptd_offset + PTD_HEADER_SIZE, act_len);
+	dump_ptd_in_data(ptd, ep->data);
+}
+
+/*
+ * INT PTDs will stay in the chip until data is available.
+ * This function will remove a PTD from the chip when the URB is dequeued.
+ * Must be called with the spinlock held and IRQs disabled
+ */
+static void remove_ptd(struct isp1362_hcd *isp1362_hcd, struct isp1362_ep *ep)
+
+{
+	int index;
+	struct isp1362_ep_queue *epq;
+
+	DBG(1, "%s: ep %p PTD[%d] $%04x\n", __func__, ep, ep->ptd_index, ep->ptd_offset);
+	BUG_ON(ep->ptd_offset < 0);
+
+	epq = get_ptd_queue(isp1362_hcd, ep->ptd_offset);
+	BUG_ON(!epq);
+
+	/* put ep in remove_list for cleanup */
+	WARN_ON(!list_empty(&ep->remove_list));
+	list_add_tail(&ep->remove_list, &isp1362_hcd->remove_list);
+	/* let SOF interrupt handle the cleanup */
+	isp1362_enable_int(isp1362_hcd, HCuPINT_SOF);
+
+	index = ep->ptd_index;
+	if (index < 0)
+		/* ISO queues don't have SKIP registers */
+		return;
+
+	DBG(1, "%s: Disabling PTD[%02x] $%04x %08lx|%08x\n", __func__,
+	    index, ep->ptd_offset, epq->skip_map, 1 << index);
+
+	/* prevent further processing of PTD (will be effective after next SOF) */
+	epq->skip_map |= 1 << index;
+	if (epq == &isp1362_hcd->atl_queue) {
+		DBG(2, "%s: ATLSKIP = %08x -> %08lx\n", __func__,
+		    isp1362_read_reg32(isp1362_hcd, HCATLSKIP), epq->skip_map);
+		isp1362_write_reg32(isp1362_hcd, HCATLSKIP, epq->skip_map);
+		if (~epq->skip_map == 0)
+			isp1362_clr_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_ATL_ACTIVE);
+	} else if (epq == &isp1362_hcd->intl_queue) {
+		DBG(2, "%s: INTLSKIP = %08x -> %08lx\n", __func__,
+		    isp1362_read_reg32(isp1362_hcd, HCINTLSKIP), epq->skip_map);
+		isp1362_write_reg32(isp1362_hcd, HCINTLSKIP, epq->skip_map);
+		if (~epq->skip_map == 0)
+			isp1362_clr_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_INTL_ACTIVE);
+	}
+}
+
+/*
+  Take done or failed requests out of schedule. Give back
+  processed urbs.
+*/
+static void finish_request(struct isp1362_hcd *isp1362_hcd, struct isp1362_ep *ep,
+			   struct urb *urb, int status)
+     __releases(isp1362_hcd->lock)
+     __acquires(isp1362_hcd->lock)
+{
+	urb->hcpriv = NULL;
+	ep->error_count = 0;
+
+	if (usb_pipecontrol(urb->pipe))
+		ep->nextpid = USB_PID_SETUP;
+
+	URB_DBG("%s: req %d FA %d ep%d%s %s: len %d/%d %s stat %d\n", __func__,
+		ep->num_req, usb_pipedevice(urb->pipe),
+		usb_pipeendpoint(urb->pipe),
+		!usb_pipein(urb->pipe) ? "out" : "in",
+		usb_pipecontrol(urb->pipe) ? "ctrl" :
+			usb_pipeint(urb->pipe) ? "int" :
+			usb_pipebulk(urb->pipe) ? "bulk" :
+			"iso",
+		urb->actual_length, urb->transfer_buffer_length,
+		!(urb->transfer_flags & URB_SHORT_NOT_OK) ?
+		"short_ok" : "", urb->status);
+
+
+	usb_hcd_unlink_urb_from_ep(isp1362_hcd_to_hcd(isp1362_hcd), urb);
+	spin_unlock(&isp1362_hcd->lock);
+	usb_hcd_giveback_urb(isp1362_hcd_to_hcd(isp1362_hcd), urb, status);
+	spin_lock(&isp1362_hcd->lock);
+
+	/* take idle endpoints out of the schedule right away */
+	if (!list_empty(&ep->hep->urb_list))
+		return;
+
+	/* async deschedule */
+	if (!list_empty(&ep->schedule)) {
+		list_del_init(&ep->schedule);
+		return;
+	}
+
+
+	if (ep->interval) {
+		/* periodic deschedule */
+		DBG(1, "deschedule qh%d/%p branch %d load %d bandwidth %d -> %d\n", ep->interval,
+		    ep, ep->branch, ep->load,
+		    isp1362_hcd->load[ep->branch],
+		    isp1362_hcd->load[ep->branch] - ep->load);
+		isp1362_hcd->load[ep->branch] -= ep->load;
+		ep->branch = PERIODIC_SIZE;
+	}
+}
+
+/*
+ * Analyze transfer results, handle partial transfers and errors
+*/
+static void postproc_ep(struct isp1362_hcd *isp1362_hcd, struct isp1362_ep *ep)
+{
+	struct urb *urb = get_urb(ep);
+	struct usb_device *udev;
+	struct ptd *ptd;
+	int short_ok;
+	u16 len;
+	int urbstat = -EINPROGRESS;
+	u8 cc;
+
+	DBG(2, "%s: ep %p req %d\n", __func__, ep, ep->num_req);
+
+	udev = urb->dev;
+	ptd = &ep->ptd;
+	cc = PTD_GET_CC(ptd);
+	if (cc == PTD_NOTACCESSED) {
+		pr_err("%s: req %d PTD %p Untouched by ISP1362\n", __func__,
+		    ep->num_req, ptd);
+		cc = PTD_DEVNOTRESP;
+	}
+
+	short_ok = !(urb->transfer_flags & URB_SHORT_NOT_OK);
+	len = urb->transfer_buffer_length - urb->actual_length;
+
+	/* Data underrun is special. For allowed underrun
+	   we clear the error and continue as normal. For
+	   forbidden underrun we finish the DATA stage
+	   immediately while for control transfer,
+	   we do a STATUS stage.
+	*/
+	if (cc == PTD_DATAUNDERRUN) {
+		if (short_ok) {
+			DBG(1, "%s: req %d Allowed data underrun short_%sok %d/%d/%d byte\n",
+			    __func__, ep->num_req, short_ok ? "" : "not_",
+			    PTD_GET_COUNT(ptd), ep->maxpacket, len);
+			cc = PTD_CC_NOERROR;
+			urbstat = 0;
+		} else {
+			DBG(1, "%s: req %d Data Underrun %s nextpid %02x short_%sok %d/%d/%d byte\n",
+			    __func__, ep->num_req,
+			    usb_pipein(urb->pipe) ? "IN" : "OUT", ep->nextpid,
+			    short_ok ? "" : "not_",
+			    PTD_GET_COUNT(ptd), ep->maxpacket, len);
+			if (usb_pipecontrol(urb->pipe)) {
+				ep->nextpid = USB_PID_ACK;
+				/* save the data underrun error code for later and
+				 * procede with the status stage
+				 */
+				urb->actual_length += PTD_GET_COUNT(ptd);
+				BUG_ON(urb->actual_length > urb->transfer_buffer_length);
+
+				if (urb->status == -EINPROGRESS)
+					urb->status = cc_to_error[PTD_DATAUNDERRUN];
+			} else {
+				usb_settoggle(udev, ep->epnum, ep->nextpid == USB_PID_OUT,
+					      PTD_GET_TOGGLE(ptd));
+				urbstat = cc_to_error[PTD_DATAUNDERRUN];
+			}
+			goto out;
+		}
+	}
+
+	if (cc != PTD_CC_NOERROR) {
+		if (++ep->error_count >= 3 || cc == PTD_CC_STALL || cc == PTD_DATAOVERRUN) {
+			urbstat = cc_to_error[cc];
+			DBG(1, "%s: req %d nextpid %02x, status %d, error %d, error_count %d\n",
+			    __func__, ep->num_req, ep->nextpid, urbstat, cc,
+			    ep->error_count);
+		}
+		goto out;
+	}
+
+	switch (ep->nextpid) {
+	case USB_PID_OUT:
+		if (PTD_GET_COUNT(ptd) != ep->length)
+			pr_err("%s: count=%d len=%d\n", __func__,
+			   PTD_GET_COUNT(ptd), ep->length);
+		BUG_ON(PTD_GET_COUNT(ptd) != ep->length);
+		urb->actual_length += ep->length;
+		BUG_ON(urb->actual_length > urb->transfer_buffer_length);
+		usb_settoggle(udev, ep->epnum, 1, PTD_GET_TOGGLE(ptd));
+		if (urb->actual_length == urb->transfer_buffer_length) {
+			DBG(3, "%s: req %d xfer complete %d/%d status %d -> 0\n", __func__,
+			    ep->num_req, len, ep->maxpacket, urbstat);
+			if (usb_pipecontrol(urb->pipe)) {
+				DBG(3, "%s: req %d %s Wait for ACK\n", __func__,
+				    ep->num_req,
+				    usb_pipein(urb->pipe) ? "IN" : "OUT");
+				ep->nextpid = USB_PID_ACK;
+			} else {
+				if (len % ep->maxpacket ||
+				    !(urb->transfer_flags & URB_ZERO_PACKET)) {
+					urbstat = 0;
+					DBG(3, "%s: req %d URB %s status %d count %d/%d/%d\n",
+					    __func__, ep->num_req, usb_pipein(urb->pipe) ? "IN" : "OUT",
+					    urbstat, len, ep->maxpacket, urb->actual_length);
+				}
+			}
+		}
+		break;
+	case USB_PID_IN:
+		len = PTD_GET_COUNT(ptd);
+		BUG_ON(len > ep->length);
+		urb->actual_length += len;
+		BUG_ON(urb->actual_length > urb->transfer_buffer_length);
+		usb_settoggle(udev, ep->epnum, 0, PTD_GET_TOGGLE(ptd));
+		/* if transfer completed or (allowed) data underrun */
+		if ((urb->transfer_buffer_length == urb->actual_length) ||
+		    len % ep->maxpacket) {
+			DBG(3, "%s: req %d xfer complete %d/%d status %d -> 0\n", __func__,
+			    ep->num_req, len, ep->maxpacket, urbstat);
+			if (usb_pipecontrol(urb->pipe)) {
+				DBG(3, "%s: req %d %s Wait for ACK\n", __func__,
+				    ep->num_req,
+				    usb_pipein(urb->pipe) ? "IN" : "OUT");
+				ep->nextpid = USB_PID_ACK;
+			} else {
+				urbstat = 0;
+				DBG(3, "%s: req %d URB %s status %d count %d/%d/%d\n",
+				    __func__, ep->num_req, usb_pipein(urb->pipe) ? "IN" : "OUT",
+				    urbstat, len, ep->maxpacket, urb->actual_length);
+			}
+		}
+		break;
+	case USB_PID_SETUP:
+		if (urb->transfer_buffer_length == urb->actual_length) {
+			ep->nextpid = USB_PID_ACK;
+		} else if (usb_pipeout(urb->pipe)) {
+			usb_settoggle(udev, 0, 1, 1);
+			ep->nextpid = USB_PID_OUT;
+		} else {
+			usb_settoggle(udev, 0, 0, 1);
+			ep->nextpid = USB_PID_IN;
+		}
+		break;
+	case USB_PID_ACK:
+		DBG(3, "%s: req %d got ACK %d -> 0\n", __func__, ep->num_req,
+		    urbstat);
+		WARN_ON(urbstat != -EINPROGRESS);
+		urbstat = 0;
+		ep->nextpid = 0;
+		break;
+	default:
+		BUG_ON(1);
+	}
+
+ out:
+	if (urbstat != -EINPROGRESS) {
+		DBG(2, "%s: Finishing ep %p req %d urb %p status %d\n", __func__,
+		    ep, ep->num_req, urb, urbstat);
+		finish_request(isp1362_hcd, ep, urb, urbstat);
+	}
+}
+
+static void finish_unlinks(struct isp1362_hcd *isp1362_hcd)
+{
+	struct isp1362_ep *ep;
+	struct isp1362_ep *tmp;
+
+	list_for_each_entry_safe(ep, tmp, &isp1362_hcd->remove_list, remove_list) {
+		struct isp1362_ep_queue *epq =
+			get_ptd_queue(isp1362_hcd, ep->ptd_offset);
+		int index = ep->ptd_index;
+
+		BUG_ON(epq == NULL);
+		if (index >= 0) {
+			DBG(1, "%s: remove PTD[%d] $%04x\n", __func__, index, ep->ptd_offset);
+			BUG_ON(ep->num_ptds == 0);
+			release_ptd_buffers(epq, ep);
+		}
+		if (!list_empty(&ep->hep->urb_list)) {
+			struct urb *urb = get_urb(ep);
+
+			DBG(1, "%s: Finishing req %d ep %p from remove_list\n", __func__,
+			    ep->num_req, ep);
+			finish_request(isp1362_hcd, ep, urb, -ESHUTDOWN);
+		}
+		WARN_ON(list_empty(&ep->active));
+		if (!list_empty(&ep->active)) {
+			list_del_init(&ep->active);
+			DBG(1, "%s: ep %p removed from active list\n", __func__, ep);
+		}
+		list_del_init(&ep->remove_list);
+		DBG(1, "%s: ep %p removed from remove_list\n", __func__, ep);
+	}
+	DBG(1, "%s: Done\n", __func__);
+}
+
+static inline void enable_atl_transfers(struct isp1362_hcd *isp1362_hcd, int count)
+{
+	if (count > 0) {
+		if (count < isp1362_hcd->atl_queue.ptd_count)
+			isp1362_write_reg16(isp1362_hcd, HCATLDTC, count);
+		isp1362_enable_int(isp1362_hcd, HCuPINT_ATL);
+		isp1362_write_reg32(isp1362_hcd, HCATLSKIP, isp1362_hcd->atl_queue.skip_map);
+		isp1362_set_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_ATL_ACTIVE);
+	} else
+		isp1362_enable_int(isp1362_hcd, HCuPINT_SOF);
+}
+
+static inline void enable_intl_transfers(struct isp1362_hcd *isp1362_hcd)
+{
+	isp1362_enable_int(isp1362_hcd, HCuPINT_INTL);
+	isp1362_set_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_INTL_ACTIVE);
+	isp1362_write_reg32(isp1362_hcd, HCINTLSKIP, isp1362_hcd->intl_queue.skip_map);
+}
+
+static inline void enable_istl_transfers(struct isp1362_hcd *isp1362_hcd, int flip)
+{
+	isp1362_enable_int(isp1362_hcd, flip ? HCuPINT_ISTL1 : HCuPINT_ISTL0);
+	isp1362_set_mask16(isp1362_hcd, HCBUFSTAT, flip ?
+			   HCBUFSTAT_ISTL1_FULL : HCBUFSTAT_ISTL0_FULL);
+}
+
+static int submit_req(struct isp1362_hcd *isp1362_hcd, struct urb *urb,
+		      struct isp1362_ep *ep, struct isp1362_ep_queue *epq)
+{
+	int index = epq->free_ptd;
+
+	prepare_ptd(isp1362_hcd, urb, ep, epq, 0);
+	index = claim_ptd_buffers(epq, ep, ep->length);
+	if (index == -ENOMEM) {
+		DBG(1, "%s: req %d No free %s PTD available: %d, %08lx:%08lx\n", __func__,
+		    ep->num_req, epq->name, ep->num_ptds, epq->buf_map, epq->skip_map);
+		return index;
+	} else if (index == -EOVERFLOW) {
+		DBG(1, "%s: req %d Not enough space for %d byte %s PTD %d %08lx:%08lx\n",
+		    __func__, ep->num_req, ep->length, epq->name, ep->num_ptds,
+		    epq->buf_map, epq->skip_map);
+		return index;
+	} else
+		BUG_ON(index < 0);
+	list_add_tail(&ep->active, &epq->active);
+	DBG(1, "%s: ep %p req %d len %d added to active list %p\n", __func__,
+	    ep, ep->num_req, ep->length, &epq->active);
+	DBG(1, "%s: Submitting %s PTD $%04x for ep %p req %d\n", __func__, epq->name,
+	    ep->ptd_offset, ep, ep->num_req);
+	isp1362_write_ptd(isp1362_hcd, ep, epq);
+	__clear_bit(ep->ptd_index, &epq->skip_map);
+
+	return 0;
+}
+
+static void start_atl_transfers(struct isp1362_hcd *isp1362_hcd)
+{
+	int ptd_count = 0;
+	struct isp1362_ep_queue *epq = &isp1362_hcd->atl_queue;
+	struct isp1362_ep *ep;
+	int defer = 0;
+
+	if (atomic_read(&epq->finishing)) {
+		DBG(1, "%s: finish_transfers is active for %s\n", __func__, epq->name);
+		return;
+	}
+
+	list_for_each_entry(ep, &isp1362_hcd->async, schedule) {
+		struct urb *urb = get_urb(ep);
+		int ret;
+
+		if (!list_empty(&ep->active)) {
+			DBG(2, "%s: Skipping active %s ep %p\n", __func__, epq->name, ep);
+			continue;
+		}
+
+		DBG(1, "%s: Processing %s ep %p req %d\n", __func__, epq->name,
+		    ep, ep->num_req);
+
+		ret = submit_req(isp1362_hcd, urb, ep, epq);
+		if (ret == -ENOMEM) {
+			defer = 1;
+			break;
+		} else if (ret == -EOVERFLOW) {
+			defer = 1;
+			continue;
+		}
+#ifdef BUGGY_PXA2XX_UDC_USBTEST
+		defer = ep->nextpid == USB_PID_SETUP;
+#endif
+		ptd_count++;
+	}
+
+	/* Avoid starving of endpoints */
+	if (isp1362_hcd->async.next != isp1362_hcd->async.prev) {
+		DBG(2, "%s: Cycling ASYNC schedule %d\n", __func__, ptd_count);
+		list_move(&isp1362_hcd->async, isp1362_hcd->async.next);
+	}
+	if (ptd_count || defer)
+		enable_atl_transfers(isp1362_hcd, defer ? 0 : ptd_count);
+
+	epq->ptd_count += ptd_count;
+	if (epq->ptd_count > epq->stat_maxptds) {
+		epq->stat_maxptds = epq->ptd_count;
+		DBG(0, "%s: max_ptds: %d\n", __func__, epq->stat_maxptds);
+	}
+}
+
+static void start_intl_transfers(struct isp1362_hcd *isp1362_hcd)
+{
+	int ptd_count = 0;
+	struct isp1362_ep_queue *epq = &isp1362_hcd->intl_queue;
+	struct isp1362_ep *ep;
+
+	if (atomic_read(&epq->finishing)) {
+		DBG(1, "%s: finish_transfers is active for %s\n", __func__, epq->name);
+		return;
+	}
+
+	list_for_each_entry(ep, &isp1362_hcd->periodic, schedule) {
+		struct urb *urb = get_urb(ep);
+		int ret;
+
+		if (!list_empty(&ep->active)) {
+			DBG(1, "%s: Skipping active %s ep %p\n", __func__,
+			    epq->name, ep);
+			continue;
+		}
+
+		DBG(1, "%s: Processing %s ep %p req %d\n", __func__,
+		    epq->name, ep, ep->num_req);
+		ret = submit_req(isp1362_hcd, urb, ep, epq);
+		if (ret == -ENOMEM)
+			break;
+		else if (ret == -EOVERFLOW)
+			continue;
+		ptd_count++;
+	}
+
+	if (ptd_count) {
+		static int last_count;
+
+		if (ptd_count != last_count) {
+			DBG(0, "%s: ptd_count: %d\n", __func__, ptd_count);
+			last_count = ptd_count;
+		}
+		enable_intl_transfers(isp1362_hcd);
+	}
+
+	epq->ptd_count += ptd_count;
+	if (epq->ptd_count > epq->stat_maxptds)
+		epq->stat_maxptds = epq->ptd_count;
+}
+
+static inline int next_ptd(struct isp1362_ep_queue *epq, struct isp1362_ep *ep)
+{
+	u16 ptd_offset = ep->ptd_offset;
+	int num_ptds = (ep->length + PTD_HEADER_SIZE + (epq->blk_size - 1)) / epq->blk_size;
+
+	DBG(2, "%s: PTD offset $%04x + %04x => %d * %04x -> $%04x\n", __func__, ptd_offset,
+	    ep->length, num_ptds, epq->blk_size, ptd_offset + num_ptds * epq->blk_size);
+
+	ptd_offset += num_ptds * epq->blk_size;
+	if (ptd_offset < epq->buf_start + epq->buf_size)
+		return ptd_offset;
+	else
+		return -ENOMEM;
+}
+
+static void start_iso_transfers(struct isp1362_hcd *isp1362_hcd)
+{
+	int ptd_count = 0;
+	int flip = isp1362_hcd->istl_flip;
+	struct isp1362_ep_queue *epq;
+	int ptd_offset;
+	struct isp1362_ep *ep;
+	struct isp1362_ep *tmp;
+	u16 fno = isp1362_read_reg32(isp1362_hcd, HCFMNUM);
+
+ fill2:
+	epq = &isp1362_hcd->istl_queue[flip];
+	if (atomic_read(&epq->finishing)) {
+		DBG(1, "%s: finish_transfers is active for %s\n", __func__, epq->name);
+		return;
+	}
+
+	if (!list_empty(&epq->active))
+		return;
+
+	ptd_offset = epq->buf_start;
+	list_for_each_entry_safe(ep, tmp, &isp1362_hcd->isoc, schedule) {
+		struct urb *urb = get_urb(ep);
+		s16 diff = fno - (u16)urb->start_frame;
+
+		DBG(1, "%s: Processing %s ep %p\n", __func__, epq->name, ep);
+
+		if (diff > urb->number_of_packets) {
+			/* time frame for this URB has elapsed */
+			finish_request(isp1362_hcd, ep, urb, -EOVERFLOW);
+			continue;
+		} else if (diff < -1) {
+			/* URB is not due in this frame or the next one.
+			 * Comparing with '-1' instead of '0' accounts for double
+			 * buffering in the ISP1362 which enables us to queue the PTD
+			 * one frame ahead of time
+			 */
+		} else if (diff == -1) {
+			/* submit PTD's that are due in the next frame */
+			prepare_ptd(isp1362_hcd, urb, ep, epq, fno);
+			if (ptd_offset + PTD_HEADER_SIZE + ep->length >
+			    epq->buf_start + epq->buf_size) {
+				pr_err("%s: Not enough ISO buffer space for %d byte PTD\n",
+				    __func__, ep->length);
+				continue;
+			}
+			ep->ptd_offset = ptd_offset;
+			list_add_tail(&ep->active, &epq->active);
+
+			ptd_offset = next_ptd(epq, ep);
+			if (ptd_offset < 0) {
+				pr_warning("%s: req %d No more %s PTD buffers available\n", __func__,
+				     ep->num_req, epq->name);
+				break;
+			}
+		}
+	}
+	list_for_each_entry(ep, &epq->active, active) {
+		if (epq->active.next == &ep->active)
+			ep->ptd.mps |= PTD_LAST_MSK;
+		isp1362_write_ptd(isp1362_hcd, ep, epq);
+		ptd_count++;
+	}
+
+	if (ptd_count)
+		enable_istl_transfers(isp1362_hcd, flip);
+
+	epq->ptd_count += ptd_count;
+	if (epq->ptd_count > epq->stat_maxptds)
+		epq->stat_maxptds = epq->ptd_count;
+
+	/* check, whether the second ISTL buffer may also be filled */
+	if (!(isp1362_read_reg16(isp1362_hcd, HCBUFSTAT) &
+	      (flip ? HCBUFSTAT_ISTL0_FULL : HCBUFSTAT_ISTL1_FULL))) {
+		fno++;
+		ptd_count = 0;
+		flip = 1 - flip;
+		goto fill2;
+	}
+}
+
+static void finish_transfers(struct isp1362_hcd *isp1362_hcd, unsigned long done_map,
+			     struct isp1362_ep_queue *epq)
+{
+	struct isp1362_ep *ep;
+	struct isp1362_ep *tmp;
+
+	if (list_empty(&epq->active)) {
+		DBG(1, "%s: Nothing to do for %s queue\n", __func__, epq->name);
+		return;
+	}
+
+	DBG(1, "%s: Finishing %s transfers %08lx\n", __func__, epq->name, done_map);
+
+	atomic_inc(&epq->finishing);
+	list_for_each_entry_safe(ep, tmp, &epq->active, active) {
+		int index = ep->ptd_index;
+
+		DBG(1, "%s: Checking %s PTD[%02x] $%04x\n", __func__, epq->name,
+		    index, ep->ptd_offset);
+
+		BUG_ON(index < 0);
+		if (__test_and_clear_bit(index, &done_map)) {
+			isp1362_read_ptd(isp1362_hcd, ep, epq);
+			epq->free_ptd = index;
+			BUG_ON(ep->num_ptds == 0);
+			release_ptd_buffers(epq, ep);
+
+			DBG(1, "%s: ep %p req %d removed from active list\n", __func__,
+			    ep, ep->num_req);
+			if (!list_empty(&ep->remove_list)) {
+				list_del_init(&ep->remove_list);
+				DBG(1, "%s: ep %p removed from remove list\n", __func__, ep);
+			}
+			DBG(1, "%s: Postprocessing %s ep %p req %d\n", __func__, epq->name,
+			    ep, ep->num_req);
+			postproc_ep(isp1362_hcd, ep);
+		}
+		if (!done_map)
+			break;
+	}
+	if (done_map)
+		pr_warning("%s: done_map not clear: %08lx:%08lx\n", __func__, done_map,
+		     epq->skip_map);
+	atomic_dec(&epq->finishing);
+}
+
+static void finish_iso_transfers(struct isp1362_hcd *isp1362_hcd, struct isp1362_ep_queue *epq)
+{
+	struct isp1362_ep *ep;
+	struct isp1362_ep *tmp;
+
+	if (list_empty(&epq->active)) {
+		DBG(1, "%s: Nothing to do for %s queue\n", __func__, epq->name);
+		return;
+	}
+
+	DBG(1, "%s: Finishing %s transfers\n", __func__, epq->name);
+
+	atomic_inc(&epq->finishing);
+	list_for_each_entry_safe(ep, tmp, &epq->active, active) {
+		DBG(1, "%s: Checking PTD $%04x\n", __func__, ep->ptd_offset);
+
+		isp1362_read_ptd(isp1362_hcd, ep, epq);
+		DBG(1, "%s: Postprocessing %s ep %p\n", __func__, epq->name, ep);
+		postproc_ep(isp1362_hcd, ep);
+	}
+	WARN_ON(epq->blk_size != 0);
+	atomic_dec(&epq->finishing);
+}
+
+static irqreturn_t isp1362_irq(struct usb_hcd *hcd)
+{
+	int handled = 0;
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	u16 irqstat;
+	u16 svc_mask;
+
+	spin_lock(&isp1362_hcd->lock);
+
+	BUG_ON(isp1362_hcd->irq_active++);
+
+	isp1362_write_reg16(isp1362_hcd, HCuPINTENB, 0);
+
+	irqstat = isp1362_read_reg16(isp1362_hcd, HCuPINT);
+	DBG(3, "%s: got IRQ %04x:%04x\n", __func__, irqstat, isp1362_hcd->irqenb);
+
+	/* only handle interrupts that are currently enabled */
+	irqstat &= isp1362_hcd->irqenb;
+	isp1362_write_reg16(isp1362_hcd, HCuPINT, irqstat);
+	svc_mask = irqstat;
+
+	if (irqstat & HCuPINT_SOF) {
+		isp1362_hcd->irqenb &= ~HCuPINT_SOF;
+		isp1362_hcd->irq_stat[ISP1362_INT_SOF]++;
+		handled = 1;
+		svc_mask &= ~HCuPINT_SOF;
+		DBG(3, "%s: SOF\n", __func__);
+		isp1362_hcd->fmindex = isp1362_read_reg32(isp1362_hcd, HCFMNUM);
+		if (!list_empty(&isp1362_hcd->remove_list))
+			finish_unlinks(isp1362_hcd);
+		if (!list_empty(&isp1362_hcd->async) && !(irqstat & HCuPINT_ATL)) {
+			if (list_empty(&isp1362_hcd->atl_queue.active)) {
+				start_atl_transfers(isp1362_hcd);
+			} else {
+				isp1362_enable_int(isp1362_hcd, HCuPINT_ATL);
+				isp1362_write_reg32(isp1362_hcd, HCATLSKIP,
+						    isp1362_hcd->atl_queue.skip_map);
+				isp1362_set_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_ATL_ACTIVE);
+			}
+		}
+	}
+
+	if (irqstat & HCuPINT_ISTL0) {
+		isp1362_hcd->irq_stat[ISP1362_INT_ISTL0]++;
+		handled = 1;
+		svc_mask &= ~HCuPINT_ISTL0;
+		isp1362_clr_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_ISTL0_FULL);
+		DBG(1, "%s: ISTL0\n", __func__);
+		WARN_ON((int)!!isp1362_hcd->istl_flip);
+		WARN_ON(isp1362_read_reg16(isp1362_hcd, HCBUFSTAT) & HCBUFSTAT_ISTL0_ACTIVE);
+		WARN_ON(!isp1362_read_reg16(isp1362_hcd, HCBUFSTAT) & HCBUFSTAT_ISTL0_DONE);
+		isp1362_hcd->irqenb &= ~HCuPINT_ISTL0;
+	}
+
+	if (irqstat & HCuPINT_ISTL1) {
+		isp1362_hcd->irq_stat[ISP1362_INT_ISTL1]++;
+		handled = 1;
+		svc_mask &= ~HCuPINT_ISTL1;
+		isp1362_clr_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_ISTL1_FULL);
+		DBG(1, "%s: ISTL1\n", __func__);
+		WARN_ON(!(int)isp1362_hcd->istl_flip);
+		WARN_ON(isp1362_read_reg16(isp1362_hcd, HCBUFSTAT) & HCBUFSTAT_ISTL1_ACTIVE);
+		WARN_ON(!isp1362_read_reg16(isp1362_hcd, HCBUFSTAT) & HCBUFSTAT_ISTL1_DONE);
+		isp1362_hcd->irqenb &= ~HCuPINT_ISTL1;
+	}
+
+	if (irqstat & (HCuPINT_ISTL0 | HCuPINT_ISTL1)) {
+		WARN_ON((irqstat & (HCuPINT_ISTL0 | HCuPINT_ISTL1)) ==
+			(HCuPINT_ISTL0 | HCuPINT_ISTL1));
+		finish_iso_transfers(isp1362_hcd,
+				     &isp1362_hcd->istl_queue[isp1362_hcd->istl_flip]);
+		start_iso_transfers(isp1362_hcd);
+		isp1362_hcd->istl_flip = 1 - isp1362_hcd->istl_flip;
+	}
+
+	if (irqstat & HCuPINT_INTL) {
+		u32 done_map = isp1362_read_reg32(isp1362_hcd, HCINTLDONE);
+		u32 skip_map = isp1362_read_reg32(isp1362_hcd, HCINTLSKIP);
+		isp1362_hcd->irq_stat[ISP1362_INT_INTL]++;
+
+		DBG(2, "%s: INTL\n", __func__);
+
+		svc_mask &= ~HCuPINT_INTL;
+
+		isp1362_write_reg32(isp1362_hcd, HCINTLSKIP, skip_map | done_map);
+		if (~(done_map | skip_map) == 0)
+			/* All PTDs are finished, disable INTL processing entirely */
+			isp1362_clr_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_INTL_ACTIVE);
+
+		handled = 1;
+		WARN_ON(!done_map);
+		if (done_map) {
+			DBG(3, "%s: INTL done_map %08x\n", __func__, done_map);
+			finish_transfers(isp1362_hcd, done_map, &isp1362_hcd->intl_queue);
+			start_intl_transfers(isp1362_hcd);
+		}
+	}
+
+	if (irqstat & HCuPINT_ATL) {
+		u32 done_map = isp1362_read_reg32(isp1362_hcd, HCATLDONE);
+		u32 skip_map = isp1362_read_reg32(isp1362_hcd, HCATLSKIP);
+		isp1362_hcd->irq_stat[ISP1362_INT_ATL]++;
+
+		DBG(2, "%s: ATL\n", __func__);
+
+		svc_mask &= ~HCuPINT_ATL;
+
+		isp1362_write_reg32(isp1362_hcd, HCATLSKIP, skip_map | done_map);
+		if (~(done_map | skip_map) == 0)
+			isp1362_clr_mask16(isp1362_hcd, HCBUFSTAT, HCBUFSTAT_ATL_ACTIVE);
+		if (done_map) {
+			DBG(3, "%s: ATL done_map %08x\n", __func__, done_map);
+			finish_transfers(isp1362_hcd, done_map, &isp1362_hcd->atl_queue);
+			start_atl_transfers(isp1362_hcd);
+		}
+		handled = 1;
+	}
+
+	if (irqstat & HCuPINT_OPR) {
+		u32 intstat = isp1362_read_reg32(isp1362_hcd, HCINTSTAT);
+		isp1362_hcd->irq_stat[ISP1362_INT_OPR]++;
+
+		svc_mask &= ~HCuPINT_OPR;
+		DBG(2, "%s: OPR %08x:%08x\n", __func__, intstat, isp1362_hcd->intenb);
+		intstat &= isp1362_hcd->intenb;
+		if (intstat & OHCI_INTR_UE) {
+			pr_err("Unrecoverable error\n");
+			/* FIXME: do here reset or cleanup or whatever */
+		}
+		if (intstat & OHCI_INTR_RHSC) {
+			isp1362_hcd->rhstatus = isp1362_read_reg32(isp1362_hcd, HCRHSTATUS);
+			isp1362_hcd->rhport[0] = isp1362_read_reg32(isp1362_hcd, HCRHPORT1);
+			isp1362_hcd->rhport[1] = isp1362_read_reg32(isp1362_hcd, HCRHPORT2);
+		}
+		if (intstat & OHCI_INTR_RD) {
+			pr_info("%s: RESUME DETECTED\n", __func__);
+			isp1362_show_reg(isp1362_hcd, HCCONTROL);
+			usb_hcd_resume_root_hub(hcd);
+		}
+		isp1362_write_reg32(isp1362_hcd, HCINTSTAT, intstat);
+		irqstat &= ~HCuPINT_OPR;
+		handled = 1;
+	}
+
+	if (irqstat & HCuPINT_SUSP) {
+		isp1362_hcd->irq_stat[ISP1362_INT_SUSP]++;
+		handled = 1;
+		svc_mask &= ~HCuPINT_SUSP;
+
+		pr_info("%s: SUSPEND IRQ\n", __func__);
+	}
+
+	if (irqstat & HCuPINT_CLKRDY) {
+		isp1362_hcd->irq_stat[ISP1362_INT_CLKRDY]++;
+		handled = 1;
+		isp1362_hcd->irqenb &= ~HCuPINT_CLKRDY;
+		svc_mask &= ~HCuPINT_CLKRDY;
+		pr_info("%s: CLKRDY IRQ\n", __func__);
+	}
+
+	if (svc_mask)
+		pr_err("%s: Unserviced interrupt(s) %04x\n", __func__, svc_mask);
+
+	isp1362_write_reg16(isp1362_hcd, HCuPINTENB, isp1362_hcd->irqenb);
+	isp1362_hcd->irq_active--;
+	spin_unlock(&isp1362_hcd->lock);
+
+	return IRQ_RETVAL(handled);
+}
+
+/*-------------------------------------------------------------------------*/
+
+#define	MAX_PERIODIC_LOAD	900	/* out of 1000 usec */
+static int balance(struct isp1362_hcd *isp1362_hcd, u16 interval, u16 load)
+{
+	int i, branch = -ENOSPC;
+
+	/* search for the least loaded schedule branch of that interval
+	 * which has enough bandwidth left unreserved.
+	 */
+	for (i = 0; i < interval; i++) {
+		if (branch < 0 || isp1362_hcd->load[branch] > isp1362_hcd->load[i]) {
+			int j;
+
+			for (j = i; j < PERIODIC_SIZE; j += interval) {
+				if ((isp1362_hcd->load[j] + load) > MAX_PERIODIC_LOAD) {
+					pr_err("%s: new load %d load[%02x] %d max %d\n", __func__,
+					    load, j, isp1362_hcd->load[j], MAX_PERIODIC_LOAD);
+					break;
+				}
+			}
+			if (j < PERIODIC_SIZE)
+				continue;
+			branch = i;
+		}
+	}
+	return branch;
+}
+
+/* NB! ALL the code above this point runs with isp1362_hcd->lock
+   held, irqs off
+*/
+
+/*-------------------------------------------------------------------------*/
+
+static int isp1362_urb_enqueue(struct usb_hcd *hcd,
+			       struct urb *urb,
+			       gfp_t mem_flags)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	struct usb_device *udev = urb->dev;
+	unsigned int pipe = urb->pipe;
+	int is_out = !usb_pipein(pipe);
+	int type = usb_pipetype(pipe);
+	int epnum = usb_pipeendpoint(pipe);
+	struct usb_host_endpoint *hep = urb->ep;
+	struct isp1362_ep *ep = NULL;
+	unsigned long flags;
+	int retval = 0;
+
+	DBG(3, "%s: urb %p\n", __func__, urb);
+
+	if (type == PIPE_ISOCHRONOUS) {
+		pr_err("Isochronous transfers not supported\n");
+		return -ENOSPC;
+	}
+
+	URB_DBG("%s: FA %d ep%d%s %s: len %d %s%s\n", __func__,
+		usb_pipedevice(pipe), epnum,
+		is_out ? "out" : "in",
+		usb_pipecontrol(pipe) ? "ctrl" :
+			usb_pipeint(pipe) ? "int" :
+			usb_pipebulk(pipe) ? "bulk" :
+			"iso",
+		urb->transfer_buffer_length,
+		(urb->transfer_flags & URB_ZERO_PACKET) ? "ZERO_PACKET " : "",
+		!(urb->transfer_flags & URB_SHORT_NOT_OK) ?
+		"short_ok" : "");
+
+	/* avoid all allocations within spinlocks: request or endpoint */
+	if (!hep->hcpriv) {
+		ep = kcalloc(1, sizeof *ep, mem_flags);
+		if (!ep)
+			return -ENOMEM;
+	}
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+
+	/* don't submit to a dead or disabled port */
+	if (!((isp1362_hcd->rhport[0] | isp1362_hcd->rhport[1]) &
+	      (1 << USB_PORT_FEAT_ENABLE)) ||
+	    !HC_IS_RUNNING(hcd->state)) {
+		kfree(ep);
+		retval = -ENODEV;
+		goto fail_not_linked;
+	}
+
+	retval = usb_hcd_link_urb_to_ep(hcd, urb);
+	if (retval) {
+		kfree(ep);
+		goto fail_not_linked;
+	}
+
+	if (hep->hcpriv) {
+		ep = hep->hcpriv;
+	} else {
+		INIT_LIST_HEAD(&ep->schedule);
+		INIT_LIST_HEAD(&ep->active);
+		INIT_LIST_HEAD(&ep->remove_list);
+		ep->udev = usb_get_dev(udev);
+		ep->hep = hep;
+		ep->epnum = epnum;
+		ep->maxpacket = usb_maxpacket(udev, urb->pipe, is_out);
+		ep->ptd_offset = -EINVAL;
+		ep->ptd_index = -EINVAL;
+		usb_settoggle(udev, epnum, is_out, 0);
+
+		if (type == PIPE_CONTROL)
+			ep->nextpid = USB_PID_SETUP;
+		else if (is_out)
+			ep->nextpid = USB_PID_OUT;
+		else
+			ep->nextpid = USB_PID_IN;
+
+		switch (type) {
+		case PIPE_ISOCHRONOUS:
+		case PIPE_INTERRUPT:
+			if (urb->interval > PERIODIC_SIZE)
+				urb->interval = PERIODIC_SIZE;
+			ep->interval = urb->interval;
+			ep->branch = PERIODIC_SIZE;
+			ep->load = usb_calc_bus_time(udev->speed, !is_out,
+						     (type == PIPE_ISOCHRONOUS),
+						     usb_maxpacket(udev, pipe, is_out)) / 1000;
+			break;
+		}
+		hep->hcpriv = ep;
+	}
+	ep->num_req = isp1362_hcd->req_serial++;
+
+	/* maybe put endpoint into schedule */
+	switch (type) {
+	case PIPE_CONTROL:
+	case PIPE_BULK:
+		if (list_empty(&ep->schedule)) {
+			DBG(1, "%s: Adding ep %p req %d to async schedule\n",
+				__func__, ep, ep->num_req);
+			list_add_tail(&ep->schedule, &isp1362_hcd->async);
+		}
+		break;
+	case PIPE_ISOCHRONOUS:
+	case PIPE_INTERRUPT:
+		urb->interval = ep->interval;
+
+		/* urb submitted for already existing EP */
+		if (ep->branch < PERIODIC_SIZE)
+			break;
+
+		retval = balance(isp1362_hcd, ep->interval, ep->load);
+		if (retval < 0) {
+			pr_err("%s: balance returned %d\n", __func__, retval);
+			goto fail;
+		}
+		ep->branch = retval;
+		retval = 0;
+		isp1362_hcd->fmindex = isp1362_read_reg32(isp1362_hcd, HCFMNUM);
+		DBG(1, "%s: Current frame %04x branch %02x start_frame %04x(%04x)\n",
+		    __func__, isp1362_hcd->fmindex, ep->branch,
+		    ((isp1362_hcd->fmindex + PERIODIC_SIZE - 1) &
+		     ~(PERIODIC_SIZE - 1)) + ep->branch,
+		    (isp1362_hcd->fmindex & (PERIODIC_SIZE - 1)) + ep->branch);
+
+		if (list_empty(&ep->schedule)) {
+			if (type == PIPE_ISOCHRONOUS) {
+				u16 frame = isp1362_hcd->fmindex;
+
+				frame += max_t(u16, 8, ep->interval);
+				frame &= ~(ep->interval - 1);
+				frame |= ep->branch;
+				if (frame_before(frame, isp1362_hcd->fmindex))
+					frame += ep->interval;
+				urb->start_frame = frame;
+
+				DBG(1, "%s: Adding ep %p to isoc schedule\n", __func__, ep);
+				list_add_tail(&ep->schedule, &isp1362_hcd->isoc);
+			} else {
+				DBG(1, "%s: Adding ep %p to periodic schedule\n", __func__, ep);
+				list_add_tail(&ep->schedule, &isp1362_hcd->periodic);
+			}
+		} else
+			DBG(1, "%s: ep %p already scheduled\n", __func__, ep);
+
+		DBG(2, "%s: load %d bandwidth %d -> %d\n", __func__,
+		    ep->load / ep->interval, isp1362_hcd->load[ep->branch],
+		    isp1362_hcd->load[ep->branch] + ep->load);
+		isp1362_hcd->load[ep->branch] += ep->load;
+	}
+
+	urb->hcpriv = hep;
+	ALIGNSTAT(isp1362_hcd, urb->transfer_buffer);
+
+	switch (type) {
+	case PIPE_CONTROL:
+	case PIPE_BULK:
+		start_atl_transfers(isp1362_hcd);
+		break;
+	case PIPE_INTERRUPT:
+		start_intl_transfers(isp1362_hcd);
+		break;
+	case PIPE_ISOCHRONOUS:
+		start_iso_transfers(isp1362_hcd);
+		break;
+	default:
+		BUG();
+	}
+ fail:
+	if (retval)
+		usb_hcd_unlink_urb_from_ep(hcd, urb);
+
+
+ fail_not_linked:
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	if (retval)
+		DBG(0, "%s: urb %p failed with %d\n", __func__, urb, retval);
+	return retval;
+}
+
+static int isp1362_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	struct usb_host_endpoint *hep;
+	unsigned long flags;
+	struct isp1362_ep *ep;
+	int retval = 0;
+
+	DBG(3, "%s: urb %p\n", __func__, urb);
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	retval = usb_hcd_check_unlink_urb(hcd, urb, status);
+	if (retval)
+		goto done;
+
+	hep = urb->hcpriv;
+
+	if (!hep) {
+		spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+		return -EIDRM;
+	}
+
+	ep = hep->hcpriv;
+	if (ep) {
+		/* In front of queue? */
+		if (ep->hep->urb_list.next == &urb->urb_list) {
+			if (!list_empty(&ep->active)) {
+				DBG(1, "%s: urb %p ep %p req %d active PTD[%d] $%04x\n", __func__,
+				    urb, ep, ep->num_req, ep->ptd_index, ep->ptd_offset);
+				/* disable processing and queue PTD for removal */
+				remove_ptd(isp1362_hcd, ep);
+				urb = NULL;
+			}
+		}
+		if (urb) {
+			DBG(1, "%s: Finishing ep %p req %d\n", __func__, ep,
+			    ep->num_req);
+			finish_request(isp1362_hcd, ep, urb, status);
+		} else
+			DBG(1, "%s: urb %p active; wait4irq\n", __func__, urb);
+	} else {
+		pr_warning("%s: No EP in URB %p\n", __func__, urb);
+		retval = -EINVAL;
+	}
+done:
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	DBG(3, "%s: exit\n", __func__);
+
+	return retval;
+}
+
+static void isp1362_endpoint_disable(struct usb_hcd *hcd, struct usb_host_endpoint *hep)
+{
+	struct isp1362_ep *ep = hep->hcpriv;
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	unsigned long flags;
+
+	DBG(1, "%s: ep %p\n", __func__, ep);
+	if (!ep)
+		return;
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	if (!list_empty(&hep->urb_list)) {
+		if (!list_empty(&ep->active) && list_empty(&ep->remove_list)) {
+			DBG(1, "%s: Removing ep %p req %d PTD[%d] $%04x\n", __func__,
+			    ep, ep->num_req, ep->ptd_index, ep->ptd_offset);
+			remove_ptd(isp1362_hcd, ep);
+			pr_info("%s: Waiting for Interrupt to clean up\n", __func__);
+		}
+	}
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	/* Wait for interrupt to clear out active list */
+	while (!list_empty(&ep->active))
+		msleep(1);
+
+	DBG(1, "%s: Freeing EP %p\n", __func__, ep);
+
+	usb_put_dev(ep->udev);
+	kfree(ep);
+	hep->hcpriv = NULL;
+}
+
+static int isp1362_get_frame(struct usb_hcd *hcd)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	u32 fmnum;
+	unsigned long flags;
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	fmnum = isp1362_read_reg32(isp1362_hcd, HCFMNUM);
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	return (int)fmnum;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* Adapted from ohci-hub.c */
+static int isp1362_hub_status_data(struct usb_hcd *hcd, char *buf)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	int ports, i, changed = 0;
+	unsigned long flags;
+
+	if (!HC_IS_RUNNING(hcd->state))
+		return -ESHUTDOWN;
+
+	/* Report no status change now, if we are scheduled to be
+	   called later */
+	if (timer_pending(&hcd->rh_timer))
+		return 0;
+
+	ports = isp1362_hcd->rhdesca & RH_A_NDP;
+	BUG_ON(ports > 2);
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	/* init status */
+	if (isp1362_hcd->rhstatus & (RH_HS_LPSC | RH_HS_OCIC))
+		buf[0] = changed = 1;
+	else
+		buf[0] = 0;
+
+	for (i = 0; i < ports; i++) {
+		u32 status = isp1362_hcd->rhport[i];
+
+		if (status & (RH_PS_CSC | RH_PS_PESC | RH_PS_PSSC |
+			      RH_PS_OCIC | RH_PS_PRSC)) {
+			changed = 1;
+			buf[0] |= 1 << (i + 1);
+			continue;
+		}
+
+		if (!(status & RH_PS_CCS))
+			continue;
+	}
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	return changed;
+}
+
+static void isp1362_hub_descriptor(struct isp1362_hcd *isp1362_hcd,
+				   struct usb_hub_descriptor *desc)
+{
+	u32 reg = isp1362_hcd->rhdesca;
+
+	DBG(3, "%s: enter\n", __func__);
+
+	desc->bDescriptorType = 0x29;
+	desc->bDescLength = 9;
+	desc->bHubContrCurrent = 0;
+	desc->bNbrPorts = reg & 0x3;
+	/* Power switching, device type, overcurrent. */
+	desc->wHubCharacteristics = cpu_to_le16((reg >> 8) & 0x1f);
+	DBG(0, "%s: hubcharacteristics = %02x\n", __func__, cpu_to_le16((reg >> 8) & 0x1f));
+	desc->bPwrOn2PwrGood = (reg >> 24) & 0xff;
+	/* two bitmaps:  ports removable, and legacy PortPwrCtrlMask */
+	desc->bitmap[0] = desc->bNbrPorts == 1 ? 1 << 1 : 3 << 1;
+	desc->bitmap[1] = ~0;
+
+	DBG(3, "%s: exit\n", __func__);
+}
+
+/* Adapted from ohci-hub.c */
+static int isp1362_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
+			       u16 wIndex, char *buf, u16 wLength)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	int retval = 0;
+	unsigned long flags;
+	unsigned long t1;
+	int ports = isp1362_hcd->rhdesca & RH_A_NDP;
+	u32 tmp = 0;
+
+	switch (typeReq) {
+	case ClearHubFeature:
+		DBG(0, "ClearHubFeature: ");
+		switch (wValue) {
+		case C_HUB_OVER_CURRENT:
+			_DBG(0, "C_HUB_OVER_CURRENT\n");
+			spin_lock_irqsave(&isp1362_hcd->lock, flags);
+			isp1362_write_reg32(isp1362_hcd, HCRHSTATUS, RH_HS_OCIC);
+			spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+		case C_HUB_LOCAL_POWER:
+			_DBG(0, "C_HUB_LOCAL_POWER\n");
+			break;
+		default:
+			goto error;
+		}
+		break;
+	case SetHubFeature:
+		DBG(0, "SetHubFeature: ");
+		switch (wValue) {
+		case C_HUB_OVER_CURRENT:
+		case C_HUB_LOCAL_POWER:
+			_DBG(0, "C_HUB_OVER_CURRENT or C_HUB_LOCAL_POWER\n");
+			break;
+		default:
+			goto error;
+		}
+		break;
+	case GetHubDescriptor:
+		DBG(0, "GetHubDescriptor\n");
+		isp1362_hub_descriptor(isp1362_hcd, (struct usb_hub_descriptor *)buf);
+		break;
+	case GetHubStatus:
+		DBG(0, "GetHubStatus\n");
+		put_unaligned(cpu_to_le32(0), (__le32 *) buf);
+		break;
+	case GetPortStatus:
+#ifndef VERBOSE
+		DBG(0, "GetPortStatus\n");
+#endif
+		if (!wIndex || wIndex > ports)
+			goto error;
+		tmp = isp1362_hcd->rhport[--wIndex];
+		put_unaligned(cpu_to_le32(tmp), (__le32 *) buf);
+		break;
+	case ClearPortFeature:
+		DBG(0, "ClearPortFeature: ");
+		if (!wIndex || wIndex > ports)
+			goto error;
+		wIndex--;
+
+		switch (wValue) {
+		case USB_PORT_FEAT_ENABLE:
+			_DBG(0, "USB_PORT_FEAT_ENABLE\n");
+			tmp = RH_PS_CCS;
+			break;
+		case USB_PORT_FEAT_C_ENABLE:
+			_DBG(0, "USB_PORT_FEAT_C_ENABLE\n");
+			tmp = RH_PS_PESC;
+			break;
+		case USB_PORT_FEAT_SUSPEND:
+			_DBG(0, "USB_PORT_FEAT_SUSPEND\n");
+			tmp = RH_PS_POCI;
+			break;
+		case USB_PORT_FEAT_C_SUSPEND:
+			_DBG(0, "USB_PORT_FEAT_C_SUSPEND\n");
+			tmp = RH_PS_PSSC;
+			break;
+		case USB_PORT_FEAT_POWER:
+			_DBG(0, "USB_PORT_FEAT_POWER\n");
+			tmp = RH_PS_LSDA;
+
+			break;
+		case USB_PORT_FEAT_C_CONNECTION:
+			_DBG(0, "USB_PORT_FEAT_C_CONNECTION\n");
+			tmp = RH_PS_CSC;
+			break;
+		case USB_PORT_FEAT_C_OVER_CURRENT:
+			_DBG(0, "USB_PORT_FEAT_C_OVER_CURRENT\n");
+			tmp = RH_PS_OCIC;
+			break;
+		case USB_PORT_FEAT_C_RESET:
+			_DBG(0, "USB_PORT_FEAT_C_RESET\n");
+			tmp = RH_PS_PRSC;
+			break;
+		default:
+			goto error;
+		}
+
+		spin_lock_irqsave(&isp1362_hcd->lock, flags);
+		isp1362_write_reg32(isp1362_hcd, HCRHPORT1 + wIndex, tmp);
+		isp1362_hcd->rhport[wIndex] =
+			isp1362_read_reg32(isp1362_hcd, HCRHPORT1 + wIndex);
+		spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+		break;
+	case SetPortFeature:
+		DBG(0, "SetPortFeature: ");
+		if (!wIndex || wIndex > ports)
+			goto error;
+		wIndex--;
+		switch (wValue) {
+		case USB_PORT_FEAT_SUSPEND:
+			_DBG(0, "USB_PORT_FEAT_SUSPEND\n");
+#ifdef	CONFIG_USB_OTG
+			if (ohci->hcd.self.otg_port == (wIndex + 1) &&
+			    ohci->hcd.self.b_hnp_enable) {
+				start_hnp(ohci);
+				break;
+			}
+#endif
+			spin_lock_irqsave(&isp1362_hcd->lock, flags);
+			isp1362_write_reg32(isp1362_hcd, HCRHPORT1 + wIndex, RH_PS_PSS);
+			isp1362_hcd->rhport[wIndex] =
+				isp1362_read_reg32(isp1362_hcd, HCRHPORT1 + wIndex);
+			spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+			break;
+		case USB_PORT_FEAT_POWER:
+			_DBG(0, "USB_PORT_FEAT_POWER\n");
+			spin_lock_irqsave(&isp1362_hcd->lock, flags);
+			isp1362_write_reg32(isp1362_hcd, HCRHPORT1 + wIndex, RH_PS_PPS);
+			isp1362_hcd->rhport[wIndex] =
+				isp1362_read_reg32(isp1362_hcd, HCRHPORT1 + wIndex);
+			spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+			break;
+		case USB_PORT_FEAT_RESET:
+			_DBG(0, "USB_PORT_FEAT_RESET\n");
+			spin_lock_irqsave(&isp1362_hcd->lock, flags);
+
+			t1 = jiffies + msecs_to_jiffies(USB_RESET_WIDTH);
+			while (time_before(jiffies, t1)) {
+				/* spin until any current reset finishes */
+				for (;;) {
+					tmp = isp1362_read_reg32(isp1362_hcd, HCRHPORT1 + wIndex);
+					if (!(tmp & RH_PS_PRS))
+						break;
+					udelay(500);
+				}
+				if (!(tmp & RH_PS_CCS))
+					break;
+				/* Reset lasts 10ms (claims datasheet) */
+				isp1362_write_reg32(isp1362_hcd, HCRHPORT1 + wIndex, (RH_PS_PRS));
+
+				spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+				msleep(10);
+				spin_lock_irqsave(&isp1362_hcd->lock, flags);
+			}
+
+			isp1362_hcd->rhport[wIndex] = isp1362_read_reg32(isp1362_hcd,
+									 HCRHPORT1 + wIndex);
+			spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+			break;
+		default:
+			goto error;
+		}
+		break;
+
+	default:
+ error:
+		/* "protocol stall" on error */
+		_DBG(0, "PROTOCOL STALL\n");
+		retval = -EPIPE;
+	}
+
+	return retval;
+}
+
+#ifdef	CONFIG_PM
+static int isp1362_bus_suspend(struct usb_hcd *hcd)
+{
+	int status = 0;
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	unsigned long flags;
+
+	if (time_before(jiffies, isp1362_hcd->next_statechange))
+		msleep(5);
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+
+	isp1362_hcd->hc_control = isp1362_read_reg32(isp1362_hcd, HCCONTROL);
+	switch (isp1362_hcd->hc_control & OHCI_CTRL_HCFS) {
+	case OHCI_USB_RESUME:
+		DBG(0, "%s: resume/suspend?\n", __func__);
+		isp1362_hcd->hc_control &= ~OHCI_CTRL_HCFS;
+		isp1362_hcd->hc_control |= OHCI_USB_RESET;
+		isp1362_write_reg32(isp1362_hcd, HCCONTROL, isp1362_hcd->hc_control);
+		/* FALL THROUGH */
+	case OHCI_USB_RESET:
+		status = -EBUSY;
+		pr_warning("%s: needs reinit!\n", __func__);
+		goto done;
+	case OHCI_USB_SUSPEND:
+		pr_warning("%s: already suspended?\n", __func__);
+		goto done;
+	}
+	DBG(0, "%s: suspend root hub\n", __func__);
+
+	/* First stop any processing */
+	hcd->state = HC_STATE_QUIESCING;
+	if (!list_empty(&isp1362_hcd->atl_queue.active) ||
+	    !list_empty(&isp1362_hcd->intl_queue.active) ||
+	    !list_empty(&isp1362_hcd->istl_queue[0] .active) ||
+	    !list_empty(&isp1362_hcd->istl_queue[1] .active)) {
+		int limit;
+
+		isp1362_write_reg32(isp1362_hcd, HCATLSKIP, ~0);
+		isp1362_write_reg32(isp1362_hcd, HCINTLSKIP, ~0);
+		isp1362_write_reg16(isp1362_hcd, HCBUFSTAT, 0);
+		isp1362_write_reg16(isp1362_hcd, HCuPINTENB, 0);
+		isp1362_write_reg32(isp1362_hcd, HCINTSTAT, OHCI_INTR_SF);
+
+		DBG(0, "%s: stopping schedules ...\n", __func__);
+		limit = 2000;
+		while (limit > 0) {
+			udelay(250);
+			limit -= 250;
+			if (isp1362_read_reg32(isp1362_hcd, HCINTSTAT) & OHCI_INTR_SF)
+				break;
+		}
+		mdelay(7);
+		if (isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_ATL) {
+			u32 done_map = isp1362_read_reg32(isp1362_hcd, HCATLDONE);
+			finish_transfers(isp1362_hcd, done_map, &isp1362_hcd->atl_queue);
+		}
+		if (isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_INTL) {
+			u32 done_map = isp1362_read_reg32(isp1362_hcd, HCINTLDONE);
+			finish_transfers(isp1362_hcd, done_map, &isp1362_hcd->intl_queue);
+		}
+		if (isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_ISTL0)
+			finish_iso_transfers(isp1362_hcd, &isp1362_hcd->istl_queue[0]);
+		if (isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_ISTL1)
+			finish_iso_transfers(isp1362_hcd, &isp1362_hcd->istl_queue[1]);
+	}
+	DBG(0, "%s: HCINTSTAT: %08x\n", __func__,
+		    isp1362_read_reg32(isp1362_hcd, HCINTSTAT));
+	isp1362_write_reg32(isp1362_hcd, HCINTSTAT,
+			    isp1362_read_reg32(isp1362_hcd, HCINTSTAT));
+
+	/* Suspend hub */
+	isp1362_hcd->hc_control = OHCI_USB_SUSPEND;
+	isp1362_show_reg(isp1362_hcd, HCCONTROL);
+	isp1362_write_reg32(isp1362_hcd, HCCONTROL, isp1362_hcd->hc_control);
+	isp1362_show_reg(isp1362_hcd, HCCONTROL);
+
+#if 1
+	isp1362_hcd->hc_control = isp1362_read_reg32(isp1362_hcd, HCCONTROL);
+	if ((isp1362_hcd->hc_control & OHCI_CTRL_HCFS) != OHCI_USB_SUSPEND) {
+		pr_err("%s: controller won't suspend %08x\n", __func__,
+		    isp1362_hcd->hc_control);
+		status = -EBUSY;
+	} else
+#endif
+	{
+		/* no resumes until devices finish suspending */
+		isp1362_hcd->next_statechange = jiffies + msecs_to_jiffies(5);
+	}
+done:
+	if (status == 0) {
+		hcd->state = HC_STATE_SUSPENDED;
+		DBG(0, "%s: HCD suspended: %08x\n", __func__,
+		    isp1362_read_reg32(isp1362_hcd, HCCONTROL));
+	}
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	return status;
+}
+
+static int isp1362_bus_resume(struct usb_hcd *hcd)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	u32 port;
+	unsigned long flags;
+	int status = -EINPROGRESS;
+
+	if (time_before(jiffies, isp1362_hcd->next_statechange))
+		msleep(5);
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	isp1362_hcd->hc_control = isp1362_read_reg32(isp1362_hcd, HCCONTROL);
+	pr_info("%s: HCCONTROL: %08x\n", __func__, isp1362_hcd->hc_control);
+	if (hcd->state == HC_STATE_RESUMING) {
+		pr_warning("%s: duplicate resume\n", __func__);
+		status = 0;
+	} else
+		switch (isp1362_hcd->hc_control & OHCI_CTRL_HCFS) {
+		case OHCI_USB_SUSPEND:
+			DBG(0, "%s: resume root hub\n", __func__);
+			isp1362_hcd->hc_control &= ~OHCI_CTRL_HCFS;
+			isp1362_hcd->hc_control |= OHCI_USB_RESUME;
+			isp1362_write_reg32(isp1362_hcd, HCCONTROL, isp1362_hcd->hc_control);
+			break;
+		case OHCI_USB_RESUME:
+			/* HCFS changes sometime after INTR_RD */
+			DBG(0, "%s: remote wakeup\n", __func__);
+			break;
+		case OHCI_USB_OPER:
+			DBG(0, "%s: odd resume\n", __func__);
+			status = 0;
+			hcd->self.root_hub->dev.power.power_state = PMSG_ON;
+			break;
+		default:		/* RESET, we lost power */
+			DBG(0, "%s: root hub hardware reset\n", __func__);
+			status = -EBUSY;
+		}
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	if (status == -EBUSY) {
+		DBG(0, "%s: Restarting HC\n", __func__);
+		isp1362_hc_stop(hcd);
+		return isp1362_hc_start(hcd);
+	}
+	if (status != -EINPROGRESS)
+		return status;
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	port = isp1362_read_reg32(isp1362_hcd, HCRHDESCA) & RH_A_NDP;
+	while (port--) {
+		u32 stat = isp1362_read_reg32(isp1362_hcd, HCRHPORT1 + port);
+
+		/* force global, not selective, resume */
+		if (!(stat & RH_PS_PSS)) {
+			DBG(0, "%s: Not Resuming RH port %d\n", __func__, port);
+			continue;
+		}
+		DBG(0, "%s: Resuming RH port %d\n", __func__, port);
+		isp1362_write_reg32(isp1362_hcd, HCRHPORT1 + port, RH_PS_POCI);
+	}
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	/* Some controllers (lucent) need extra-long delays */
+	hcd->state = HC_STATE_RESUMING;
+	mdelay(20 /* usb 11.5.1.10 */ + 15);
+
+	isp1362_hcd->hc_control = OHCI_USB_OPER;
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	isp1362_show_reg(isp1362_hcd, HCCONTROL);
+	isp1362_write_reg32(isp1362_hcd, HCCONTROL, isp1362_hcd->hc_control);
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	/* TRSMRCY */
+	msleep(10);
+
+	/* keep it alive for ~5x suspend + resume costs */
+	isp1362_hcd->next_statechange = jiffies + msecs_to_jiffies(250);
+
+	hcd->self.root_hub->dev.power.power_state = PMSG_ON;
+	hcd->state = HC_STATE_RUNNING;
+	return 0;
+}
+#else
+#define	isp1362_bus_suspend	NULL
+#define	isp1362_bus_resume	NULL
+#endif
+
+/*-------------------------------------------------------------------------*/
+
+#ifdef STUB_DEBUG_FILE
+
+static inline void create_debug_file(struct isp1362_hcd *isp1362_hcd)
+{
+}
+static inline void remove_debug_file(struct isp1362_hcd *isp1362_hcd)
+{
+}
+
+#else
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static void dump_irq(struct seq_file *s, char *label, u16 mask)
+{
+	seq_printf(s, "%-15s %04x%s%s%s%s%s%s\n", label, mask,
+		   mask & HCuPINT_CLKRDY ? " clkrdy" : "",
+		   mask & HCuPINT_SUSP ? " susp" : "",
+		   mask & HCuPINT_OPR ? " opr" : "",
+		   mask & HCuPINT_EOT ? " eot" : "",
+		   mask & HCuPINT_ATL ? " atl" : "",
+		   mask & HCuPINT_SOF ? " sof" : "");
+}
+
+static void dump_int(struct seq_file *s, char *label, u32 mask)
+{
+	seq_printf(s, "%-15s %08x%s%s%s%s%s%s%s\n", label, mask,
+		   mask & OHCI_INTR_MIE ? " MIE" : "",
+		   mask & OHCI_INTR_RHSC ? " rhsc" : "",
+		   mask & OHCI_INTR_FNO ? " fno" : "",
+		   mask & OHCI_INTR_UE ? " ue" : "",
+		   mask & OHCI_INTR_RD ? " rd" : "",
+		   mask & OHCI_INTR_SF ? " sof" : "",
+		   mask & OHCI_INTR_SO ? " so" : "");
+}
+
+static void dump_ctrl(struct seq_file *s, char *label, u32 mask)
+{
+	seq_printf(s, "%-15s %08x%s%s%s\n", label, mask,
+		   mask & OHCI_CTRL_RWC ? " rwc" : "",
+		   mask & OHCI_CTRL_RWE ? " rwe" : "",
+		   ({
+			   char *hcfs;
+			   switch (mask & OHCI_CTRL_HCFS) {
+			   case OHCI_USB_OPER:
+				   hcfs = " oper";
+				   break;
+			   case OHCI_USB_RESET:
+				   hcfs = " reset";
+				   break;
+			   case OHCI_USB_RESUME:
+				   hcfs = " resume";
+				   break;
+			   case OHCI_USB_SUSPEND:
+				   hcfs = " suspend";
+				   break;
+			   default:
+				   hcfs = " ?";
+			   }
+			   hcfs;
+		   }));
+}
+
+static void dump_regs(struct seq_file *s, struct isp1362_hcd *isp1362_hcd)
+{
+	seq_printf(s, "HCREVISION [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCREVISION),
+		   isp1362_read_reg32(isp1362_hcd, HCREVISION));
+	seq_printf(s, "HCCONTROL  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCCONTROL),
+		   isp1362_read_reg32(isp1362_hcd, HCCONTROL));
+	seq_printf(s, "HCCMDSTAT  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCCMDSTAT),
+		   isp1362_read_reg32(isp1362_hcd, HCCMDSTAT));
+	seq_printf(s, "HCINTSTAT  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCINTSTAT),
+		   isp1362_read_reg32(isp1362_hcd, HCINTSTAT));
+	seq_printf(s, "HCINTENB   [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCINTENB),
+		   isp1362_read_reg32(isp1362_hcd, HCINTENB));
+	seq_printf(s, "HCFMINTVL  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCFMINTVL),
+		   isp1362_read_reg32(isp1362_hcd, HCFMINTVL));
+	seq_printf(s, "HCFMREM    [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCFMREM),
+		   isp1362_read_reg32(isp1362_hcd, HCFMREM));
+	seq_printf(s, "HCFMNUM    [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCFMNUM),
+		   isp1362_read_reg32(isp1362_hcd, HCFMNUM));
+	seq_printf(s, "HCLSTHRESH [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCLSTHRESH),
+		   isp1362_read_reg32(isp1362_hcd, HCLSTHRESH));
+	seq_printf(s, "HCRHDESCA  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCRHDESCA),
+		   isp1362_read_reg32(isp1362_hcd, HCRHDESCA));
+	seq_printf(s, "HCRHDESCB  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCRHDESCB),
+		   isp1362_read_reg32(isp1362_hcd, HCRHDESCB));
+	seq_printf(s, "HCRHSTATUS [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCRHSTATUS),
+		   isp1362_read_reg32(isp1362_hcd, HCRHSTATUS));
+	seq_printf(s, "HCRHPORT1  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCRHPORT1),
+		   isp1362_read_reg32(isp1362_hcd, HCRHPORT1));
+	seq_printf(s, "HCRHPORT2  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCRHPORT2),
+		   isp1362_read_reg32(isp1362_hcd, HCRHPORT2));
+	seq_printf(s, "\n");
+	seq_printf(s, "HCHWCFG    [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCHWCFG),
+		   isp1362_read_reg16(isp1362_hcd, HCHWCFG));
+	seq_printf(s, "HCDMACFG   [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCDMACFG),
+		   isp1362_read_reg16(isp1362_hcd, HCDMACFG));
+	seq_printf(s, "HCXFERCTR  [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCXFERCTR),
+		   isp1362_read_reg16(isp1362_hcd, HCXFERCTR));
+	seq_printf(s, "HCuPINT    [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCuPINT),
+		   isp1362_read_reg16(isp1362_hcd, HCuPINT));
+	seq_printf(s, "HCuPINTENB [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCuPINTENB),
+		   isp1362_read_reg16(isp1362_hcd, HCuPINTENB));
+	seq_printf(s, "HCCHIPID   [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCCHIPID),
+		   isp1362_read_reg16(isp1362_hcd, HCCHIPID));
+	seq_printf(s, "HCSCRATCH  [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCSCRATCH),
+		   isp1362_read_reg16(isp1362_hcd, HCSCRATCH));
+	seq_printf(s, "HCBUFSTAT  [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCBUFSTAT),
+		   isp1362_read_reg16(isp1362_hcd, HCBUFSTAT));
+	seq_printf(s, "HCDIRADDR  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCDIRADDR),
+		   isp1362_read_reg32(isp1362_hcd, HCDIRADDR));
+#if 0
+	seq_printf(s, "HCDIRDATA  [%02x]     %04x\n", ISP1362_REG_NO(HCDIRDATA),
+		   isp1362_read_reg16(isp1362_hcd, HCDIRDATA));
+#endif
+	seq_printf(s, "HCISTLBUFSZ[%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCISTLBUFSZ),
+		   isp1362_read_reg16(isp1362_hcd, HCISTLBUFSZ));
+	seq_printf(s, "HCISTLRATE [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCISTLRATE),
+		   isp1362_read_reg16(isp1362_hcd, HCISTLRATE));
+	seq_printf(s, "\n");
+	seq_printf(s, "HCINTLBUFSZ[%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCINTLBUFSZ),
+		   isp1362_read_reg16(isp1362_hcd, HCINTLBUFSZ));
+	seq_printf(s, "HCINTLBLKSZ[%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCINTLBLKSZ),
+		   isp1362_read_reg16(isp1362_hcd, HCINTLBLKSZ));
+	seq_printf(s, "HCINTLDONE [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCINTLDONE),
+		   isp1362_read_reg32(isp1362_hcd, HCINTLDONE));
+	seq_printf(s, "HCINTLSKIP [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCINTLSKIP),
+		   isp1362_read_reg32(isp1362_hcd, HCINTLSKIP));
+	seq_printf(s, "HCINTLLAST [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCINTLLAST),
+		   isp1362_read_reg32(isp1362_hcd, HCINTLLAST));
+	seq_printf(s, "HCINTLCURR [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCINTLCURR),
+		   isp1362_read_reg16(isp1362_hcd, HCINTLCURR));
+	seq_printf(s, "\n");
+	seq_printf(s, "HCATLBUFSZ [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCATLBUFSZ),
+		   isp1362_read_reg16(isp1362_hcd, HCATLBUFSZ));
+	seq_printf(s, "HCATLBLKSZ [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCATLBLKSZ),
+		   isp1362_read_reg16(isp1362_hcd, HCATLBLKSZ));
+#if 0
+	seq_printf(s, "HCATLDONE  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCATLDONE),
+		   isp1362_read_reg32(isp1362_hcd, HCATLDONE));
+#endif
+	seq_printf(s, "HCATLSKIP  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCATLSKIP),
+		   isp1362_read_reg32(isp1362_hcd, HCATLSKIP));
+	seq_printf(s, "HCATLLAST  [%02x] %08x\n", ISP1362_REG_NO(ISP1362_REG_HCATLLAST),
+		   isp1362_read_reg32(isp1362_hcd, HCATLLAST));
+	seq_printf(s, "HCATLCURR  [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCATLCURR),
+		   isp1362_read_reg16(isp1362_hcd, HCATLCURR));
+	seq_printf(s, "\n");
+	seq_printf(s, "HCATLDTC   [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCATLDTC),
+		   isp1362_read_reg16(isp1362_hcd, HCATLDTC));
+	seq_printf(s, "HCATLDTCTO [%02x]     %04x\n", ISP1362_REG_NO(ISP1362_REG_HCATLDTCTO),
+		   isp1362_read_reg16(isp1362_hcd, HCATLDTCTO));
+}
+
+static int proc_isp1362_show(struct seq_file *s, void *unused)
+{
+	struct isp1362_hcd *isp1362_hcd = s->private;
+	struct isp1362_ep *ep;
+	int i;
+
+	seq_printf(s, "%s\n%s version %s\n",
+		   isp1362_hcd_to_hcd(isp1362_hcd)->product_desc, hcd_name, DRIVER_VERSION);
+
+	/* collect statistics to help estimate potential win for
+	 * DMA engines that care about alignment (PXA)
+	 */
+	seq_printf(s, "alignment:  16b/%ld 8b/%ld 4b/%ld 2b/%ld 1b/%ld\n",
+		   isp1362_hcd->stat16, isp1362_hcd->stat8, isp1362_hcd->stat4,
+		   isp1362_hcd->stat2, isp1362_hcd->stat1);
+	seq_printf(s, "max # ptds in ATL  fifo: %d\n", isp1362_hcd->atl_queue.stat_maxptds);
+	seq_printf(s, "max # ptds in INTL fifo: %d\n", isp1362_hcd->intl_queue.stat_maxptds);
+	seq_printf(s, "max # ptds in ISTL fifo: %d\n",
+		   max(isp1362_hcd->istl_queue[0] .stat_maxptds,
+		       isp1362_hcd->istl_queue[1] .stat_maxptds));
+
+	/* FIXME: don't show the following in suspended state */
+	spin_lock_irq(&isp1362_hcd->lock);
+
+	dump_irq(s, "hc_irq_enable", isp1362_read_reg16(isp1362_hcd, HCuPINTENB));
+	dump_irq(s, "hc_irq_status", isp1362_read_reg16(isp1362_hcd, HCuPINT));
+	dump_int(s, "ohci_int_enable", isp1362_read_reg32(isp1362_hcd, HCINTENB));
+	dump_int(s, "ohci_int_status", isp1362_read_reg32(isp1362_hcd, HCINTSTAT));
+	dump_ctrl(s, "ohci_control", isp1362_read_reg32(isp1362_hcd, HCCONTROL));
+
+	for (i = 0; i < NUM_ISP1362_IRQS; i++)
+		if (isp1362_hcd->irq_stat[i])
+			seq_printf(s, "%-15s: %d\n",
+				   ISP1362_INT_NAME(i), isp1362_hcd->irq_stat[i]);
+
+	dump_regs(s, isp1362_hcd);
+	list_for_each_entry(ep, &isp1362_hcd->async, schedule) {
+		struct urb *urb;
+
+		seq_printf(s, "%p, ep%d%s, maxpacket %d:\n", ep, ep->epnum,
+			   ({
+				   char *s;
+				   switch (ep->nextpid) {
+				   case USB_PID_IN:
+					   s = "in";
+					   break;
+				   case USB_PID_OUT:
+					   s = "out";
+					   break;
+				   case USB_PID_SETUP:
+					   s = "setup";
+					   break;
+				   case USB_PID_ACK:
+					   s = "status";
+					   break;
+				   default:
+					   s = "?";
+					   break;
+				   };
+				   s;}), ep->maxpacket) ;
+		list_for_each_entry(urb, &ep->hep->urb_list, urb_list) {
+			seq_printf(s, "  urb%p, %d/%d\n", urb,
+				   urb->actual_length,
+				   urb->transfer_buffer_length);
+		}
+	}
+	if (!list_empty(&isp1362_hcd->async))
+		seq_printf(s, "\n");
+	dump_ptd_queue(&isp1362_hcd->atl_queue);
+
+	seq_printf(s, "periodic size= %d\n", PERIODIC_SIZE);
+
+	list_for_each_entry(ep, &isp1362_hcd->periodic, schedule) {
+		seq_printf(s, "branch:%2d load:%3d PTD[%d] $%04x:\n", ep->branch,
+			   isp1362_hcd->load[ep->branch], ep->ptd_index, ep->ptd_offset);
+
+		seq_printf(s, "   %d/%p (%sdev%d ep%d%s max %d)\n",
+			   ep->interval, ep,
+			   (ep->udev->speed == USB_SPEED_FULL) ? "" : "ls ",
+			   ep->udev->devnum, ep->epnum,
+			   (ep->epnum == 0) ? "" :
+			   ((ep->nextpid == USB_PID_IN) ?
+			    "in" : "out"), ep->maxpacket);
+	}
+	dump_ptd_queue(&isp1362_hcd->intl_queue);
+
+	seq_printf(s, "ISO:\n");
+
+	list_for_each_entry(ep, &isp1362_hcd->isoc, schedule) {
+		seq_printf(s, "   %d/%p (%sdev%d ep%d%s max %d)\n",
+			   ep->interval, ep,
+			   (ep->udev->speed == USB_SPEED_FULL) ? "" : "ls ",
+			   ep->udev->devnum, ep->epnum,
+			   (ep->epnum == 0) ? "" :
+			   ((ep->nextpid == USB_PID_IN) ?
+			    "in" : "out"), ep->maxpacket);
+	}
+
+	spin_unlock_irq(&isp1362_hcd->lock);
+	seq_printf(s, "\n");
+
+	return 0;
+}
+
+static int proc_isp1362_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_isp1362_show, PDE(inode)->data);
+}
+
+static const struct file_operations proc_ops = {
+	.open = proc_isp1362_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/* expect just one isp1362_hcd per system */
+static const char proc_filename[] = "driver/isp1362";
+
+static void create_debug_file(struct isp1362_hcd *isp1362_hcd)
+{
+	struct proc_dir_entry *pde;
+
+	pde = create_proc_entry(proc_filename, 0, NULL);
+	if (pde == NULL) {
+		pr_warning("%s: Failed to create debug file '%s'\n", __func__, proc_filename);
+		return;
+	}
+
+	pde->proc_fops = &proc_ops;
+	pde->data = isp1362_hcd;
+	isp1362_hcd->pde = pde;
+}
+
+static void remove_debug_file(struct isp1362_hcd *isp1362_hcd)
+{
+	if (isp1362_hcd->pde)
+		remove_proc_entry(proc_filename, 0);
+}
+
+#endif
+
+/*-------------------------------------------------------------------------*/
+
+static void isp1362_sw_reset(struct isp1362_hcd *isp1362_hcd)
+{
+	int tmp = 20;
+	unsigned long flags;
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+
+	isp1362_write_reg16(isp1362_hcd, HCSWRES, HCSWRES_MAGIC);
+	isp1362_write_reg32(isp1362_hcd, HCCMDSTAT, OHCI_HCR);
+	while (--tmp) {
+		mdelay(1);
+		if (!(isp1362_read_reg32(isp1362_hcd, HCCMDSTAT) & OHCI_HCR))
+			break;
+	}
+	if (!tmp)
+		pr_err("Software reset timeout\n");
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+}
+
+static int isp1362_mem_config(struct usb_hcd *hcd)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	unsigned long flags;
+	u32 total;
+	u16 istl_size = ISP1362_ISTL_BUFSIZE;
+	u16 intl_blksize = ISP1362_INTL_BLKSIZE + PTD_HEADER_SIZE;
+	u16 intl_size = ISP1362_INTL_BUFFERS * intl_blksize;
+	u16 atl_blksize = ISP1362_ATL_BLKSIZE + PTD_HEADER_SIZE;
+	u16 atl_buffers = (ISP1362_BUF_SIZE - (istl_size + intl_size)) / atl_blksize;
+	u16 atl_size;
+	int i;
+
+	WARN_ON(istl_size & 3);
+	WARN_ON(atl_blksize & 3);
+	WARN_ON(intl_blksize & 3);
+	WARN_ON(atl_blksize < PTD_HEADER_SIZE);
+	WARN_ON(intl_blksize < PTD_HEADER_SIZE);
+
+	BUG_ON((unsigned)ISP1362_INTL_BUFFERS > 32);
+	if (atl_buffers > 32)
+		atl_buffers = 32;
+	atl_size = atl_buffers * atl_blksize;
+	total = atl_size + intl_size + istl_size;
+	dev_info(hcd->self.controller, "ISP1362 Memory usage:\n");
+	dev_info(hcd->self.controller, "  ISTL:    2 * %4d:     %4d @ $%04x:$%04x\n",
+		 istl_size / 2, istl_size, 0, istl_size / 2);
+	dev_info(hcd->self.controller, "  INTL: %4d * (%3u+8):  %4d @ $%04x\n",
+		 ISP1362_INTL_BUFFERS, intl_blksize - PTD_HEADER_SIZE,
+		 intl_size, istl_size);
+	dev_info(hcd->self.controller, "  ATL : %4d * (%3u+8):  %4d @ $%04x\n",
+		 atl_buffers, atl_blksize - PTD_HEADER_SIZE,
+		 atl_size, istl_size + intl_size);
+	dev_info(hcd->self.controller, "  USED/FREE:   %4d      %4d\n", total,
+		 ISP1362_BUF_SIZE - total);
+
+	if (total > ISP1362_BUF_SIZE) {
+		dev_err(hcd->self.controller, "%s: Memory requested: %d, available %d\n",
+			__func__, total, ISP1362_BUF_SIZE);
+		return -ENOMEM;
+	}
+
+	total = istl_size + intl_size + atl_size;
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+
+	for (i = 0; i < 2; i++) {
+		isp1362_hcd->istl_queue[i].buf_start = i * istl_size / 2,
+		isp1362_hcd->istl_queue[i].buf_size = istl_size / 2;
+		isp1362_hcd->istl_queue[i].blk_size = 4;
+		INIT_LIST_HEAD(&isp1362_hcd->istl_queue[i].active);
+		snprintf(isp1362_hcd->istl_queue[i].name,
+			 sizeof(isp1362_hcd->istl_queue[i].name), "ISTL%d", i);
+		DBG(3, "%s: %5s buf $%04x %d\n", __func__,
+		     isp1362_hcd->istl_queue[i].name,
+		     isp1362_hcd->istl_queue[i].buf_start,
+		     isp1362_hcd->istl_queue[i].buf_size);
+	}
+	isp1362_write_reg16(isp1362_hcd, HCISTLBUFSZ, istl_size / 2);
+
+	isp1362_hcd->intl_queue.buf_start = istl_size;
+	isp1362_hcd->intl_queue.buf_size = intl_size;
+	isp1362_hcd->intl_queue.buf_count = ISP1362_INTL_BUFFERS;
+	isp1362_hcd->intl_queue.blk_size = intl_blksize;
+	isp1362_hcd->intl_queue.buf_avail = isp1362_hcd->intl_queue.buf_count;
+	isp1362_hcd->intl_queue.skip_map = ~0;
+	INIT_LIST_HEAD(&isp1362_hcd->intl_queue.active);
+
+	isp1362_write_reg16(isp1362_hcd, HCINTLBUFSZ,
+			    isp1362_hcd->intl_queue.buf_size);
+	isp1362_write_reg16(isp1362_hcd, HCINTLBLKSZ,
+			    isp1362_hcd->intl_queue.blk_size - PTD_HEADER_SIZE);
+	isp1362_write_reg32(isp1362_hcd, HCINTLSKIP, ~0);
+	isp1362_write_reg32(isp1362_hcd, HCINTLLAST,
+			    1 << (ISP1362_INTL_BUFFERS - 1));
+
+	isp1362_hcd->atl_queue.buf_start = istl_size + intl_size;
+	isp1362_hcd->atl_queue.buf_size = atl_size;
+	isp1362_hcd->atl_queue.buf_count = atl_buffers;
+	isp1362_hcd->atl_queue.blk_size = atl_blksize;
+	isp1362_hcd->atl_queue.buf_avail = isp1362_hcd->atl_queue.buf_count;
+	isp1362_hcd->atl_queue.skip_map = ~0;
+	INIT_LIST_HEAD(&isp1362_hcd->atl_queue.active);
+
+	isp1362_write_reg16(isp1362_hcd, HCATLBUFSZ,
+			    isp1362_hcd->atl_queue.buf_size);
+	isp1362_write_reg16(isp1362_hcd, HCATLBLKSZ,
+			    isp1362_hcd->atl_queue.blk_size - PTD_HEADER_SIZE);
+	isp1362_write_reg32(isp1362_hcd, HCATLSKIP, ~0);
+	isp1362_write_reg32(isp1362_hcd, HCATLLAST,
+			    1 << (atl_buffers - 1));
+
+	snprintf(isp1362_hcd->atl_queue.name,
+		 sizeof(isp1362_hcd->atl_queue.name), "ATL");
+	snprintf(isp1362_hcd->intl_queue.name,
+		 sizeof(isp1362_hcd->intl_queue.name), "INTL");
+	DBG(3, "%s: %5s buf $%04x %2d * %4d = %4d\n", __func__,
+	     isp1362_hcd->intl_queue.name,
+	     isp1362_hcd->intl_queue.buf_start,
+	     ISP1362_INTL_BUFFERS, isp1362_hcd->intl_queue.blk_size,
+	     isp1362_hcd->intl_queue.buf_size);
+	DBG(3, "%s: %5s buf $%04x %2d * %4d = %4d\n", __func__,
+	     isp1362_hcd->atl_queue.name,
+	     isp1362_hcd->atl_queue.buf_start,
+	     atl_buffers, isp1362_hcd->atl_queue.blk_size,
+	     isp1362_hcd->atl_queue.buf_size);
+
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	return 0;
+}
+
+static int isp1362_hc_reset(struct usb_hcd *hcd)
+{
+	int ret = 0;
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	unsigned long t;
+	unsigned long timeout = 100;
+	unsigned long flags;
+	int clkrdy = 0;
+
+	pr_info("%s:\n", __func__);
+
+	if (isp1362_hcd->board && isp1362_hcd->board->reset) {
+		isp1362_hcd->board->reset(hcd->self.controller, 1);
+		msleep(20);
+		if (isp1362_hcd->board->clock)
+			isp1362_hcd->board->clock(hcd->self.controller, 1);
+		isp1362_hcd->board->reset(hcd->self.controller, 0);
+	} else
+		isp1362_sw_reset(isp1362_hcd);
+
+	/* chip has been reset. First we need to see a clock */
+	t = jiffies + msecs_to_jiffies(timeout);
+	while (!clkrdy && time_before_eq(jiffies, t)) {
+		spin_lock_irqsave(&isp1362_hcd->lock, flags);
+		clkrdy = isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_CLKRDY;
+		spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+		if (!clkrdy)
+			msleep(4);
+	}
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	isp1362_write_reg16(isp1362_hcd, HCuPINT, HCuPINT_CLKRDY);
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	if (!clkrdy) {
+		pr_err("Clock not ready after %lums\n", timeout);
+		ret = -ENODEV;
+	}
+	return ret;
+}
+
+static void isp1362_hc_stop(struct usb_hcd *hcd)
+{
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	unsigned long flags;
+	u32 tmp;
+
+	pr_info("%s:\n", __func__);
+
+	del_timer_sync(&hcd->rh_timer);
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+
+	isp1362_write_reg16(isp1362_hcd, HCuPINTENB, 0);
+
+	/* Switch off power for all ports */
+	tmp = isp1362_read_reg32(isp1362_hcd, HCRHDESCA);
+	tmp &= ~(RH_A_NPS | RH_A_PSM);
+	isp1362_write_reg32(isp1362_hcd, HCRHDESCA, tmp);
+	isp1362_write_reg32(isp1362_hcd, HCRHSTATUS, RH_HS_LPS);
+
+	/* Reset the chip */
+	if (isp1362_hcd->board && isp1362_hcd->board->reset)
+		isp1362_hcd->board->reset(hcd->self.controller, 1);
+	else
+		isp1362_sw_reset(isp1362_hcd);
+
+	if (isp1362_hcd->board && isp1362_hcd->board->clock)
+		isp1362_hcd->board->clock(hcd->self.controller, 0);
+
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+}
+
+#ifdef CHIP_BUFFER_TEST
+static int isp1362_chip_test(struct isp1362_hcd *isp1362_hcd)
+{
+	int ret = 0;
+	u16 *ref;
+	unsigned long flags;
+
+	ref = kmalloc(2 * ISP1362_BUF_SIZE, GFP_KERNEL);
+	if (ref) {
+		int offset;
+		u16 *tst = &ref[ISP1362_BUF_SIZE / 2];
+
+		for (offset = 0; offset < ISP1362_BUF_SIZE / 2; offset++) {
+			ref[offset] = ~offset;
+			tst[offset] = offset;
+		}
+
+		for (offset = 0; offset < 4; offset++) {
+			int j;
+
+			for (j = 0; j < 8; j++) {
+				spin_lock_irqsave(&isp1362_hcd->lock, flags);
+				isp1362_write_buffer(isp1362_hcd, (u8 *)ref + offset, 0, j);
+				isp1362_read_buffer(isp1362_hcd, (u8 *)tst + offset, 0, j);
+				spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+				if (memcmp(ref, tst, j)) {
+					ret = -ENODEV;
+					pr_err("%s: memory check with %d byte offset %d failed\n",
+					    __func__, j, offset);
+					dump_data((u8 *)ref + offset, j);
+					dump_data((u8 *)tst + offset, j);
+				}
+			}
+		}
+
+		spin_lock_irqsave(&isp1362_hcd->lock, flags);
+		isp1362_write_buffer(isp1362_hcd, ref, 0, ISP1362_BUF_SIZE);
+		isp1362_read_buffer(isp1362_hcd, tst, 0, ISP1362_BUF_SIZE);
+		spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+		if (memcmp(ref, tst, ISP1362_BUF_SIZE)) {
+			ret = -ENODEV;
+			pr_err("%s: memory check failed\n", __func__);
+			dump_data((u8 *)tst, ISP1362_BUF_SIZE / 2);
+		}
+
+		for (offset = 0; offset < 256; offset++) {
+			int test_size = 0;
+
+			yield();
+
+			memset(tst, 0, ISP1362_BUF_SIZE);
+			spin_lock_irqsave(&isp1362_hcd->lock, flags);
+			isp1362_write_buffer(isp1362_hcd, tst, 0, ISP1362_BUF_SIZE);
+			isp1362_read_buffer(isp1362_hcd, tst, 0, ISP1362_BUF_SIZE);
+			spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+			if (memcmp(tst, tst + (ISP1362_BUF_SIZE / (2 * sizeof(*tst))),
+				   ISP1362_BUF_SIZE / 2)) {
+				pr_err("%s: Failed to clear buffer\n", __func__);
+				dump_data((u8 *)tst, ISP1362_BUF_SIZE);
+				break;
+			}
+			spin_lock_irqsave(&isp1362_hcd->lock, flags);
+			isp1362_write_buffer(isp1362_hcd, ref, offset * 2, PTD_HEADER_SIZE);
+			isp1362_write_buffer(isp1362_hcd, ref + PTD_HEADER_SIZE / sizeof(*ref),
+					     offset * 2 + PTD_HEADER_SIZE, test_size);
+			isp1362_read_buffer(isp1362_hcd, tst, offset * 2,
+					    PTD_HEADER_SIZE + test_size);
+			spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+			if (memcmp(ref, tst, PTD_HEADER_SIZE + test_size)) {
+				dump_data(((u8 *)ref) + offset, PTD_HEADER_SIZE + test_size);
+				dump_data((u8 *)tst, PTD_HEADER_SIZE + test_size);
+				spin_lock_irqsave(&isp1362_hcd->lock, flags);
+				isp1362_read_buffer(isp1362_hcd, tst, offset * 2,
+						    PTD_HEADER_SIZE + test_size);
+				spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+				if (memcmp(ref, tst, PTD_HEADER_SIZE + test_size)) {
+					ret = -ENODEV;
+					pr_err("%s: memory check with offset %02x failed\n",
+					    __func__, offset);
+					break;
+				}
+				pr_warning("%s: memory check with offset %02x ok after second read\n",
+				     __func__, offset);
+			}
+		}
+		kfree(ref);
+	}
+	return ret;
+}
+#endif
+
+static int isp1362_hc_start(struct usb_hcd *hcd)
+{
+	int ret;
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	struct isp1362_platform_data *board = isp1362_hcd->board;
+	u16 hwcfg;
+	u16 chipid;
+	unsigned long flags;
+
+	pr_info("%s:\n", __func__);
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	chipid = isp1362_read_reg16(isp1362_hcd, HCCHIPID);
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	if ((chipid & HCCHIPID_MASK) != HCCHIPID_MAGIC) {
+		pr_err("%s: Invalid chip ID %04x\n", __func__, chipid);
+		return -ENODEV;
+	}
+
+#ifdef CHIP_BUFFER_TEST
+	ret = isp1362_chip_test(isp1362_hcd);
+	if (ret)
+		return -ENODEV;
+#endif
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	/* clear interrupt status and disable all interrupt sources */
+	isp1362_write_reg16(isp1362_hcd, HCuPINT, 0xff);
+	isp1362_write_reg16(isp1362_hcd, HCuPINTENB, 0);
+
+	/* HW conf */
+	hwcfg = HCHWCFG_INT_ENABLE | HCHWCFG_DBWIDTH(1);
+	if (board->sel15Kres)
+		hwcfg |= HCHWCFG_PULLDOWN_DS2 |
+			(MAX_ROOT_PORTS > 1) ? HCHWCFG_PULLDOWN_DS1 : 0;
+	if (board->clknotstop)
+		hwcfg |= HCHWCFG_CLKNOTSTOP;
+	if (board->oc_enable)
+		hwcfg |= HCHWCFG_ANALOG_OC;
+	if (board->int_act_high)
+		hwcfg |= HCHWCFG_INT_POL;
+	if (board->int_edge_triggered)
+		hwcfg |= HCHWCFG_INT_TRIGGER;
+	if (board->dreq_act_high)
+		hwcfg |= HCHWCFG_DREQ_POL;
+	if (board->dack_act_high)
+		hwcfg |= HCHWCFG_DACK_POL;
+	isp1362_write_reg16(isp1362_hcd, HCHWCFG, hwcfg);
+	isp1362_show_reg(isp1362_hcd, HCHWCFG);
+	isp1362_write_reg16(isp1362_hcd, HCDMACFG, 0);
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	ret = isp1362_mem_config(hcd);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+
+	/* Root hub conf */
+	isp1362_hcd->rhdesca = 0;
+	if (board->no_power_switching)
+		isp1362_hcd->rhdesca |= RH_A_NPS;
+	if (board->power_switching_mode)
+		isp1362_hcd->rhdesca |= RH_A_PSM;
+	if (board->potpg)
+		isp1362_hcd->rhdesca |= (board->potpg << 24) & RH_A_POTPGT;
+	else
+		isp1362_hcd->rhdesca |= (25 << 24) & RH_A_POTPGT;
+
+	isp1362_write_reg32(isp1362_hcd, HCRHDESCA, isp1362_hcd->rhdesca & ~RH_A_OCPM);
+	isp1362_write_reg32(isp1362_hcd, HCRHDESCA, isp1362_hcd->rhdesca | RH_A_OCPM);
+	isp1362_hcd->rhdesca = isp1362_read_reg32(isp1362_hcd, HCRHDESCA);
+
+	isp1362_hcd->rhdescb = RH_B_PPCM;
+	isp1362_write_reg32(isp1362_hcd, HCRHDESCB, isp1362_hcd->rhdescb);
+	isp1362_hcd->rhdescb = isp1362_read_reg32(isp1362_hcd, HCRHDESCB);
+
+	isp1362_read_reg32(isp1362_hcd, HCFMINTVL);
+	isp1362_write_reg32(isp1362_hcd, HCFMINTVL, (FSMP(FI) << 16) | FI);
+	isp1362_write_reg32(isp1362_hcd, HCLSTHRESH, LSTHRESH);
+
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	isp1362_hcd->hc_control = OHCI_USB_OPER;
+	hcd->state = HC_STATE_RUNNING;
+
+	spin_lock_irqsave(&isp1362_hcd->lock, flags);
+	/* Set up interrupts */
+	isp1362_hcd->intenb = OHCI_INTR_MIE | OHCI_INTR_RHSC | OHCI_INTR_UE;
+	isp1362_hcd->intenb |= OHCI_INTR_RD;
+	isp1362_hcd->irqenb = HCuPINT_OPR | HCuPINT_SUSP;
+	isp1362_write_reg32(isp1362_hcd, HCINTENB, isp1362_hcd->intenb);
+	isp1362_write_reg16(isp1362_hcd, HCuPINTENB, isp1362_hcd->irqenb);
+
+	/* Go operational */
+	isp1362_write_reg32(isp1362_hcd, HCCONTROL, isp1362_hcd->hc_control);
+	/* enable global power */
+	isp1362_write_reg32(isp1362_hcd, HCRHSTATUS, RH_HS_LPSC | RH_HS_DRWE);
+
+	spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+
+	return 0;
+}
+
+/*-------------------------------------------------------------------------*/
+
+static struct hc_driver isp1362_hc_driver = {
+	.description =		hcd_name,
+	.product_desc =		"ISP1362 Host Controller",
+	.hcd_priv_size =	sizeof(struct isp1362_hcd),
+
+	.irq =			isp1362_irq,
+	.flags =		HCD_USB11 | HCD_MEMORY,
+
+	.reset =		isp1362_hc_reset,
+	.start =		isp1362_hc_start,
+	.stop =			isp1362_hc_stop,
+
+	.urb_enqueue =		isp1362_urb_enqueue,
+	.urb_dequeue =		isp1362_urb_dequeue,
+	.endpoint_disable =	isp1362_endpoint_disable,
+
+	.get_frame_number =	isp1362_get_frame,
+
+	.hub_status_data =	isp1362_hub_status_data,
+	.hub_control =		isp1362_hub_control,
+	.bus_suspend =		isp1362_bus_suspend,
+	.bus_resume =		isp1362_bus_resume,
+};
+
+/*-------------------------------------------------------------------------*/
+
+#define resource_len(r) (((r)->end - (r)->start) + 1)
+
+static int __devexit isp1362_remove(struct platform_device *pdev)
+{
+	struct usb_hcd *hcd = platform_get_drvdata(pdev);
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	struct resource *res;
+
+	remove_debug_file(isp1362_hcd);
+	DBG(0, "%s: Removing HCD\n", __func__);
+	usb_remove_hcd(hcd);
+
+	DBG(0, "%s: Unmapping data_reg @ %08x\n", __func__,
+	    (u32)isp1362_hcd->data_reg);
+	iounmap(isp1362_hcd->data_reg);
+
+	DBG(0, "%s: Unmapping addr_reg @ %08x\n", __func__,
+	    (u32)isp1362_hcd->addr_reg);
+	iounmap(isp1362_hcd->addr_reg);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	DBG(0, "%s: release mem_region: %08lx\n", __func__, (long unsigned int)res->start);
+	if (res)
+		release_mem_region(res->start, resource_len(res));
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	DBG(0, "%s: release mem_region: %08lx\n", __func__, (long unsigned int)res->start);
+	if (res)
+		release_mem_region(res->start, resource_len(res));
+
+	DBG(0, "%s: put_hcd\n", __func__);
+	usb_put_hcd(hcd);
+	DBG(0, "%s: Done\n", __func__);
+
+	return 0;
+}
+
+static int __init isp1362_probe(struct platform_device *pdev)
+{
+	struct usb_hcd *hcd;
+	struct isp1362_hcd *isp1362_hcd;
+	struct resource *addr, *data;
+	void __iomem *addr_reg;
+	void __iomem *data_reg;
+	int irq;
+	int retval = 0;
+
+	/* basic sanity checks first.  board-specific init logic should
+	 * have initialized this the three resources and probably board
+	 * specific platform_data.  we don't probe for IRQs, and do only
+	 * minimal sanity checking.
+	 */
+	if (pdev->num_resources < 3) {
+		retval = -ENODEV;
+		goto err1;
+	}
+
+	data = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	addr = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	irq = platform_get_irq(pdev, 0);
+	if (!addr || !data || irq < 0) {
+		retval = -ENODEV;
+		goto err1;
+	}
+
+#ifdef CONFIG_USB_HCD_DMA
+	if (pdev->dev.dma_mask) {
+		struct resource *dma_res = platform_get_resource(pdev, IORESOURCE_MEM, 2);
+
+		if (!dma_res) {
+			retval = -ENODEV;
+			goto err1;
+		}
+		isp1362_hcd->data_dma = dma_res->start;
+		isp1362_hcd->max_dma_size = resource_len(dma_res);
+	}
+#else
+	if (pdev->dev.dma_mask) {
+		DBG(1, "won't do DMA");
+		retval = -ENODEV;
+		goto err1;
+	}
+#endif
+
+	if (!request_mem_region(addr->start, resource_len(addr), hcd_name)) {
+		retval = -EBUSY;
+		goto err1;
+	}
+	addr_reg = ioremap(addr->start, resource_len(addr));
+	if (addr_reg == NULL) {
+		retval = -ENOMEM;
+		goto err2;
+	}
+
+	if (!request_mem_region(data->start, resource_len(data), hcd_name)) {
+		retval = -EBUSY;
+		goto err3;
+	}
+	data_reg = ioremap(data->start, resource_len(data));
+	if (data_reg == NULL) {
+		retval = -ENOMEM;
+		goto err4;
+	}
+
+	/* allocate and initialize hcd */
+	hcd = usb_create_hcd(&isp1362_hc_driver, &pdev->dev, dev_name(&pdev->dev));
+	if (!hcd) {
+		retval = -ENOMEM;
+		goto err5;
+	}
+	hcd->rsrc_start = data->start;
+	isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	isp1362_hcd->data_reg = data_reg;
+	isp1362_hcd->addr_reg = addr_reg;
+
+	isp1362_hcd->next_statechange = jiffies;
+	spin_lock_init(&isp1362_hcd->lock);
+	INIT_LIST_HEAD(&isp1362_hcd->async);
+	INIT_LIST_HEAD(&isp1362_hcd->periodic);
+	INIT_LIST_HEAD(&isp1362_hcd->isoc);
+	INIT_LIST_HEAD(&isp1362_hcd->remove_list);
+	isp1362_hcd->board = pdev->dev.platform_data;
+#if USE_PLATFORM_DELAY
+	if (!isp1362_hcd->board->delay) {
+		dev_err(hcd->self.controller, "No platform delay function given\n");
+		retval = -ENODEV;
+		goto err6;
+	}
+#endif
+
+#ifdef CONFIG_ARM
+	if (isp1362_hcd->board)
+		set_irq_type(irq, isp1362_hcd->board->int_act_high ? IRQT_RISING : IRQT_FALLING);
+#endif
+
+	retval = usb_add_hcd(hcd, irq, IRQF_TRIGGER_LOW | IRQF_DISABLED | IRQF_SHARED);
+	if (retval != 0)
+		goto err6;
+	pr_info("%s, irq %d\n", hcd->product_desc, irq);
+
+	create_debug_file(isp1362_hcd);
+
+	return 0;
+
+ err6:
+	DBG(0, "%s: Freeing dev %08x\n", __func__, (u32)isp1362_hcd);
+	usb_put_hcd(hcd);
+ err5:
+	DBG(0, "%s: Unmapping data_reg @ %08x\n", __func__, (u32)data_reg);
+	iounmap(data_reg);
+ err4:
+	DBG(0, "%s: Releasing mem region %08lx\n", __func__, (long unsigned int)data->start);
+	release_mem_region(data->start, resource_len(data));
+ err3:
+	DBG(0, "%s: Unmapping addr_reg @ %08x\n", __func__, (u32)addr_reg);
+	iounmap(addr_reg);
+ err2:
+	DBG(0, "%s: Releasing mem region %08lx\n", __func__, (long unsigned int)addr->start);
+	release_mem_region(addr->start, resource_len(addr));
+ err1:
+	pr_err("%s: init error, %d\n", __func__, retval);
+
+	return retval;
+}
+
+#ifdef	CONFIG_PM
+static int isp1362_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct usb_hcd *hcd = platform_get_drvdata(pdev);
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	unsigned long flags;
+	int retval = 0;
+
+	DBG(0, "%s: Suspending device\n", __func__);
+
+	if (state.event == PM_EVENT_FREEZE) {
+		DBG(0, "%s: Suspending root hub\n", __func__);
+		retval = isp1362_bus_suspend(hcd);
+	} else {
+		DBG(0, "%s: Suspending RH ports\n", __func__);
+		spin_lock_irqsave(&isp1362_hcd->lock, flags);
+		isp1362_write_reg32(isp1362_hcd, HCRHSTATUS, RH_HS_LPS);
+		spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+	}
+	if (retval == 0)
+		pdev->dev.power.power_state = state;
+	return retval;
+}
+
+static int isp1362_resume(struct platform_device *pdev)
+{
+	struct usb_hcd *hcd = platform_get_drvdata(pdev);
+	struct isp1362_hcd *isp1362_hcd = hcd_to_isp1362_hcd(hcd);
+	unsigned long flags;
+
+	DBG(0, "%s: Resuming\n", __func__);
+
+	if (pdev->dev.power.power_state.event == PM_EVENT_SUSPEND) {
+		DBG(0, "%s: Resume RH ports\n", __func__);
+		spin_lock_irqsave(&isp1362_hcd->lock, flags);
+		isp1362_write_reg32(isp1362_hcd, HCRHSTATUS, RH_HS_LPSC);
+		spin_unlock_irqrestore(&isp1362_hcd->lock, flags);
+		return 0;
+	}
+
+	pdev->dev.power.power_state = PMSG_ON;
+
+	return isp1362_bus_resume(isp1362_hcd_to_hcd(isp1362_hcd));
+}
+#else
+#define	isp1362_suspend	NULL
+#define	isp1362_resume	NULL
+#endif
+
+static struct platform_driver isp1362_driver = {
+	.probe = isp1362_probe,
+	.remove = __devexit_p(isp1362_remove),
+
+	.suspend = isp1362_suspend,
+	.resume = isp1362_resume,
+	.driver = {
+		.name = (char *)hcd_name,
+		.owner = THIS_MODULE,
+	},
+};
+
+/*-------------------------------------------------------------------------*/
+
+static int __init isp1362_init(void)
+{
+	if (usb_disabled())
+		return -ENODEV;
+	pr_info("driver %s, %s\n", hcd_name, DRIVER_VERSION);
+	return platform_driver_register(&isp1362_driver);
+}
+module_init(isp1362_init);
+
+static void __exit isp1362_cleanup(void)
+{
+	platform_driver_unregister(&isp1362_driver);
+}
+module_exit(isp1362_cleanup);
diff --git a/drivers/usb/host/isp1362.h b/drivers/usb/host/isp1362.h
new file mode 100644
index 000000000000..26e44fc8776f
--- /dev/null
+++ b/drivers/usb/host/isp1362.h
@@ -0,0 +1,1079 @@
+/*
+ * ISP1362 HCD (Host Controller Driver) for USB.
+ *
+ * COPYRIGHT (C) by L. Wassmann <LW@KARO-electronics.de>
+ */
+
+/* ------------------------------------------------------------------------- */
+/*
+ * Platform specific compile time options
+ */
+#if defined(CONFIG_ARCH_KARO)
+#include <asm/arch/hardware.h>
+#include <asm/arch/pxa-regs.h>
+#include <asm/arch/karo.h>
+
+#define USE_32BIT		1
+
+
+/* These options are mutually eclusive */
+#define USE_PLATFORM_DELAY	1
+#define USE_NDELAY		0
+/*
+ * MAX_ROOT_PORTS: Number of downstream ports
+ *
+ * The chip has two USB ports, one of which can be configured as
+ * an USB device port, so the value of this constant is implementation
+ * specific.
+ */
+#define MAX_ROOT_PORTS		2
+#define DUMMY_DELAY_ACCESS do {} while (0)
+
+/* insert platform specific definitions for other machines here */
+#elif defined(CONFIG_BLACKFIN)
+
+#include <linux/io.h>
+#define USE_32BIT		0
+#define MAX_ROOT_PORTS		2
+#define USE_PLATFORM_DELAY	0
+#define USE_NDELAY		1
+
+#define DUMMY_DELAY_ACCESS \
+	do { \
+		bfin_read16(ASYNC_BANK0_BASE); \
+		bfin_read16(ASYNC_BANK0_BASE); \
+		bfin_read16(ASYNC_BANK0_BASE); \
+	} while (0)
+
+#undef insw
+#undef outsw
+
+#define insw  delayed_insw
+#define outsw  delayed_outsw
+
+static inline void delayed_outsw(unsigned int addr, void *buf, int len)
+{
+	unsigned short *bp = (unsigned short *)buf;
+	while (len--) {
+		DUMMY_DELAY_ACCESS;
+		outw(*bp++, addr);
+	}
+}
+
+static inline void delayed_insw(unsigned int addr, void *buf, int len)
+{
+	unsigned short *bp = (unsigned short *)buf;
+	while (len--) {
+		DUMMY_DELAY_ACCESS;
+		*bp++ = inw((void *)addr);
+	}
+}
+
+#else
+
+#define MAX_ROOT_PORTS		2
+
+#define USE_32BIT		0
+
+/* These options are mutually eclusive */
+#define USE_PLATFORM_DELAY	0
+#define USE_NDELAY		0
+
+#define DUMMY_DELAY_ACCESS do {} while (0)
+
+#endif
+
+
+/* ------------------------------------------------------------------------- */
+
+#define USB_RESET_WIDTH			50
+#define MAX_XFER_SIZE			1023
+
+/* Buffer sizes */
+#define ISP1362_BUF_SIZE		4096
+#define ISP1362_ISTL_BUFSIZE		512
+#define ISP1362_INTL_BLKSIZE		64
+#define ISP1362_INTL_BUFFERS		16
+#define ISP1362_ATL_BLKSIZE		64
+
+#define ISP1362_REG_WRITE_OFFSET	0x80
+
+#ifdef ISP1362_DEBUG
+typedef const unsigned int isp1362_reg_t;
+
+#define REG_WIDTH_16			0x000
+#define REG_WIDTH_32			0x100
+#define REG_WIDTH_MASK			0x100
+#define REG_NO_MASK			0x0ff
+
+#define REG_ACCESS_R			0x200
+#define REG_ACCESS_W			0x400
+#define REG_ACCESS_RW			0x600
+#define REG_ACCESS_MASK			0x600
+
+#define ISP1362_REG_NO(r)		((r) & REG_NO_MASK)
+
+#define _BUG_ON(x)	BUG_ON(x)
+#define _WARN_ON(x)	WARN_ON(x)
+
+#define ISP1362_REG(name, addr, width, rw) \
+static isp1362_reg_t ISP1362_REG_##name = ((addr) | (width) | (rw))
+
+#define REG_ACCESS_TEST(r)   BUG_ON(((r) & ISP1362_REG_WRITE_OFFSET) && !((r) & REG_ACCESS_W))
+#define REG_WIDTH_TEST(r, w) BUG_ON(((r) & REG_WIDTH_MASK) != (w))
+#else
+typedef const unsigned char isp1362_reg_t;
+#define ISP1362_REG_NO(r)		(r)
+#define _BUG_ON(x)			do {} while (0)
+#define _WARN_ON(x)			do {} while (0)
+
+#define ISP1362_REG(name, addr, width, rw) \
+static isp1362_reg_t ISP1362_REG_##name = addr
+
+#define REG_ACCESS_TEST(r)		do {} while (0)
+#define REG_WIDTH_TEST(r, w)		do {} while (0)
+#endif
+
+/* OHCI compatible registers */
+/*
+ * Note: Some of the ISP1362 'OHCI' registers implement only
+ * a subset of the bits defined in the OHCI spec.
+ *
+ * Bitmasks for the individual bits of these registers are defined in "ohci.h"
+ */
+ISP1362_REG(HCREVISION,	0x00,	REG_WIDTH_32,	REG_ACCESS_R);
+ISP1362_REG(HCCONTROL,	0x01,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCCMDSTAT,	0x02,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCINTSTAT,	0x03,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCINTENB,	0x04,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCINTDIS,	0x05,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCFMINTVL,	0x0d,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCFMREM,	0x0e,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCFMNUM,	0x0f,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCLSTHRESH,	0x11,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCRHDESCA,	0x12,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCRHDESCB,	0x13,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCRHSTATUS,	0x14,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCRHPORT1,	0x15,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCRHPORT2,	0x16,	REG_WIDTH_32,	REG_ACCESS_RW);
+
+/* Philips ISP1362 specific registers */
+ISP1362_REG(HCHWCFG,	0x20,	REG_WIDTH_16,	REG_ACCESS_RW);
+#define HCHWCFG_DISABLE_SUSPEND	(1 << 15)
+#define HCHWCFG_GLOBAL_PWRDOWN	(1 << 14)
+#define HCHWCFG_PULLDOWN_DS1	(1 << 13)
+#define HCHWCFG_PULLDOWN_DS2	(1 << 12)
+#define HCHWCFG_CLKNOTSTOP	(1 << 11)
+#define HCHWCFG_ANALOG_OC	(1 << 10)
+#define HCHWCFG_ONEINT		(1 << 9)
+#define HCHWCFG_DACK_MODE	(1 << 8)
+#define HCHWCFG_ONEDMA		(1 << 7)
+#define HCHWCFG_DACK_POL	(1 << 6)
+#define HCHWCFG_DREQ_POL	(1 << 5)
+#define HCHWCFG_DBWIDTH_MASK	(0x03 << 3)
+#define HCHWCFG_DBWIDTH(n)	(((n) << 3) & HCHWCFG_DBWIDTH_MASK)
+#define HCHWCFG_INT_POL		(1 << 2)
+#define HCHWCFG_INT_TRIGGER	(1 << 1)
+#define HCHWCFG_INT_ENABLE	(1 << 0)
+
+ISP1362_REG(HCDMACFG,	0x21,	REG_WIDTH_16,	REG_ACCESS_RW);
+#define HCDMACFG_CTR_ENABLE	(1 << 7)
+#define HCDMACFG_BURST_LEN_MASK	(0x03 << 5)
+#define HCDMACFG_BURST_LEN(n)	(((n) << 5) & HCDMACFG_BURST_LEN_MASK)
+#define HCDMACFG_BURST_LEN_1	HCDMACFG_BURST_LEN(0)
+#define HCDMACFG_BURST_LEN_4	HCDMACFG_BURST_LEN(1)
+#define HCDMACFG_BURST_LEN_8	HCDMACFG_BURST_LEN(2)
+#define HCDMACFG_DMA_ENABLE	(1 << 4)
+#define HCDMACFG_BUF_TYPE_MASK	(0x07 << 1)
+#define HCDMACFG_BUF_TYPE(n)	(((n) << 1) & HCDMACFG_BUF_TYPE_MASK)
+#define HCDMACFG_BUF_ISTL0	HCDMACFG_BUF_TYPE(0)
+#define HCDMACFG_BUF_ISTL1	HCDMACFG_BUF_TYPE(1)
+#define HCDMACFG_BUF_INTL	HCDMACFG_BUF_TYPE(2)
+#define HCDMACFG_BUF_ATL	HCDMACFG_BUF_TYPE(3)
+#define HCDMACFG_BUF_DIRECT	HCDMACFG_BUF_TYPE(4)
+#define HCDMACFG_DMA_RW_SELECT	(1 << 0)
+
+ISP1362_REG(HCXFERCTR,	0x22,	REG_WIDTH_16,	REG_ACCESS_RW);
+
+ISP1362_REG(HCuPINT,	0x24,	REG_WIDTH_16,	REG_ACCESS_RW);
+#define HCuPINT_SOF		(1 << 0)
+#define HCuPINT_ISTL0		(1 << 1)
+#define HCuPINT_ISTL1		(1 << 2)
+#define HCuPINT_EOT		(1 << 3)
+#define HCuPINT_OPR		(1 << 4)
+#define HCuPINT_SUSP		(1 << 5)
+#define HCuPINT_CLKRDY		(1 << 6)
+#define HCuPINT_INTL		(1 << 7)
+#define HCuPINT_ATL		(1 << 8)
+#define HCuPINT_OTG		(1 << 9)
+
+ISP1362_REG(HCuPINTENB,	0x25,	REG_WIDTH_16,	REG_ACCESS_RW);
+/* same bit definitions apply as for HCuPINT */
+
+ISP1362_REG(HCCHIPID,	0x27,	REG_WIDTH_16,	REG_ACCESS_R);
+#define HCCHIPID_MASK		0xff00
+#define HCCHIPID_MAGIC		0x3600
+
+ISP1362_REG(HCSCRATCH,	0x28,	REG_WIDTH_16,	REG_ACCESS_RW);
+
+ISP1362_REG(HCSWRES,	0x29,	REG_WIDTH_16,	REG_ACCESS_W);
+#define HCSWRES_MAGIC		0x00f6
+
+ISP1362_REG(HCBUFSTAT,	0x2c,	REG_WIDTH_16,	REG_ACCESS_RW);
+#define HCBUFSTAT_ISTL0_FULL	(1 << 0)
+#define HCBUFSTAT_ISTL1_FULL	(1 << 1)
+#define HCBUFSTAT_INTL_ACTIVE	(1 << 2)
+#define HCBUFSTAT_ATL_ACTIVE	(1 << 3)
+#define HCBUFSTAT_RESET_HWPP	(1 << 4)
+#define HCBUFSTAT_ISTL0_ACTIVE	(1 << 5)
+#define HCBUFSTAT_ISTL1_ACTIVE	(1 << 6)
+#define HCBUFSTAT_ISTL0_DONE	(1 << 8)
+#define HCBUFSTAT_ISTL1_DONE	(1 << 9)
+#define HCBUFSTAT_PAIRED_PTDPP	(1 << 10)
+
+ISP1362_REG(HCDIRADDR,	0x32,	REG_WIDTH_32,	REG_ACCESS_RW);
+#define HCDIRADDR_ADDR_MASK	0x0000ffff
+#define HCDIRADDR_ADDR(n)	(((n) << 0) & HCDIRADDR_ADDR_MASK)
+#define HCDIRADDR_COUNT_MASK	0xffff0000
+#define HCDIRADDR_COUNT(n)	(((n) << 16) & HCDIRADDR_COUNT_MASK)
+ISP1362_REG(HCDIRDATA,	0x45,	REG_WIDTH_16,	REG_ACCESS_RW);
+
+ISP1362_REG(HCISTLBUFSZ, 0x30,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCISTL0PORT, 0x40,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCISTL1PORT, 0x42,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCISTLRATE,	0x47,	REG_WIDTH_16,	REG_ACCESS_RW);
+
+ISP1362_REG(HCINTLBUFSZ, 0x33,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCINTLPORT,	0x43,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCINTLBLKSZ, 0x53,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCINTLDONE,	0x17,	REG_WIDTH_32,	REG_ACCESS_R);
+ISP1362_REG(HCINTLSKIP,	0x18,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCINTLLAST,	0x19,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCINTLCURR,	0x1a,	REG_WIDTH_16,	REG_ACCESS_R);
+
+ISP1362_REG(HCATLBUFSZ, 0x34,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCATLPORT,	0x44,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCATLBLKSZ, 0x54,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCATLDONE,	0x1b,	REG_WIDTH_32,	REG_ACCESS_R);
+ISP1362_REG(HCATLSKIP,	0x1c,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCATLLAST,	0x1d,	REG_WIDTH_32,	REG_ACCESS_RW);
+ISP1362_REG(HCATLCURR,	0x1e,	REG_WIDTH_16,	REG_ACCESS_R);
+
+ISP1362_REG(HCATLDTC,	0x51,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(HCATLDTCTO,	0x52,	REG_WIDTH_16,	REG_ACCESS_RW);
+
+
+ISP1362_REG(OTGCONTROL,	0x62,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(OTGSTATUS,	0x67,	REG_WIDTH_16,	REG_ACCESS_R);
+ISP1362_REG(OTGINT,	0x68,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(OTGINTENB,	0x69,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(OTGTIMER,	0x6A,	REG_WIDTH_16,	REG_ACCESS_RW);
+ISP1362_REG(OTGALTTMR,	0x6C,	REG_WIDTH_16,	REG_ACCESS_RW);
+
+/* Philips transfer descriptor, cpu-endian */
+struct ptd {
+	u16 count;
+#define	PTD_COUNT_MSK	(0x3ff << 0)
+#define	PTD_TOGGLE_MSK	(1 << 10)
+#define	PTD_ACTIVE_MSK	(1 << 11)
+#define	PTD_CC_MSK	(0xf << 12)
+	u16 mps;
+#define	PTD_MPS_MSK	(0x3ff << 0)
+#define	PTD_SPD_MSK	(1 << 10)
+#define	PTD_LAST_MSK	(1 << 11)
+#define	PTD_EP_MSK	(0xf << 12)
+	u16 len;
+#define	PTD_LEN_MSK	(0x3ff << 0)
+#define	PTD_DIR_MSK	(3 << 10)
+#define	PTD_DIR_SETUP	(0)
+#define	PTD_DIR_OUT	(1)
+#define	PTD_DIR_IN	(2)
+	u16 faddr;
+#define	PTD_FA_MSK	(0x7f << 0)
+/* PTD Byte 7: [StartingFrame (if ISO PTD) | StartingFrame[0..4], PollingRate[0..2] (if INT PTD)] */
+#define PTD_SF_ISO_MSK	(0xff << 8)
+#define PTD_SF_INT_MSK	(0x1f << 8)
+#define PTD_PR_MSK	(0x07 << 13)
+} __attribute__ ((packed, aligned(2)));
+#define PTD_HEADER_SIZE sizeof(struct ptd)
+
+/* ------------------------------------------------------------------------- */
+/* Copied from ohci.h: */
+/*
+ * Hardware transfer status codes -- CC from PTD
+ */
+#define PTD_CC_NOERROR      0x00
+#define PTD_CC_CRC          0x01
+#define PTD_CC_BITSTUFFING  0x02
+#define PTD_CC_DATATOGGLEM  0x03
+#define PTD_CC_STALL        0x04
+#define PTD_DEVNOTRESP      0x05
+#define PTD_PIDCHECKFAIL    0x06
+#define PTD_UNEXPECTEDPID   0x07
+#define PTD_DATAOVERRUN     0x08
+#define PTD_DATAUNDERRUN    0x09
+    /* 0x0A, 0x0B reserved for hardware */
+#define PTD_BUFFEROVERRUN   0x0C
+#define PTD_BUFFERUNDERRUN  0x0D
+    /* 0x0E, 0x0F reserved for HCD */
+#define PTD_NOTACCESSED     0x0F
+
+
+/* map OHCI TD status codes (CC) to errno values */
+static const int cc_to_error[16] = {
+	/* No  Error  */               0,
+	/* CRC Error  */               -EILSEQ,
+	/* Bit Stuff  */               -EPROTO,
+	/* Data Togg  */               -EILSEQ,
+	/* Stall      */               -EPIPE,
+	/* DevNotResp */               -ETIMEDOUT,
+	/* PIDCheck   */               -EPROTO,
+	/* UnExpPID   */               -EPROTO,
+	/* DataOver   */               -EOVERFLOW,
+	/* DataUnder  */               -EREMOTEIO,
+	/* (for hw)   */               -EIO,
+	/* (for hw)   */               -EIO,
+	/* BufferOver */               -ECOMM,
+	/* BuffUnder  */               -ENOSR,
+	/* (for HCD)  */               -EALREADY,
+	/* (for HCD)  */               -EALREADY
+};
+
+
+/*
+ * HcControl (control) register masks
+ */
+#define OHCI_CTRL_HCFS	(3 << 6)	/* host controller functional state */
+#define OHCI_CTRL_RWC	(1 << 9)	/* remote wakeup connected */
+#define OHCI_CTRL_RWE	(1 << 10)	/* remote wakeup enable */
+
+/* pre-shifted values for HCFS */
+#	define OHCI_USB_RESET	(0 << 6)
+#	define OHCI_USB_RESUME	(1 << 6)
+#	define OHCI_USB_OPER	(2 << 6)
+#	define OHCI_USB_SUSPEND	(3 << 6)
+
+/*
+ * HcCommandStatus (cmdstatus) register masks
+ */
+#define OHCI_HCR	(1 << 0)	/* host controller reset */
+#define OHCI_SOC  	(3 << 16)	/* scheduling overrun count */
+
+/*
+ * masks used with interrupt registers:
+ * HcInterruptStatus (intrstatus)
+ * HcInterruptEnable (intrenable)
+ * HcInterruptDisable (intrdisable)
+ */
+#define OHCI_INTR_SO	(1 << 0)	/* scheduling overrun */
+#define OHCI_INTR_WDH	(1 << 1)	/* writeback of done_head */
+#define OHCI_INTR_SF	(1 << 2)	/* start frame */
+#define OHCI_INTR_RD	(1 << 3)	/* resume detect */
+#define OHCI_INTR_UE	(1 << 4)	/* unrecoverable error */
+#define OHCI_INTR_FNO	(1 << 5)	/* frame number overflow */
+#define OHCI_INTR_RHSC	(1 << 6)	/* root hub status change */
+#define OHCI_INTR_OC	(1 << 30)	/* ownership change */
+#define OHCI_INTR_MIE	(1 << 31)	/* master interrupt enable */
+
+/* roothub.portstatus [i] bits */
+#define RH_PS_CCS            0x00000001   	/* current connect status */
+#define RH_PS_PES            0x00000002   	/* port enable status*/
+#define RH_PS_PSS            0x00000004   	/* port suspend status */
+#define RH_PS_POCI           0x00000008   	/* port over current indicator */
+#define RH_PS_PRS            0x00000010  	/* port reset status */
+#define RH_PS_PPS            0x00000100   	/* port power status */
+#define RH_PS_LSDA           0x00000200    	/* low speed device attached */
+#define RH_PS_CSC            0x00010000 	/* connect status change */
+#define RH_PS_PESC           0x00020000   	/* port enable status change */
+#define RH_PS_PSSC           0x00040000    	/* port suspend status change */
+#define RH_PS_OCIC           0x00080000    	/* over current indicator change */
+#define RH_PS_PRSC           0x00100000   	/* port reset status change */
+
+/* roothub.status bits */
+#define RH_HS_LPS	     0x00000001		/* local power status */
+#define RH_HS_OCI	     0x00000002		/* over current indicator */
+#define RH_HS_DRWE	     0x00008000		/* device remote wakeup enable */
+#define RH_HS_LPSC	     0x00010000		/* local power status change */
+#define RH_HS_OCIC	     0x00020000		/* over current indicator change */
+#define RH_HS_CRWE	     0x80000000		/* clear remote wakeup enable */
+
+/* roothub.b masks */
+#define RH_B_DR		0x0000ffff		/* device removable flags */
+#define RH_B_PPCM	0xffff0000		/* port power control mask */
+
+/* roothub.a masks */
+#define	RH_A_NDP	(0xff << 0)		/* number of downstream ports */
+#define	RH_A_PSM	(1 << 8)		/* power switching mode */
+#define	RH_A_NPS	(1 << 9)		/* no power switching */
+#define	RH_A_DT		(1 << 10)		/* device type (mbz) */
+#define	RH_A_OCPM	(1 << 11)		/* over current protection mode */
+#define	RH_A_NOCP	(1 << 12)		/* no over current protection */
+#define	RH_A_POTPGT	(0xff << 24)		/* power on to power good time */
+
+#define	FI			0x2edf		/* 12000 bits per frame (-1) */
+#define	FSMP(fi) 		(0x7fff & ((6 * ((fi) - 210)) / 7))
+#define LSTHRESH		0x628		/* lowspeed bit threshold */
+
+/* ------------------------------------------------------------------------- */
+
+/* PTD accessor macros. */
+#define PTD_GET_COUNT(p)	(((p)->count & PTD_COUNT_MSK) >> 0)
+#define PTD_COUNT(v)		(((v) << 0) & PTD_COUNT_MSK)
+#define PTD_GET_TOGGLE(p)	(((p)->count & PTD_TOGGLE_MSK) >> 10)
+#define PTD_TOGGLE(v)		(((v) << 10) & PTD_TOGGLE_MSK)
+#define PTD_GET_ACTIVE(p)	(((p)->count & PTD_ACTIVE_MSK) >> 11)
+#define PTD_ACTIVE(v)		(((v) << 11) & PTD_ACTIVE_MSK)
+#define PTD_GET_CC(p)		(((p)->count & PTD_CC_MSK) >> 12)
+#define PTD_CC(v)		(((v) << 12) & PTD_CC_MSK)
+#define PTD_GET_MPS(p)		(((p)->mps & PTD_MPS_MSK) >> 0)
+#define PTD_MPS(v)		(((v) << 0) & PTD_MPS_MSK)
+#define PTD_GET_SPD(p)		(((p)->mps & PTD_SPD_MSK) >> 10)
+#define PTD_SPD(v)		(((v) << 10) & PTD_SPD_MSK)
+#define PTD_GET_LAST(p)		(((p)->mps & PTD_LAST_MSK) >> 11)
+#define PTD_LAST(v)		(((v) << 11) & PTD_LAST_MSK)
+#define PTD_GET_EP(p)		(((p)->mps & PTD_EP_MSK) >> 12)
+#define PTD_EP(v)		(((v) << 12) & PTD_EP_MSK)
+#define PTD_GET_LEN(p)		(((p)->len & PTD_LEN_MSK) >> 0)
+#define PTD_LEN(v)		(((v) << 0) & PTD_LEN_MSK)
+#define PTD_GET_DIR(p)		(((p)->len & PTD_DIR_MSK) >> 10)
+#define PTD_DIR(v)		(((v) << 10) & PTD_DIR_MSK)
+#define PTD_GET_FA(p)		(((p)->faddr & PTD_FA_MSK) >> 0)
+#define PTD_FA(v)		(((v) << 0) & PTD_FA_MSK)
+#define PTD_GET_SF_INT(p)	(((p)->faddr & PTD_SF_INT_MSK) >> 8)
+#define PTD_SF_INT(v)		(((v) << 8) & PTD_SF_INT_MSK)
+#define PTD_GET_SF_ISO(p)	(((p)->faddr & PTD_SF_ISO_MSK) >> 8)
+#define PTD_SF_ISO(v)		(((v) << 8) & PTD_SF_ISO_MSK)
+#define PTD_GET_PR(p)		(((p)->faddr & PTD_PR_MSK) >> 13)
+#define PTD_PR(v)		(((v) << 13) & PTD_PR_MSK)
+
+#define	LOG2_PERIODIC_SIZE	5	/* arbitrary; this matches OHCI */
+#define	PERIODIC_SIZE		(1 << LOG2_PERIODIC_SIZE)
+
+struct isp1362_ep {
+	struct usb_host_endpoint *hep;
+	struct usb_device	*udev;
+
+	/* philips transfer descriptor */
+	struct ptd		ptd;
+
+	u8			maxpacket;
+	u8			epnum;
+	u8			nextpid;
+	u16			error_count;
+	u16			length;		/* of current packet */
+	s16			ptd_offset;	/* buffer offset in ISP1362 where
+						   PTD has been stored
+						   (for access thru HCDIRDATA) */
+	int			ptd_index;
+	int num_ptds;
+	void 			*data;		/* to databuf */
+	/* queue of active EPs (the ones transmitted to the chip) */
+	struct list_head	active;
+
+	/* periodic schedule */
+	u8			branch;
+	u16			interval;
+	u16			load;
+	u16			last_iso;
+
+	/* async schedule */
+	struct list_head	schedule;	/* list of all EPs that need processing */
+	struct list_head	remove_list;
+	int			num_req;
+};
+
+struct isp1362_ep_queue {
+	struct list_head	active;		/* list of PTDs currently processed by HC */
+	atomic_t		finishing;
+	unsigned long		buf_map;
+	unsigned long		skip_map;
+	int			free_ptd;
+	u16			buf_start;
+	u16			buf_size;
+	u16			blk_size;	/* PTD buffer block size for ATL and INTL */
+	u8			buf_count;
+	u8			buf_avail;
+	char			name[16];
+
+	/* for statistical tracking */
+	u8			stat_maxptds;	/* Max # of ptds seen simultaneously in fifo */
+	u8			ptd_count;	/* number of ptds submitted to this queue */
+};
+
+struct isp1362_hcd {
+	spinlock_t		lock;
+	void __iomem		*addr_reg;
+	void __iomem		*data_reg;
+
+	struct isp1362_platform_data *board;
+
+	struct proc_dir_entry	*pde;
+	unsigned long		stat1, stat2, stat4, stat8, stat16;
+
+	/* HC registers */
+	u32			intenb;		/* "OHCI" interrupts */
+	u16			irqenb;		/* uP interrupts */
+
+	/* Root hub registers */
+	u32			rhdesca;
+	u32			rhdescb;
+	u32			rhstatus;
+	u32			rhport[MAX_ROOT_PORTS];
+	unsigned long		next_statechange;
+
+	/* HC control reg shadow copy */
+	u32			hc_control;
+
+	/* async schedule: control, bulk */
+	struct list_head	async;
+
+	/* periodic schedule: int */
+	u16			load[PERIODIC_SIZE];
+	struct list_head	periodic;
+	u16			fmindex;
+
+	/* periodic schedule: isochronous */
+	struct list_head	isoc;
+	int			istl_flip:1;
+	int			irq_active:1;
+
+	/* Schedules for the current frame */
+	struct isp1362_ep_queue atl_queue;
+	struct isp1362_ep_queue intl_queue;
+	struct isp1362_ep_queue istl_queue[2];
+
+	/* list of PTDs retrieved from HC */
+	struct list_head	remove_list;
+	enum {
+		ISP1362_INT_SOF,
+		ISP1362_INT_ISTL0,
+		ISP1362_INT_ISTL1,
+		ISP1362_INT_EOT,
+		ISP1362_INT_OPR,
+		ISP1362_INT_SUSP,
+		ISP1362_INT_CLKRDY,
+		ISP1362_INT_INTL,
+		ISP1362_INT_ATL,
+		ISP1362_INT_OTG,
+		NUM_ISP1362_IRQS
+	} IRQ_NAMES;
+	unsigned int		irq_stat[NUM_ISP1362_IRQS];
+	int			req_serial;
+};
+
+static inline const char *ISP1362_INT_NAME(int n)
+{
+	switch (n) {
+	case ISP1362_INT_SOF:    return "SOF";
+	case ISP1362_INT_ISTL0:  return "ISTL0";
+	case ISP1362_INT_ISTL1:  return "ISTL1";
+	case ISP1362_INT_EOT:    return "EOT";
+	case ISP1362_INT_OPR:    return "OPR";
+	case ISP1362_INT_SUSP:   return "SUSP";
+	case ISP1362_INT_CLKRDY: return "CLKRDY";
+	case ISP1362_INT_INTL:   return "INTL";
+	case ISP1362_INT_ATL:    return "ATL";
+	case ISP1362_INT_OTG:    return "OTG";
+	default:                 return "unknown";
+	}
+}
+
+static inline void ALIGNSTAT(struct isp1362_hcd *isp1362_hcd, void *ptr)
+{
+	unsigned p = (unsigned)ptr;
+	if (!(p & 0xf))
+		isp1362_hcd->stat16++;
+	else if (!(p & 0x7))
+		isp1362_hcd->stat8++;
+	else if (!(p & 0x3))
+		isp1362_hcd->stat4++;
+	else if (!(p & 0x1))
+		isp1362_hcd->stat2++;
+	else
+		isp1362_hcd->stat1++;
+}
+
+static inline struct isp1362_hcd *hcd_to_isp1362_hcd(struct usb_hcd *hcd)
+{
+	return (struct isp1362_hcd *) (hcd->hcd_priv);
+}
+
+static inline struct usb_hcd *isp1362_hcd_to_hcd(struct isp1362_hcd *isp1362_hcd)
+{
+	return container_of((void *)isp1362_hcd, struct usb_hcd, hcd_priv);
+}
+
+#define frame_before(f1, f2)	((s16)((u16)f1 - (u16)f2) < 0)
+
+/*
+ * ISP1362 HW Interface
+ */
+
+#ifdef ISP1362_DEBUG
+#define DBG(level, fmt...) \
+	do { \
+		if (dbg_level > level) \
+			pr_debug(fmt); \
+	} while (0)
+#define _DBG(level, fmt...)	\
+	do { \
+		if (dbg_level > level) \
+			printk(fmt); \
+	} while (0)
+#else
+#define DBG(fmt...)		do {} while (0)
+#define _DBG DBG
+#endif
+
+#ifdef VERBOSE
+#    define VDBG(fmt...)	DBG(3, fmt)
+#else
+#    define VDBG(fmt...)	do {} while (0)
+#endif
+
+#ifdef REGISTERS
+#    define RDBG(fmt...)	DBG(1, fmt)
+#else
+#    define RDBG(fmt...)	do {} while (0)
+#endif
+
+#ifdef URB_TRACE
+#define URB_DBG(fmt...)		DBG(0, fmt)
+#else
+#define URB_DBG(fmt...)		do {} while (0)
+#endif
+
+
+#if USE_PLATFORM_DELAY
+#if USE_NDELAY
+#error USE_PLATFORM_DELAY and USE_NDELAY defined simultaneously.
+#endif
+#define	isp1362_delay(h, d)	(h)->board->delay(isp1362_hcd_to_hcd(h)->self.controller, d)
+#elif USE_NDELAY
+#define	isp1362_delay(h, d)	ndelay(d)
+#else
+#define	isp1362_delay(h, d)	do {} while (0)
+#endif
+
+#define get_urb(ep) ({							\
+	BUG_ON(list_empty(&ep->hep->urb_list));				\
+	container_of(ep->hep->urb_list.next, struct urb, urb_list);	\
+})
+
+/* basic access functions for ISP1362 chip registers */
+/* NOTE: The contents of the address pointer register cannot be read back! The driver must ensure,
+ * that all register accesses are performed with interrupts disabled, since the interrupt
+ * handler has no way of restoring the previous state.
+ */
+static void isp1362_write_addr(struct isp1362_hcd *isp1362_hcd, isp1362_reg_t reg)
+{
+	/*_BUG_ON((reg & ISP1362_REG_WRITE_OFFSET) && !(reg & REG_ACCESS_W));*/
+	REG_ACCESS_TEST(reg);
+	_BUG_ON(!irqs_disabled());
+	DUMMY_DELAY_ACCESS;
+	writew(ISP1362_REG_NO(reg), isp1362_hcd->addr_reg);
+	DUMMY_DELAY_ACCESS;
+	isp1362_delay(isp1362_hcd, 1);
+}
+
+static void isp1362_write_data16(struct isp1362_hcd *isp1362_hcd, u16 val)
+{
+	_BUG_ON(!irqs_disabled());
+	DUMMY_DELAY_ACCESS;
+	writew(val, isp1362_hcd->data_reg);
+}
+
+static u16 isp1362_read_data16(struct isp1362_hcd *isp1362_hcd)
+{
+	u16 val;
+
+	_BUG_ON(!irqs_disabled());
+	DUMMY_DELAY_ACCESS;
+	val = readw(isp1362_hcd->data_reg);
+
+	return val;
+}
+
+static void isp1362_write_data32(struct isp1362_hcd *isp1362_hcd, u32 val)
+{
+	_BUG_ON(!irqs_disabled());
+#if USE_32BIT
+	DUMMY_DELAY_ACCESS;
+	writel(val, isp1362_hcd->data_reg);
+#else
+	DUMMY_DELAY_ACCESS;
+	writew((u16)val, isp1362_hcd->data_reg);
+	DUMMY_DELAY_ACCESS;
+	writew(val >> 16, isp1362_hcd->data_reg);
+#endif
+}
+
+static u32 isp1362_read_data32(struct isp1362_hcd *isp1362_hcd)
+{
+	u32 val;
+
+	_BUG_ON(!irqs_disabled());
+#if USE_32BIT
+	DUMMY_DELAY_ACCESS;
+	val = readl(isp1362_hcd->data_reg);
+#else
+	DUMMY_DELAY_ACCESS;
+	val = (u32)readw(isp1362_hcd->data_reg);
+	DUMMY_DELAY_ACCESS;
+	val |= (u32)readw(isp1362_hcd->data_reg) << 16;
+#endif
+	return val;
+}
+
+/* use readsw/writesw to access the fifo whenever possible */
+/* assume HCDIRDATA or XFERCTR & addr_reg have been set up */
+static void isp1362_read_fifo(struct isp1362_hcd *isp1362_hcd, void *buf, u16 len)
+{
+	u8 *dp = buf;
+	u16 data;
+
+	if (!len)
+		return;
+
+	_BUG_ON(!irqs_disabled());
+
+	RDBG("%s: Reading %d byte from fifo to mem @ %p\n", __func__, len, buf);
+#if USE_32BIT
+	if (len >= 4) {
+		RDBG("%s: Using readsl for %d dwords\n", __func__, len >> 2);
+		readsl(isp1362_hcd->data_reg, dp, len >> 2);
+		dp += len & ~3;
+		len &= 3;
+	}
+#endif
+	if (len >= 2) {
+		RDBG("%s: Using readsw for %d words\n", __func__, len >> 1);
+		insw((unsigned long)isp1362_hcd->data_reg, dp, len >> 1);
+		dp += len & ~1;
+		len &= 1;
+	}
+
+	BUG_ON(len & ~1);
+	if (len > 0) {
+		data = isp1362_read_data16(isp1362_hcd);
+		RDBG("%s: Reading trailing byte %02x to mem @ %08x\n", __func__,
+		     (u8)data, (u32)dp);
+		*dp = (u8)data;
+	}
+}
+
+static void isp1362_write_fifo(struct isp1362_hcd *isp1362_hcd, void *buf, u16 len)
+{
+	u8 *dp = buf;
+	u16 data;
+
+	if (!len)
+		return;
+
+	if ((unsigned)dp & 0x1) {
+		/* not aligned */
+		for (; len > 1; len -= 2) {
+			data = *dp++;
+			data |= *dp++ << 8;
+			isp1362_write_data16(isp1362_hcd, data);
+		}
+		if (len)
+			isp1362_write_data16(isp1362_hcd, *dp);
+		return;
+	}
+
+	_BUG_ON(!irqs_disabled());
+
+	RDBG("%s: Writing %d byte to fifo from memory @%p\n", __func__, len, buf);
+#if USE_32BIT
+	if (len >= 4) {
+		RDBG("%s: Using writesl for %d dwords\n", __func__, len >> 2);
+		writesl(isp1362_hcd->data_reg, dp, len >> 2);
+		dp += len & ~3;
+		len &= 3;
+	}
+#endif
+	if (len >= 2) {
+		RDBG("%s: Using writesw for %d words\n", __func__, len >> 1);
+		outsw((unsigned long)isp1362_hcd->data_reg, dp, len >> 1);
+		dp += len & ~1;
+		len &= 1;
+	}
+
+	BUG_ON(len & ~1);
+	if (len > 0) {
+		/* finally write any trailing byte; we don't need to care
+		 * about the high byte of the last word written
+		 */
+		data = (u16)*dp;
+		RDBG("%s: Sending trailing byte %02x from mem @ %08x\n", __func__,
+			data, (u32)dp);
+		isp1362_write_data16(isp1362_hcd, data);
+	}
+}
+
+#define isp1362_read_reg16(d, r)		({			\
+	u16 __v;							\
+	REG_WIDTH_TEST(ISP1362_REG_##r, REG_WIDTH_16);			\
+	isp1362_write_addr(d, ISP1362_REG_##r);				\
+	__v = isp1362_read_data16(d);					\
+	RDBG("%s: Read %04x from %s[%02x]\n", __func__, __v, #r,	\
+	     ISP1362_REG_NO(ISP1362_REG_##r));				\
+	__v;								\
+})
+
+#define isp1362_read_reg32(d, r)		({			\
+	u32 __v;							\
+	REG_WIDTH_TEST(ISP1362_REG_##r, REG_WIDTH_32);			\
+	isp1362_write_addr(d, ISP1362_REG_##r);				\
+	__v = isp1362_read_data32(d);					\
+	RDBG("%s: Read %08x from %s[%02x]\n", __func__, __v, #r,	\
+	     ISP1362_REG_NO(ISP1362_REG_##r));				\
+	__v;								\
+})
+
+#define isp1362_write_reg16(d, r, v)	{					\
+	REG_WIDTH_TEST(ISP1362_REG_##r, REG_WIDTH_16);				\
+	isp1362_write_addr(d, (ISP1362_REG_##r) | ISP1362_REG_WRITE_OFFSET);	\
+	isp1362_write_data16(d, (u16)(v));					\
+	RDBG("%s: Wrote %04x to %s[%02x]\n", __func__, (u16)(v), #r,	\
+	     ISP1362_REG_NO(ISP1362_REG_##r));					\
+}
+
+#define isp1362_write_reg32(d, r, v)	{					\
+	REG_WIDTH_TEST(ISP1362_REG_##r, REG_WIDTH_32);				\
+	isp1362_write_addr(d, (ISP1362_REG_##r) | ISP1362_REG_WRITE_OFFSET);	\
+	isp1362_write_data32(d, (u32)(v));					\
+	RDBG("%s: Wrote %08x to %s[%02x]\n", __func__, (u32)(v), #r,	\
+	     ISP1362_REG_NO(ISP1362_REG_##r));					\
+}
+
+#define isp1362_set_mask16(d, r, m) {			\
+	u16 __v;					\
+	__v = isp1362_read_reg16(d, r);			\
+	if ((__v | m) != __v)				\
+		isp1362_write_reg16(d, r, __v | m);	\
+}
+
+#define isp1362_clr_mask16(d, r, m) {			\
+	u16 __v;					\
+	__v = isp1362_read_reg16(d, r);			\
+	if ((__v & ~m) != __v)			\
+		isp1362_write_reg16(d, r, __v & ~m);	\
+}
+
+#define isp1362_set_mask32(d, r, m) {			\
+	u32 __v;					\
+	__v = isp1362_read_reg32(d, r);			\
+	if ((__v | m) != __v)				\
+		isp1362_write_reg32(d, r, __v | m);	\
+}
+
+#define isp1362_clr_mask32(d, r, m) {			\
+	u32 __v;					\
+	__v = isp1362_read_reg32(d, r);			\
+	if ((__v & ~m) != __v)			\
+		isp1362_write_reg32(d, r, __v & ~m);	\
+}
+
+#ifdef ISP1362_DEBUG
+#define isp1362_show_reg(d, r) {								\
+	if ((ISP1362_REG_##r & REG_WIDTH_MASK) == REG_WIDTH_32)			\
+		DBG(0, "%-12s[%02x]: %08x\n", #r,					\
+			ISP1362_REG_NO(ISP1362_REG_##r), isp1362_read_reg32(d, r));	\
+	else									\
+		DBG(0, "%-12s[%02x]:     %04x\n", #r,					\
+			ISP1362_REG_NO(ISP1362_REG_##r), isp1362_read_reg16(d, r));	\
+}
+#else
+#define isp1362_show_reg(d, r)	do {} while (0)
+#endif
+
+static void __attribute__((__unused__)) isp1362_show_regs(struct isp1362_hcd *isp1362_hcd)
+{
+	isp1362_show_reg(isp1362_hcd, HCREVISION);
+	isp1362_show_reg(isp1362_hcd, HCCONTROL);
+	isp1362_show_reg(isp1362_hcd, HCCMDSTAT);
+	isp1362_show_reg(isp1362_hcd, HCINTSTAT);
+	isp1362_show_reg(isp1362_hcd, HCINTENB);
+	isp1362_show_reg(isp1362_hcd, HCFMINTVL);
+	isp1362_show_reg(isp1362_hcd, HCFMREM);
+	isp1362_show_reg(isp1362_hcd, HCFMNUM);
+	isp1362_show_reg(isp1362_hcd, HCLSTHRESH);
+	isp1362_show_reg(isp1362_hcd, HCRHDESCA);
+	isp1362_show_reg(isp1362_hcd, HCRHDESCB);
+	isp1362_show_reg(isp1362_hcd, HCRHSTATUS);
+	isp1362_show_reg(isp1362_hcd, HCRHPORT1);
+	isp1362_show_reg(isp1362_hcd, HCRHPORT2);
+
+	isp1362_show_reg(isp1362_hcd, HCHWCFG);
+	isp1362_show_reg(isp1362_hcd, HCDMACFG);
+	isp1362_show_reg(isp1362_hcd, HCXFERCTR);
+	isp1362_show_reg(isp1362_hcd, HCuPINT);
+
+	if (in_interrupt())
+		DBG(0, "%-12s[%02x]:     %04x\n", "HCuPINTENB",
+			 ISP1362_REG_NO(ISP1362_REG_HCuPINTENB), isp1362_hcd->irqenb);
+	else
+		isp1362_show_reg(isp1362_hcd, HCuPINTENB);
+	isp1362_show_reg(isp1362_hcd, HCCHIPID);
+	isp1362_show_reg(isp1362_hcd, HCSCRATCH);
+	isp1362_show_reg(isp1362_hcd, HCBUFSTAT);
+	isp1362_show_reg(isp1362_hcd, HCDIRADDR);
+	/* Access would advance fifo
+	 * isp1362_show_reg(isp1362_hcd, HCDIRDATA);
+	 */
+	isp1362_show_reg(isp1362_hcd, HCISTLBUFSZ);
+	isp1362_show_reg(isp1362_hcd, HCISTLRATE);
+	isp1362_show_reg(isp1362_hcd, HCINTLBUFSZ);
+	isp1362_show_reg(isp1362_hcd, HCINTLBLKSZ);
+	isp1362_show_reg(isp1362_hcd, HCINTLDONE);
+	isp1362_show_reg(isp1362_hcd, HCINTLSKIP);
+	isp1362_show_reg(isp1362_hcd, HCINTLLAST);
+	isp1362_show_reg(isp1362_hcd, HCINTLCURR);
+	isp1362_show_reg(isp1362_hcd, HCATLBUFSZ);
+	isp1362_show_reg(isp1362_hcd, HCATLBLKSZ);
+	/* only valid after ATL_DONE interrupt
+	 * isp1362_show_reg(isp1362_hcd, HCATLDONE);
+	 */
+	isp1362_show_reg(isp1362_hcd, HCATLSKIP);
+	isp1362_show_reg(isp1362_hcd, HCATLLAST);
+	isp1362_show_reg(isp1362_hcd, HCATLCURR);
+	isp1362_show_reg(isp1362_hcd, HCATLDTC);
+	isp1362_show_reg(isp1362_hcd, HCATLDTCTO);
+}
+
+static void isp1362_write_diraddr(struct isp1362_hcd *isp1362_hcd, u16 offset, u16 len)
+{
+	_BUG_ON(offset & 1);
+	_BUG_ON(offset >= ISP1362_BUF_SIZE);
+	_BUG_ON(len > ISP1362_BUF_SIZE);
+	_BUG_ON(offset + len > ISP1362_BUF_SIZE);
+	len = (len + 1) & ~1;
+
+	isp1362_clr_mask16(isp1362_hcd, HCDMACFG, HCDMACFG_CTR_ENABLE);
+	isp1362_write_reg32(isp1362_hcd, HCDIRADDR,
+			    HCDIRADDR_ADDR(offset) | HCDIRADDR_COUNT(len));
+}
+
+static void isp1362_read_buffer(struct isp1362_hcd *isp1362_hcd, void *buf, u16 offset, int len)
+{
+	_BUG_ON(offset & 1);
+
+	isp1362_write_diraddr(isp1362_hcd, offset, len);
+
+	DBG(3, "%s: Reading %d byte from buffer @%04x to memory @ %08x\n", __func__,
+	    len, offset, (u32)buf);
+
+	isp1362_write_reg16(isp1362_hcd, HCuPINT, HCuPINT_EOT);
+	_WARN_ON((isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_EOT));
+
+	isp1362_write_addr(isp1362_hcd, ISP1362_REG_HCDIRDATA);
+
+	isp1362_read_fifo(isp1362_hcd, buf, len);
+	_WARN_ON(!(isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_EOT));
+	isp1362_write_reg16(isp1362_hcd, HCuPINT, HCuPINT_EOT);
+	_WARN_ON((isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_EOT));
+}
+
+static void isp1362_write_buffer(struct isp1362_hcd *isp1362_hcd, void *buf, u16 offset, int len)
+{
+	_BUG_ON(offset & 1);
+
+	isp1362_write_diraddr(isp1362_hcd, offset, len);
+
+	DBG(3, "%s: Writing %d byte to buffer @%04x from memory @ %08x\n", __func__,
+	    len, offset, (u32)buf);
+
+	isp1362_write_reg16(isp1362_hcd, HCuPINT, HCuPINT_EOT);
+	_WARN_ON((isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_EOT));
+
+	isp1362_write_addr(isp1362_hcd, ISP1362_REG_HCDIRDATA | ISP1362_REG_WRITE_OFFSET);
+	isp1362_write_fifo(isp1362_hcd, buf, len);
+
+	_WARN_ON(!(isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_EOT));
+	isp1362_write_reg16(isp1362_hcd, HCuPINT, HCuPINT_EOT);
+	_WARN_ON((isp1362_read_reg16(isp1362_hcd, HCuPINT) & HCuPINT_EOT));
+}
+
+static void __attribute__((unused)) dump_data(char *buf, int len)
+{
+	if (dbg_level > 0) {
+		int k;
+		int lf = 0;
+
+		for (k = 0; k < len; ++k) {
+			if (!lf)
+				DBG(0, "%04x:", k);
+			printk(" %02x", ((u8 *) buf)[k]);
+			lf = 1;
+			if (!k)
+				continue;
+			if (k % 16 == 15) {
+				printk("\n");
+				lf = 0;
+				continue;
+			}
+			if (k % 8 == 7)
+				printk(" ");
+			if (k % 4 == 3)
+				printk(" ");
+		}
+		if (lf)
+			printk("\n");
+	}
+}
+
+#if defined(ISP1362_DEBUG) && defined(PTD_TRACE)
+
+static void dump_ptd(struct ptd *ptd)
+{
+	DBG(0, "EP %p: CC=%x EP=%d DIR=%x CNT=%d LEN=%d MPS=%d TGL=%x ACT=%x FA=%d SPD=%x SF=%x PR=%x LST=%x\n",
+	    container_of(ptd, struct isp1362_ep, ptd),
+	    PTD_GET_CC(ptd), PTD_GET_EP(ptd), PTD_GET_DIR(ptd),
+	    PTD_GET_COUNT(ptd), PTD_GET_LEN(ptd), PTD_GET_MPS(ptd),
+	    PTD_GET_TOGGLE(ptd), PTD_GET_ACTIVE(ptd), PTD_GET_FA(ptd),
+	    PTD_GET_SPD(ptd), PTD_GET_SF_INT(ptd), PTD_GET_PR(ptd), PTD_GET_LAST(ptd));
+	DBG(0, "  %04x %04x %04x %04x\n", ptd->count, ptd->mps, ptd->len, ptd->faddr);
+}
+
+static void dump_ptd_out_data(struct ptd *ptd, u8 *buf)
+{
+	if (dbg_level > 0) {
+		if (PTD_GET_DIR(ptd) != PTD_DIR_IN && PTD_GET_LEN(ptd)) {
+			DBG(0, "--out->\n");
+			dump_data(buf, PTD_GET_LEN(ptd));
+		}
+	}
+}
+
+static void dump_ptd_in_data(struct ptd *ptd, u8 *buf)
+{
+	if (dbg_level > 0) {
+		if (PTD_GET_DIR(ptd) == PTD_DIR_IN && PTD_GET_COUNT(ptd)) {
+			DBG(0, "<--in--\n");
+			dump_data(buf, PTD_GET_COUNT(ptd));
+		}
+		DBG(0, "-----\n");
+	}
+}
+
+static void dump_ptd_queue(struct isp1362_ep_queue *epq)
+{
+	struct isp1362_ep *ep;
+	int dbg = dbg_level;
+
+	dbg_level = 1;
+	list_for_each_entry(ep, &epq->active, active) {
+		dump_ptd(&ep->ptd);
+		dump_data(ep->data, ep->length);
+	}
+	dbg_level = dbg;
+}
+#else
+#define dump_ptd(ptd)			do {} while (0)
+#define dump_ptd_in_data(ptd, buf)	do {} while (0)
+#define dump_ptd_out_data(ptd, buf)	do {} while (0)
+#define dump_ptd_data(ptd, buf)		do {} while (0)
+#define dump_ptd_queue(epq)		do {} while (0)
+#endif
diff --git a/include/linux/usb/isp1362.h b/include/linux/usb/isp1362.h
new file mode 100644
index 000000000000..642684bb9292
--- /dev/null
+++ b/include/linux/usb/isp1362.h
@@ -0,0 +1,46 @@
+/*
+ * board initialization code should put one of these into dev->platform_data
+ * and place the isp1362 onto platform_bus.
+ */
+
+#ifndef __LINUX_USB_ISP1362_H__
+#define __LINUX_USB_ISP1362_H__
+
+struct isp1362_platform_data {
+	/* Enable internal pulldown resistors on downstream ports */
+	unsigned sel15Kres:1;
+	/* Clock cannot be stopped */
+	unsigned clknotstop:1;
+	/* On-chip overcurrent protection */
+	unsigned oc_enable:1;
+	/* INT output polarity */
+	unsigned int_act_high:1;
+	/* INT edge or level triggered */
+	unsigned int_edge_triggered:1;
+	/* DREQ output polarity */
+	unsigned dreq_act_high:1;
+	/* DACK input polarity */
+	unsigned dack_act_high:1;
+	/* chip can be resumed via H_WAKEUP pin */
+	unsigned remote_wakeup_connected:1;
+	/* Switch or not to switch (keep always powered) */
+	unsigned no_power_switching:1;
+	/* Ganged port power switching (0) or individual port power switching (1) */
+	unsigned power_switching_mode:1;
+	/* Given port_power, msec/2 after power on till power good */
+	u8 potpg;
+	/* Hardware reset set/clear */
+	void (*reset) (struct device *dev, int set);
+	/* Clock start/stop */
+	void (*clock) (struct device *dev, int start);
+	/* Inter-io delay (ns). The chip is picky about access timings; it
+	 * expects at least:
+	 * 110ns delay between consecutive accesses to DATA_REG,
+	 * 300ns delay between access to ADDR_REG and DATA_REG (registers)
+	 * 462ns delay between access to ADDR_REG and DATA_REG (buffer memory)
+	 * WE MUST NOT be activated during these intervals (even without CS!)
+	 */
+	void (*delay) (struct device *dev, unsigned int delay);
+};
+
+#endif
-- 
cgit v1.2.3-71-gd317


From c35013087aa9b10e4674b53b7c8f7966de83c194 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 29 Jul 2009 14:23:25 +0300
Subject: USB: audio: guard kernel-only code with __KERNEL__

include/linux/usb/audio.h is exported to userspace,
so part of this file that is for internal kernel
usage need to be guarded with ifdef __KERNEL__.
This way make headers_install will stript it out.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/audio.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h
index 7b33c493917f..eaf9dffe0a01 100644
--- a/include/linux/usb/audio.h
+++ b/include/linux/usb/audio.h
@@ -276,6 +276,8 @@ struct uac_iso_endpoint_descriptor {
 #define UAC_FU_BASS_BOOST	(1 << (UAC_BASS_BOOST_CONTROL - 1))
 #define UAC_FU_LOUDNESS		(1 << (UAC_LOUDNESS_CONTROL - 1))
 
+#ifdef __KERNEL__
+
 struct usb_audio_control {
 	struct list_head list;
 	const char *name;
@@ -294,4 +296,6 @@ struct usb_audio_control_selector {
 	struct usb_descriptor_header *desc;
 };
 
+#endif /* __KERNEL__ */
+
 #endif /* __LINUX_USB_AUDIO_H */
-- 
cgit v1.2.3-71-gd317


From 8e8dce065088833fc418bfa5fbf035cb0726c04c Mon Sep 17 00:00:00 2001
From: David VomLehn <dvomlehn@cisco.com>
Date: Fri, 28 Aug 2009 12:54:27 -0700
Subject: USB: use kfifo to buffer usb-generic serial writes

When do_output_char() attempts to write a carriage return/line feed sequence,
it first checks to see how much buffer room is available. If there are at least
two characters free, it will write the carriage return/line feed with two calls
to tty_put_char(). It calls the tty_operation functions write() for devices that
don't support the tty_operations function put_char(). If the USB generic serial
device's write URB is not in use, it will return the buffer size when asked how
much room is available. The write() of the carriage return will cause it to mark
the write URB busy, so the subsequent write() of the line feed will be ignored.

This patch uses the kfifo infrastructure to implement a write FIFO that
accurately returns the amount of space available in the buffer.

Signed-off-by: David VomLehn <dvomlehn@cisco.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/serial/generic.c    | 206 ++++++++++++++++++++++++----------------
 drivers/usb/serial/usb-serial.c |   7 ++
 include/linux/usb/serial.h      |   2 +
 3 files changed, 134 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index d9398e9f30ce..deba08c7a015 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -19,7 +19,7 @@
 #include <linux/usb.h>
 #include <linux/usb/serial.h>
 #include <linux/uaccess.h>
-
+#include <linux/kfifo.h>
 
 static int debug;
 
@@ -166,24 +166,6 @@ static void generic_cleanup(struct usb_serial_port *port)
 	}
 }
 
-int usb_serial_generic_resume(struct usb_serial *serial)
-{
-	struct usb_serial_port *port;
-	int i, c = 0, r;
-
-	for (i = 0; i < serial->num_ports; i++) {
-		port = serial->port[i];
-		if (port->port.count && port->read_urb) {
-			r = usb_submit_urb(port->read_urb, GFP_NOIO);
-			if (r < 0)
-				c++;
-		}
-	}
-
-	return c ? -EIO : 0;
-}
-EXPORT_SYMBOL_GPL(usb_serial_generic_resume);
-
 void usb_serial_generic_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
@@ -272,12 +254,81 @@ error_no_buffer:
 	return bwrite;
 }
 
+/**
+ * usb_serial_generic_write_start - kick off an URB write
+ * @port:	Pointer to the &struct usb_serial_port data
+ *
+ * Returns the number of bytes queued on success. This will be zero if there
+ * was nothing to send. Otherwise, it returns a negative errno value
+ */
+static int usb_serial_generic_write_start(struct usb_serial_port *port)
+{
+	struct usb_serial *serial = port->serial;
+	unsigned char *data;
+	int result;
+	int count;
+	unsigned long flags;
+	bool start_io;
+
+	/* Atomically determine whether we can and need to start a USB
+	 * operation. */
+	spin_lock_irqsave(&port->lock, flags);
+	if (port->write_urb_busy)
+		start_io = false;
+	else {
+		start_io = (__kfifo_len(port->write_fifo) != 0);
+		port->write_urb_busy = start_io;
+	}
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	if (!start_io)
+		return 0;
+
+	data = port->write_urb->transfer_buffer;
+	count = kfifo_get(port->write_fifo, data, port->bulk_out_size);
+	usb_serial_debug_data(debug, &port->dev, __func__, count, data);
+
+	/* set up our urb */
+	usb_fill_bulk_urb(port->write_urb, serial->dev,
+			   usb_sndbulkpipe(serial->dev,
+				port->bulk_out_endpointAddress),
+			   port->write_urb->transfer_buffer, count,
+			   ((serial->type->write_bulk_callback) ?
+			     serial->type->write_bulk_callback :
+			     usb_serial_generic_write_bulk_callback),
+			   port);
+
+	/* send the data out the bulk port */
+	result = usb_submit_urb(port->write_urb, GFP_ATOMIC);
+	if (result) {
+		dev_err(&port->dev,
+			"%s - failed submitting write urb, error %d\n",
+						__func__, result);
+		/* don't have to grab the lock here, as we will
+		   retry if != 0 */
+		port->write_urb_busy = 0;
+	} else
+		result = count;
+
+	return result;
+}
+
+/**
+ * usb_serial_generic_write - generic write function for serial USB devices
+ * @tty:	Pointer to &struct tty_struct for the device
+ * @port:	Pointer to the &usb_serial_port structure for the device
+ * @buf:	Pointer to the data to write
+ * @count:	Number of bytes to write
+ *
+ * Returns the number of characters actually written, which may be anything
+ * from zero to @count. If an error occurs, it returns the negative errno
+ * value.
+ */
 int usb_serial_generic_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count)
 {
 	struct usb_serial *serial = port->serial;
 	int result;
-	unsigned char *data;
 
 	dbg("%s - port %d", __func__, port->number);
 
@@ -287,57 +338,20 @@ int usb_serial_generic_write(struct tty_struct *tty,
 	}
 
 	/* only do something if we have a bulk out endpoint */
-	if (serial->num_bulk_out) {
-		unsigned long flags;
-
-		if (serial->type->max_in_flight_urbs)
-			return usb_serial_multi_urb_write(tty, port,
-							  buf, count);
-
-		spin_lock_irqsave(&port->lock, flags);
-		if (port->write_urb_busy) {
-			spin_unlock_irqrestore(&port->lock, flags);
-			dbg("%s - already writing", __func__);
-			return 0;
-		}
-		port->write_urb_busy = 1;
-		spin_unlock_irqrestore(&port->lock, flags);
-
-		count = (count > port->bulk_out_size) ?
-					port->bulk_out_size : count;
-
-		memcpy(port->write_urb->transfer_buffer, buf, count);
-		data = port->write_urb->transfer_buffer;
-		usb_serial_debug_data(debug, &port->dev, __func__, count, data);
+	if (!serial->num_bulk_out)
+		return 0;
 
-		/* set up our urb */
-		usb_fill_bulk_urb(port->write_urb, serial->dev,
-				   usb_sndbulkpipe(serial->dev,
-					port->bulk_out_endpointAddress),
-				   port->write_urb->transfer_buffer, count,
-				   ((serial->type->write_bulk_callback) ?
-				     serial->type->write_bulk_callback :
-				     usb_serial_generic_write_bulk_callback),
-				   port);
+	if (serial->type->max_in_flight_urbs)
+		return usb_serial_multi_urb_write(tty, port,
+						  buf, count);
 
-		/* send the data out the bulk port */
-		port->write_urb_busy = 1;
-		result = usb_submit_urb(port->write_urb, GFP_ATOMIC);
-		if (result) {
-			dev_err(&port->dev,
-				"%s - failed submitting write urb, error %d\n",
-							__func__, result);
-			/* don't have to grab the lock here, as we will
-			   retry if != 0 */
-			port->write_urb_busy = 0;
-		} else
-			result = count;
+	count = kfifo_put(port->write_fifo, buf, count);
+	result = usb_serial_generic_write_start(port);
 
-		return result;
-	}
+	if (result >= 0)
+		result = count;
 
-	/* no bulk out, so return 0 bytes written */
-	return 0;
+	return result;
 }
 EXPORT_SYMBOL_GPL(usb_serial_generic_write);
 
@@ -355,9 +369,8 @@ int usb_serial_generic_write_room(struct tty_struct *tty)
 			room = port->bulk_out_size *
 				(serial->type->max_in_flight_urbs -
 				 port->urbs_in_flight);
-	} else if (serial->num_bulk_out && !(port->write_urb_busy)) {
-		room = port->bulk_out_size;
-	}
+	} else if (serial->num_bulk_out)
+		room = port->write_fifo->size - __kfifo_len(port->write_fifo);
 	spin_unlock_irqrestore(&port->lock, flags);
 
 	dbg("%s - returns %d", __func__, room);
@@ -377,11 +390,8 @@ int usb_serial_generic_chars_in_buffer(struct tty_struct *tty)
 		spin_lock_irqsave(&port->lock, flags);
 		chars = port->tx_bytes_flight;
 		spin_unlock_irqrestore(&port->lock, flags);
-	} else if (serial->num_bulk_out) {
-		/* FIXME: Locking */
-		if (port->write_urb_busy)
-			chars = port->write_urb->transfer_buffer_length;
-	}
+	} else if (serial->num_bulk_out)
+		chars = kfifo_len(port->write_fifo);
 
 	dbg("%s - returns %d", __func__, chars);
 	return chars;
@@ -485,16 +495,23 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb)
 		if (port->urbs_in_flight < 0)
 			port->urbs_in_flight = 0;
 		spin_unlock_irqrestore(&port->lock, flags);
+
+		if (status) {
+			dbg("%s - nonzero multi-urb write bulk status "
+				"received: %d", __func__, status);
+			return;
+		}
 	} else {
-		/* Handle the case for single urb mode */
 		port->write_urb_busy = 0;
-	}
 
-	if (status) {
-		dbg("%s - nonzero write bulk status received: %d",
-		    __func__, status);
-		return;
+		if (status) {
+			dbg("%s - nonzero multi-urb write bulk status "
+				"received: %d", __func__, status);
+			kfifo_reset(port->write_fifo);
+		} else
+			usb_serial_generic_write_start(port);
 	}
+
 	usb_serial_port_softint(port);
 }
 EXPORT_SYMBOL_GPL(usb_serial_generic_write_bulk_callback);
@@ -559,6 +576,33 @@ int usb_serial_handle_break(struct usb_serial_port *port)
 }
 EXPORT_SYMBOL_GPL(usb_serial_handle_break);
 
+int usb_serial_generic_resume(struct usb_serial *serial)
+{
+	struct usb_serial_port *port;
+	int i, c = 0, r;
+
+	for (i = 0; i < serial->num_ports; i++) {
+		port = serial->port[i];
+		if (!port->port.count)
+			continue;
+
+		if (port->read_urb) {
+			r = usb_submit_urb(port->read_urb, GFP_NOIO);
+			if (r < 0)
+				c++;
+		}
+
+		if (port->write_urb) {
+			r = usb_serial_generic_write_start(port);
+			if (r < 0)
+				c++;
+		}
+	}
+
+	return c ? -EIO : 0;
+}
+EXPORT_SYMBOL_GPL(usb_serial_generic_resume);
+
 void usb_serial_generic_disconnect(struct usb_serial *serial)
 {
 	int i;
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 45975b4984ea..ff75a3589e7e 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -35,6 +35,7 @@
 #include <linux/serial.h>
 #include <linux/usb.h>
 #include <linux/usb/serial.h>
+#include <linux/kfifo.h>
 #include "pl2303.h"
 
 /*
@@ -625,6 +626,8 @@ static void port_release(struct device *dev)
 	usb_free_urb(port->write_urb);
 	usb_free_urb(port->interrupt_in_urb);
 	usb_free_urb(port->interrupt_out_urb);
+	if (!IS_ERR(port->write_fifo) && port->write_fifo)
+		kfifo_free(port->write_fifo);
 	kfree(port->bulk_in_buffer);
 	kfree(port->bulk_out_buffer);
 	kfree(port->interrupt_in_buffer);
@@ -964,6 +967,10 @@ int usb_serial_probe(struct usb_interface *interface,
 			dev_err(&interface->dev, "No free urbs available\n");
 			goto probe_error;
 		}
+		port->write_fifo = kfifo_alloc(PAGE_SIZE, GFP_KERNEL,
+			&port->lock);
+		if (IS_ERR(port->write_fifo))
+			goto probe_error;
 		buffer_size = le16_to_cpu(endpoint->wMaxPacketSize);
 		port->bulk_out_size = buffer_size;
 		port->bulk_out_endpointAddress = endpoint->bEndpointAddress;
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 7b85e327af91..c17eb64d7213 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -59,6 +59,7 @@ enum port_dev_state {
  * @bulk_out_buffer: pointer to the bulk out buffer for this port.
  * @bulk_out_size: the size of the bulk_out_buffer, in bytes.
  * @write_urb: pointer to the bulk out struct urb for this port.
+ * @write_fifo: kfifo used to buffer outgoing data
  * @write_urb_busy: port`s writing status
  * @bulk_out_endpointAddress: endpoint address for the bulk out pipe for this
  *	port.
@@ -96,6 +97,7 @@ struct usb_serial_port {
 	unsigned char		*bulk_out_buffer;
 	int			bulk_out_size;
 	struct urb		*write_urb;
+	struct kfifo		*write_fifo;
 	int			write_urb_busy;
 	__u8			bulk_out_endpointAddress;
 
-- 
cgit v1.2.3-71-gd317


From df6c516900d48df3581b23d37d6516a22ec4f2ca Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 Aug 2009 15:39:48 -0500
Subject: USB: ehci,dbgp,early_printk: split ehci debug driver from
 early_printk.c

Move the dbgp early printk driver in advance of refactoring and adding
new code, so the changes to this code are tracked separately from the
move of the code.

The drivers/usb/early directory will be the location of the current
and future early usb code for driving usb devices prior initializing
the standard interrupt driven USB drivers.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/kernel/early_printk.c | 715 ----------------------------------------
 drivers/usb/Makefile           |   1 +
 drivers/usb/early/Makefile     |   5 +
 drivers/usb/early/ehci-dbgp.c  | 723 +++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/ehci_def.h   |   6 +
 5 files changed, 735 insertions(+), 715 deletions(-)
 create mode 100644 drivers/usb/early/Makefile
 create mode 100644 drivers/usb/early/ehci-dbgp.c

(limited to 'include/linux')

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b11cab3c323a..519a5e10be53 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -160,721 +160,6 @@ static struct console early_serial_console = {
 	.index =	-1,
 };
 
-#ifdef CONFIG_EARLY_PRINTK_DBGP
-
-static struct ehci_caps __iomem *ehci_caps;
-static struct ehci_regs __iomem *ehci_regs;
-static struct ehci_dbg_port __iomem *ehci_debug;
-static unsigned int dbgp_endpoint_out;
-
-struct ehci_dev {
-	u32 bus;
-	u32 slot;
-	u32 func;
-};
-
-static struct ehci_dev ehci_dev;
-
-#define USB_DEBUG_DEVNUM 127
-
-#define DBGP_DATA_TOGGLE	0x8800
-
-static inline u32 dbgp_pid_update(u32 x, u32 tok)
-{
-	return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
-}
-
-static inline u32 dbgp_len_update(u32 x, u32 len)
-{
-	return (x & ~0x0f) | (len & 0x0f);
-}
-
-/*
- * USB Packet IDs (PIDs)
- */
-
-/* token */
-#define USB_PID_OUT		0xe1
-#define USB_PID_IN		0x69
-#define USB_PID_SOF		0xa5
-#define USB_PID_SETUP		0x2d
-/* handshake */
-#define USB_PID_ACK		0xd2
-#define USB_PID_NAK		0x5a
-#define USB_PID_STALL		0x1e
-#define USB_PID_NYET		0x96
-/* data */
-#define USB_PID_DATA0		0xc3
-#define USB_PID_DATA1		0x4b
-#define USB_PID_DATA2		0x87
-#define USB_PID_MDATA		0x0f
-/* Special */
-#define USB_PID_PREAMBLE	0x3c
-#define USB_PID_ERR		0x3c
-#define USB_PID_SPLIT		0x78
-#define USB_PID_PING		0xb4
-#define USB_PID_UNDEF_0		0xf0
-
-#define USB_PID_DATA_TOGGLE	0x88
-#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
-
-#define PCI_CAP_ID_EHCI_DEBUG	0xa
-
-#define HUB_ROOT_RESET_TIME	50	/* times are in msec */
-#define HUB_SHORT_RESET_TIME	10
-#define HUB_LONG_RESET_TIME	200
-#define HUB_RESET_TIMEOUT	500
-
-#define DBGP_MAX_PACKET		8
-
-static int dbgp_wait_until_complete(void)
-{
-	u32 ctrl;
-	int loop = 0x100000;
-
-	do {
-		ctrl = readl(&ehci_debug->control);
-		/* Stop when the transaction is finished */
-		if (ctrl & DBGP_DONE)
-			break;
-	} while (--loop > 0);
-
-	if (!loop)
-		return -1;
-
-	/*
-	 * Now that we have observed the completed transaction,
-	 * clear the done bit.
-	 */
-	writel(ctrl | DBGP_DONE, &ehci_debug->control);
-	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
-}
-
-static void __init dbgp_mdelay(int ms)
-{
-	int i;
-
-	while (ms--) {
-		for (i = 0; i < 1000; i++)
-			outb(0x1, 0x80);
-	}
-}
-
-static void dbgp_breath(void)
-{
-	/* Sleep to give the debug port a chance to breathe */
-}
-
-static int dbgp_wait_until_done(unsigned ctrl)
-{
-	u32 pids, lpid;
-	int ret;
-	int loop = 3;
-
-retry:
-	writel(ctrl | DBGP_GO, &ehci_debug->control);
-	ret = dbgp_wait_until_complete();
-	pids = readl(&ehci_debug->pids);
-	lpid = DBGP_PID_GET(pids);
-
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * If the port is getting full or it has dropped data
-	 * start pacing ourselves, not necessary but it's friendly.
-	 */
-	if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
-		dbgp_breath();
-
-	/* If I get a NACK reissue the transmission */
-	if (lpid == USB_PID_NAK) {
-		if (--loop > 0)
-			goto retry;
-	}
-
-	return ret;
-}
-
-static void dbgp_set_data(const void *buf, int size)
-{
-	const unsigned char *bytes = buf;
-	u32 lo, hi;
-	int i;
-
-	lo = hi = 0;
-	for (i = 0; i < 4 && i < size; i++)
-		lo |= bytes[i] << (8*i);
-	for (; i < 8 && i < size; i++)
-		hi |= bytes[i] << (8*(i - 4));
-	writel(lo, &ehci_debug->data03);
-	writel(hi, &ehci_debug->data47);
-}
-
-static void __init dbgp_get_data(void *buf, int size)
-{
-	unsigned char *bytes = buf;
-	u32 lo, hi;
-	int i;
-
-	lo = readl(&ehci_debug->data03);
-	hi = readl(&ehci_debug->data47);
-	for (i = 0; i < 4 && i < size; i++)
-		bytes[i] = (lo >> (8*i)) & 0xff;
-	for (; i < 8 && i < size; i++)
-		bytes[i] = (hi >> (8*(i - 4))) & 0xff;
-}
-
-static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
-			 const char *bytes, int size)
-{
-	u32 pids, addr, ctrl;
-	int ret;
-
-	if (size > DBGP_MAX_PACKET)
-		return -1;
-
-	addr = DBGP_EPADDR(devnum, endpoint);
-
-	pids = readl(&ehci_debug->pids);
-	pids = dbgp_pid_update(pids, USB_PID_OUT);
-
-	ctrl = readl(&ehci_debug->control);
-	ctrl = dbgp_len_update(ctrl, size);
-	ctrl |= DBGP_OUT;
-	ctrl |= DBGP_GO;
-
-	dbgp_set_data(bytes, size);
-	writel(addr, &ehci_debug->address);
-	writel(pids, &ehci_debug->pids);
-
-	ret = dbgp_wait_until_done(ctrl);
-	if (ret < 0)
-		return ret;
-
-	return ret;
-}
-
-static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
-				 int size)
-{
-	u32 pids, addr, ctrl;
-	int ret;
-
-	if (size > DBGP_MAX_PACKET)
-		return -1;
-
-	addr = DBGP_EPADDR(devnum, endpoint);
-
-	pids = readl(&ehci_debug->pids);
-	pids = dbgp_pid_update(pids, USB_PID_IN);
-
-	ctrl = readl(&ehci_debug->control);
-	ctrl = dbgp_len_update(ctrl, size);
-	ctrl &= ~DBGP_OUT;
-	ctrl |= DBGP_GO;
-
-	writel(addr, &ehci_debug->address);
-	writel(pids, &ehci_debug->pids);
-	ret = dbgp_wait_until_done(ctrl);
-	if (ret < 0)
-		return ret;
-
-	if (size > ret)
-		size = ret;
-	dbgp_get_data(data, size);
-	return ret;
-}
-
-static int __init dbgp_control_msg(unsigned devnum, int requesttype,
-	int request, int value, int index, void *data, int size)
-{
-	u32 pids, addr, ctrl;
-	struct usb_ctrlrequest req;
-	int read;
-	int ret;
-
-	read = (requesttype & USB_DIR_IN) != 0;
-	if (size > (read ? DBGP_MAX_PACKET:0))
-		return -1;
-
-	/* Compute the control message */
-	req.bRequestType = requesttype;
-	req.bRequest = request;
-	req.wValue = cpu_to_le16(value);
-	req.wIndex = cpu_to_le16(index);
-	req.wLength = cpu_to_le16(size);
-
-	pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
-	addr = DBGP_EPADDR(devnum, 0);
-
-	ctrl = readl(&ehci_debug->control);
-	ctrl = dbgp_len_update(ctrl, sizeof(req));
-	ctrl |= DBGP_OUT;
-	ctrl |= DBGP_GO;
-
-	/* Send the setup message */
-	dbgp_set_data(&req, sizeof(req));
-	writel(addr, &ehci_debug->address);
-	writel(pids, &ehci_debug->pids);
-	ret = dbgp_wait_until_done(ctrl);
-	if (ret < 0)
-		return ret;
-
-	/* Read the result */
-	return dbgp_bulk_read(devnum, 0, data, size);
-}
-
-
-/* Find a PCI capability */
-static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
-{
-	u8 pos;
-	int bytes;
-
-	if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
-		PCI_STATUS_CAP_LIST))
-		return 0;
-
-	pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
-	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
-		u8 id;
-
-		pos &= ~3;
-		id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
-		if (id == 0xff)
-			break;
-		if (id == cap)
-			return pos;
-
-		pos = read_pci_config_byte(num, slot, func,
-						 pos+PCI_CAP_LIST_NEXT);
-	}
-	return 0;
-}
-
-static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
-{
-	u32 class;
-
-	class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
-	if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
-		return 0;
-
-	return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
-}
-
-static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
-{
-	u32 bus, slot, func;
-
-	for (bus = 0; bus < 256; bus++) {
-		for (slot = 0; slot < 32; slot++) {
-			for (func = 0; func < 8; func++) {
-				unsigned cap;
-
-				cap = __find_dbgp(bus, slot, func);
-
-				if (!cap)
-					continue;
-				if (ehci_num-- != 0)
-					continue;
-				*rbus = bus;
-				*rslot = slot;
-				*rfunc = func;
-				return cap;
-			}
-		}
-	}
-	return 0;
-}
-
-static int __init ehci_reset_port(int port)
-{
-	u32 portsc;
-	u32 delay_time, delay;
-	int loop;
-
-	/* Reset the usb debug port */
-	portsc = readl(&ehci_regs->port_status[port - 1]);
-	portsc &= ~PORT_PE;
-	portsc |= PORT_RESET;
-	writel(portsc, &ehci_regs->port_status[port - 1]);
-
-	delay = HUB_ROOT_RESET_TIME;
-	for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
-	     delay_time += delay) {
-		dbgp_mdelay(delay);
-
-		portsc = readl(&ehci_regs->port_status[port - 1]);
-		if (portsc & PORT_RESET) {
-			/* force reset to complete */
-			loop = 2;
-			writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
-				&ehci_regs->port_status[port - 1]);
-			do {
-				portsc = readl(&ehci_regs->port_status[port-1]);
-			} while ((portsc & PORT_RESET) && (--loop > 0));
-		}
-
-		/* Device went away? */
-		if (!(portsc & PORT_CONNECT))
-			return -ENOTCONN;
-
-		/* bomb out completely if something weird happend */
-		if ((portsc & PORT_CSC))
-			return -EINVAL;
-
-		/* If we've finished resetting, then break out of the loop */
-		if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
-			return 0;
-	}
-	return -EBUSY;
-}
-
-static int __init ehci_wait_for_port(int port)
-{
-	u32 status;
-	int ret, reps;
-
-	for (reps = 0; reps < 3; reps++) {
-		dbgp_mdelay(100);
-		status = readl(&ehci_regs->status);
-		if (status & STS_PCD) {
-			ret = ehci_reset_port(port);
-			if (ret == 0)
-				return 0;
-		}
-	}
-	return -ENOTCONN;
-}
-
-#ifdef DBGP_DEBUG
-# define dbgp_printk early_printk
-#else
-static inline void dbgp_printk(const char *fmt, ...) { }
-#endif
-
-typedef void (*set_debug_port_t)(int port);
-
-static void __init default_set_debug_port(int port)
-{
-}
-
-static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-
-static void __init nvidia_set_debug_port(int port)
-{
-	u32 dword;
-	dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
-				 0x74);
-	dword &= ~(0x0f<<12);
-	dword |= ((port & 0x0f)<<12);
-	write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
-				 dword);
-	dbgp_printk("set debug port to %d\n", port);
-}
-
-static void __init detect_set_debug_port(void)
-{
-	u32 vendorid;
-
-	vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
-		 0x00);
-
-	if ((vendorid & 0xffff) == 0x10de) {
-		dbgp_printk("using nvidia set_debug_port\n");
-		set_debug_port = nvidia_set_debug_port;
-	}
-}
-
-static int __init ehci_setup(void)
-{
-	struct usb_debug_descriptor dbgp_desc;
-	u32 cmd, ctrl, status, portsc, hcs_params;
-	u32 debug_port, new_debug_port = 0, n_ports;
-	u32  devnum;
-	int ret, i;
-	int loop;
-	int port_map_tried;
-	int playtimes = 3;
-
-try_next_time:
-	port_map_tried = 0;
-
-try_next_port:
-
-	hcs_params = readl(&ehci_caps->hcs_params);
-	debug_port = HCS_DEBUG_PORT(hcs_params);
-	n_ports    = HCS_N_PORTS(hcs_params);
-
-	dbgp_printk("debug_port: %d\n", debug_port);
-	dbgp_printk("n_ports:    %d\n", n_ports);
-
-	for (i = 1; i <= n_ports; i++) {
-		portsc = readl(&ehci_regs->port_status[i-1]);
-		dbgp_printk("portstatus%d: %08x\n", i, portsc);
-	}
-
-	if (port_map_tried && (new_debug_port != debug_port)) {
-		if (--playtimes) {
-			set_debug_port(new_debug_port);
-			goto try_next_time;
-		}
-		return -1;
-	}
-
-	loop = 100000;
-	/* Reset the EHCI controller */
-	cmd = readl(&ehci_regs->command);
-	cmd |= CMD_RESET;
-	writel(cmd, &ehci_regs->command);
-	do {
-		cmd = readl(&ehci_regs->command);
-	} while ((cmd & CMD_RESET) && (--loop > 0));
-
-	if (!loop) {
-		dbgp_printk("can not reset ehci\n");
-		return -1;
-	}
-	dbgp_printk("ehci reset done\n");
-
-	/* Claim ownership, but do not enable yet */
-	ctrl = readl(&ehci_debug->control);
-	ctrl |= DBGP_OWNER;
-	ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
-	writel(ctrl, &ehci_debug->control);
-
-	/* Start the ehci running */
-	cmd = readl(&ehci_regs->command);
-	cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
-	cmd |= CMD_RUN;
-	writel(cmd, &ehci_regs->command);
-
-	/* Ensure everything is routed to the EHCI */
-	writel(FLAG_CF, &ehci_regs->configured_flag);
-
-	/* Wait until the controller is no longer halted */
-	loop = 10;
-	do {
-		status = readl(&ehci_regs->status);
-	} while ((status & STS_HALT) && (--loop > 0));
-
-	if (!loop) {
-		dbgp_printk("ehci can be started\n");
-		return -1;
-	}
-	dbgp_printk("ehci started\n");
-
-	/* Wait for a device to show up in the debug port */
-	ret = ehci_wait_for_port(debug_port);
-	if (ret < 0) {
-		dbgp_printk("No device found in debug port\n");
-		goto next_debug_port;
-	}
-	dbgp_printk("ehci wait for port done\n");
-
-	/* Enable the debug port */
-	ctrl = readl(&ehci_debug->control);
-	ctrl |= DBGP_CLAIM;
-	writel(ctrl, &ehci_debug->control);
-	ctrl = readl(&ehci_debug->control);
-	if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
-		dbgp_printk("No device in debug port\n");
-		writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
-		goto err;
-	}
-	dbgp_printk("debug ported enabled\n");
-
-	/* Completely transfer the debug device to the debug controller */
-	portsc = readl(&ehci_regs->port_status[debug_port - 1]);
-	portsc &= ~PORT_PE;
-	writel(portsc, &ehci_regs->port_status[debug_port - 1]);
-
-	dbgp_mdelay(100);
-
-	/* Find the debug device and make it device number 127 */
-	for (devnum = 0; devnum <= 127; devnum++) {
-		ret = dbgp_control_msg(devnum,
-			USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
-			USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
-			&dbgp_desc, sizeof(dbgp_desc));
-		if (ret > 0)
-			break;
-	}
-	if (devnum > 127) {
-		dbgp_printk("Could not find attached debug device\n");
-		goto err;
-	}
-	if (ret < 0) {
-		dbgp_printk("Attached device is not a debug device\n");
-		goto err;
-	}
-	dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
-
-	/* Move the device to 127 if it isn't already there */
-	if (devnum != USB_DEBUG_DEVNUM) {
-		ret = dbgp_control_msg(devnum,
-			USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
-			USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
-		if (ret < 0) {
-			dbgp_printk("Could not move attached device to %d\n",
-				USB_DEBUG_DEVNUM);
-			goto err;
-		}
-		devnum = USB_DEBUG_DEVNUM;
-		dbgp_printk("debug device renamed to 127\n");
-	}
-
-	/* Enable the debug interface */
-	ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
-		USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
-		USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
-	if (ret < 0) {
-		dbgp_printk(" Could not enable the debug device\n");
-		goto err;
-	}
-	dbgp_printk("debug interface enabled\n");
-
-	/* Perform a small write to get the even/odd data state in sync
-	 */
-	ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
-	if (ret < 0) {
-		dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
-		goto err;
-	}
-	dbgp_printk("small write doned\n");
-
-	return 0;
-err:
-	/* Things didn't work so remove my claim */
-	ctrl = readl(&ehci_debug->control);
-	ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
-	writel(ctrl, &ehci_debug->control);
-	return -1;
-
-next_debug_port:
-	port_map_tried |= (1<<(debug_port - 1));
-	new_debug_port = ((debug_port-1+1)%n_ports) + 1;
-	if (port_map_tried != ((1<<n_ports) - 1)) {
-		set_debug_port(new_debug_port);
-		goto try_next_port;
-	}
-	if (--playtimes) {
-		set_debug_port(new_debug_port);
-		goto try_next_time;
-	}
-
-	return -1;
-}
-
-static int __init early_dbgp_init(char *s)
-{
-	u32 debug_port, bar, offset;
-	u32 bus, slot, func, cap;
-	void __iomem *ehci_bar;
-	u32 dbgp_num;
-	u32 bar_val;
-	char *e;
-	int ret;
-	u8 byte;
-
-	if (!early_pci_allowed())
-		return -1;
-
-	dbgp_num = 0;
-	if (*s)
-		dbgp_num = simple_strtoul(s, &e, 10);
-	dbgp_printk("dbgp_num: %d\n", dbgp_num);
-
-	cap = find_dbgp(dbgp_num, &bus, &slot, &func);
-	if (!cap)
-		return -1;
-
-	dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
-			 func);
-
-	debug_port = read_pci_config(bus, slot, func, cap);
-	bar = (debug_port >> 29) & 0x7;
-	bar = (bar * 4) + 0xc;
-	offset = (debug_port >> 16) & 0xfff;
-	dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
-	if (bar != PCI_BASE_ADDRESS_0) {
-		dbgp_printk("only debug ports on bar 1 handled.\n");
-
-		return -1;
-	}
-
-	bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
-	dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
-	if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
-		dbgp_printk("only simple 32bit mmio bars supported\n");
-
-		return -1;
-	}
-
-	/* double check if the mem space is enabled */
-	byte = read_pci_config_byte(bus, slot, func, 0x04);
-	if (!(byte & 0x2)) {
-		byte  |= 0x02;
-		write_pci_config_byte(bus, slot, func, 0x04, byte);
-		dbgp_printk("mmio for ehci enabled\n");
-	}
-
-	/*
-	 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
-	 * than enough.  1K is the biggest I have seen.
-	 */
-	set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
-	ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
-	ehci_bar += bar_val & ~PAGE_MASK;
-	dbgp_printk("ehci_bar: %p\n", ehci_bar);
-
-	ehci_caps  = ehci_bar;
-	ehci_regs  = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
-	ehci_debug = ehci_bar + offset;
-	ehci_dev.bus = bus;
-	ehci_dev.slot = slot;
-	ehci_dev.func = func;
-
-	detect_set_debug_port();
-
-	ret = ehci_setup();
-	if (ret < 0) {
-		dbgp_printk("ehci_setup failed\n");
-		ehci_debug = NULL;
-
-		return -1;
-	}
-
-	return 0;
-}
-
-static void early_dbgp_write(struct console *con, const char *str, u32 n)
-{
-	int chunk, ret;
-
-	if (!ehci_debug)
-		return;
-	while (n > 0) {
-		chunk = n;
-		if (chunk > DBGP_MAX_PACKET)
-			chunk = DBGP_MAX_PACKET;
-		ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
-			dbgp_endpoint_out, str, chunk);
-		str += chunk;
-		n -= chunk;
-	}
-}
-
-static struct console early_dbgp_console = {
-	.name =		"earlydbg",
-	.write =	early_dbgp_write,
-	.flags =	CON_PRINTBUFFER,
-	.index =	-1,
-};
-#endif
-
 /* Direct interface for emergencies */
 static struct console *early_console = &early_vga_console;
 static int __initdata early_console_initialized;
diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index dbe4fb694ad2..be3c9b80bc9f 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_USB_MICROTEK)	+= image/
 obj-$(CONFIG_USB_SERIAL)	+= serial/
 
 obj-$(CONFIG_USB)		+= misc/
+obj-y				+= early/
 
 obj-$(CONFIG_USB_ATM)		+= atm/
 obj-$(CONFIG_USB_SPEEDTOUCH)	+= atm/
diff --git a/drivers/usb/early/Makefile b/drivers/usb/early/Makefile
new file mode 100644
index 000000000000..dfedee8c45b6
--- /dev/null
+++ b/drivers/usb/early/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for early USB devices
+#
+
+obj-$(CONFIG_EARLY_PRINTK_DBGP)        += ehci-dbgp.o
diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c
new file mode 100644
index 000000000000..821b7b21c29c
--- /dev/null
+++ b/drivers/usb/early/ehci-dbgp.c
@@ -0,0 +1,723 @@
+#include <linux/console.h>
+#include <linux/errno.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/ehci_def.h>
+#include <linux/delay.h>
+#include <asm/io.h>
+#include <asm/pci-direct.h>
+#include <asm/fixmap.h>
+
+static struct ehci_caps __iomem *ehci_caps;
+static struct ehci_regs __iomem *ehci_regs;
+static struct ehci_dbg_port __iomem *ehci_debug;
+static unsigned int dbgp_endpoint_out;
+
+struct ehci_dev {
+	u32 bus;
+	u32 slot;
+	u32 func;
+};
+
+static struct ehci_dev ehci_dev;
+
+#define USB_DEBUG_DEVNUM 127
+
+#define DBGP_DATA_TOGGLE	0x8800
+
+static inline u32 dbgp_pid_update(u32 x, u32 tok)
+{
+	return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
+}
+
+static inline u32 dbgp_len_update(u32 x, u32 len)
+{
+	return (x & ~0x0f) | (len & 0x0f);
+}
+
+/*
+ * USB Packet IDs (PIDs)
+ */
+
+/* token */
+#define USB_PID_OUT		0xe1
+#define USB_PID_IN		0x69
+#define USB_PID_SOF		0xa5
+#define USB_PID_SETUP		0x2d
+/* handshake */
+#define USB_PID_ACK		0xd2
+#define USB_PID_NAK		0x5a
+#define USB_PID_STALL		0x1e
+#define USB_PID_NYET		0x96
+/* data */
+#define USB_PID_DATA0		0xc3
+#define USB_PID_DATA1		0x4b
+#define USB_PID_DATA2		0x87
+#define USB_PID_MDATA		0x0f
+/* Special */
+#define USB_PID_PREAMBLE	0x3c
+#define USB_PID_ERR		0x3c
+#define USB_PID_SPLIT		0x78
+#define USB_PID_PING		0xb4
+#define USB_PID_UNDEF_0		0xf0
+
+#define USB_PID_DATA_TOGGLE	0x88
+#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
+
+#define PCI_CAP_ID_EHCI_DEBUG	0xa
+
+#define HUB_ROOT_RESET_TIME	50	/* times are in msec */
+#define HUB_SHORT_RESET_TIME	10
+#define HUB_LONG_RESET_TIME	200
+#define HUB_RESET_TIMEOUT	500
+
+#define DBGP_MAX_PACKET		8
+
+static int dbgp_wait_until_complete(void)
+{
+	u32 ctrl;
+	int loop = 0x100000;
+
+	do {
+		ctrl = readl(&ehci_debug->control);
+		/* Stop when the transaction is finished */
+		if (ctrl & DBGP_DONE)
+			break;
+	} while (--loop > 0);
+
+	if (!loop)
+		return -1;
+
+	/*
+	 * Now that we have observed the completed transaction,
+	 * clear the done bit.
+	 */
+	writel(ctrl | DBGP_DONE, &ehci_debug->control);
+	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
+}
+
+static void __init dbgp_mdelay(int ms)
+{
+	int i;
+
+	while (ms--) {
+		for (i = 0; i < 1000; i++)
+			outb(0x1, 0x80);
+	}
+}
+
+static void dbgp_breath(void)
+{
+	/* Sleep to give the debug port a chance to breathe */
+}
+
+static int dbgp_wait_until_done(unsigned ctrl)
+{
+	u32 pids, lpid;
+	int ret;
+	int loop = 3;
+
+retry:
+	writel(ctrl | DBGP_GO, &ehci_debug->control);
+	ret = dbgp_wait_until_complete();
+	pids = readl(&ehci_debug->pids);
+	lpid = DBGP_PID_GET(pids);
+
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * If the port is getting full or it has dropped data
+	 * start pacing ourselves, not necessary but it's friendly.
+	 */
+	if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
+		dbgp_breath();
+
+	/* If I get a NACK reissue the transmission */
+	if (lpid == USB_PID_NAK) {
+		if (--loop > 0)
+			goto retry;
+	}
+
+	return ret;
+}
+
+static void dbgp_set_data(const void *buf, int size)
+{
+	const unsigned char *bytes = buf;
+	u32 lo, hi;
+	int i;
+
+	lo = hi = 0;
+	for (i = 0; i < 4 && i < size; i++)
+		lo |= bytes[i] << (8*i);
+	for (; i < 8 && i < size; i++)
+		hi |= bytes[i] << (8*(i - 4));
+	writel(lo, &ehci_debug->data03);
+	writel(hi, &ehci_debug->data47);
+}
+
+static void __init dbgp_get_data(void *buf, int size)
+{
+	unsigned char *bytes = buf;
+	u32 lo, hi;
+	int i;
+
+	lo = readl(&ehci_debug->data03);
+	hi = readl(&ehci_debug->data47);
+	for (i = 0; i < 4 && i < size; i++)
+		bytes[i] = (lo >> (8*i)) & 0xff;
+	for (; i < 8 && i < size; i++)
+		bytes[i] = (hi >> (8*(i - 4))) & 0xff;
+}
+
+static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
+			 const char *bytes, int size)
+{
+	u32 pids, addr, ctrl;
+	int ret;
+
+	if (size > DBGP_MAX_PACKET)
+		return -1;
+
+	addr = DBGP_EPADDR(devnum, endpoint);
+
+	pids = readl(&ehci_debug->pids);
+	pids = dbgp_pid_update(pids, USB_PID_OUT);
+
+	ctrl = readl(&ehci_debug->control);
+	ctrl = dbgp_len_update(ctrl, size);
+	ctrl |= DBGP_OUT;
+	ctrl |= DBGP_GO;
+
+	dbgp_set_data(bytes, size);
+	writel(addr, &ehci_debug->address);
+	writel(pids, &ehci_debug->pids);
+
+	ret = dbgp_wait_until_done(ctrl);
+	if (ret < 0)
+		return ret;
+
+	return ret;
+}
+
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+				 int size)
+{
+	u32 pids, addr, ctrl;
+	int ret;
+
+	if (size > DBGP_MAX_PACKET)
+		return -1;
+
+	addr = DBGP_EPADDR(devnum, endpoint);
+
+	pids = readl(&ehci_debug->pids);
+	pids = dbgp_pid_update(pids, USB_PID_IN);
+
+	ctrl = readl(&ehci_debug->control);
+	ctrl = dbgp_len_update(ctrl, size);
+	ctrl &= ~DBGP_OUT;
+	ctrl |= DBGP_GO;
+
+	writel(addr, &ehci_debug->address);
+	writel(pids, &ehci_debug->pids);
+	ret = dbgp_wait_until_done(ctrl);
+	if (ret < 0)
+		return ret;
+
+	if (size > ret)
+		size = ret;
+	dbgp_get_data(data, size);
+	return ret;
+}
+
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+	int request, int value, int index, void *data, int size)
+{
+	u32 pids, addr, ctrl;
+	struct usb_ctrlrequest req;
+	int read;
+	int ret;
+
+	read = (requesttype & USB_DIR_IN) != 0;
+	if (size > (read ? DBGP_MAX_PACKET:0))
+		return -1;
+
+	/* Compute the control message */
+	req.bRequestType = requesttype;
+	req.bRequest = request;
+	req.wValue = cpu_to_le16(value);
+	req.wIndex = cpu_to_le16(index);
+	req.wLength = cpu_to_le16(size);
+
+	pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
+	addr = DBGP_EPADDR(devnum, 0);
+
+	ctrl = readl(&ehci_debug->control);
+	ctrl = dbgp_len_update(ctrl, sizeof(req));
+	ctrl |= DBGP_OUT;
+	ctrl |= DBGP_GO;
+
+	/* Send the setup message */
+	dbgp_set_data(&req, sizeof(req));
+	writel(addr, &ehci_debug->address);
+	writel(pids, &ehci_debug->pids);
+	ret = dbgp_wait_until_done(ctrl);
+	if (ret < 0)
+		return ret;
+
+	/* Read the result */
+	return dbgp_bulk_read(devnum, 0, data, size);
+}
+
+
+/* Find a PCI capability */
+static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
+{
+	u8 pos;
+	int bytes;
+
+	if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+		PCI_STATUS_CAP_LIST))
+		return 0;
+
+	pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
+		u8 id;
+
+		pos &= ~3;
+		id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
+		if (id == 0xff)
+			break;
+		if (id == cap)
+			return pos;
+
+		pos = read_pci_config_byte(num, slot, func,
+						 pos+PCI_CAP_LIST_NEXT);
+	}
+	return 0;
+}
+
+static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
+{
+	u32 class;
+
+	class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+	if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
+		return 0;
+
+	return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
+}
+
+static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
+{
+	u32 bus, slot, func;
+
+	for (bus = 0; bus < 256; bus++) {
+		for (slot = 0; slot < 32; slot++) {
+			for (func = 0; func < 8; func++) {
+				unsigned cap;
+
+				cap = __find_dbgp(bus, slot, func);
+
+				if (!cap)
+					continue;
+				if (ehci_num-- != 0)
+					continue;
+				*rbus = bus;
+				*rslot = slot;
+				*rfunc = func;
+				return cap;
+			}
+		}
+	}
+	return 0;
+}
+
+static int __init ehci_reset_port(int port)
+{
+	u32 portsc;
+	u32 delay_time, delay;
+	int loop;
+
+	/* Reset the usb debug port */
+	portsc = readl(&ehci_regs->port_status[port - 1]);
+	portsc &= ~PORT_PE;
+	portsc |= PORT_RESET;
+	writel(portsc, &ehci_regs->port_status[port - 1]);
+
+	delay = HUB_ROOT_RESET_TIME;
+	for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
+	     delay_time += delay) {
+		dbgp_mdelay(delay);
+
+		portsc = readl(&ehci_regs->port_status[port - 1]);
+		if (portsc & PORT_RESET) {
+			/* force reset to complete */
+			loop = 2;
+			writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
+				&ehci_regs->port_status[port - 1]);
+			do {
+				portsc = readl(&ehci_regs->port_status[port-1]);
+			} while ((portsc & PORT_RESET) && (--loop > 0));
+		}
+
+		/* Device went away? */
+		if (!(portsc & PORT_CONNECT))
+			return -ENOTCONN;
+
+		/* bomb out completely if something weird happend */
+		if ((portsc & PORT_CSC))
+			return -EINVAL;
+
+		/* If we've finished resetting, then break out of the loop */
+		if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
+			return 0;
+	}
+	return -EBUSY;
+}
+
+static int __init ehci_wait_for_port(int port)
+{
+	u32 status;
+	int ret, reps;
+
+	for (reps = 0; reps < 3; reps++) {
+		dbgp_mdelay(100);
+		status = readl(&ehci_regs->status);
+		if (status & STS_PCD) {
+			ret = ehci_reset_port(port);
+			if (ret == 0)
+				return 0;
+		}
+	}
+	return -ENOTCONN;
+}
+
+#ifdef DBGP_DEBUG
+# define dbgp_printk early_printk
+#else
+static inline void dbgp_printk(const char *fmt, ...) { }
+#endif
+
+typedef void (*set_debug_port_t)(int port);
+
+static void __init default_set_debug_port(int port)
+{
+}
+
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
+
+static void __init nvidia_set_debug_port(int port)
+{
+	u32 dword;
+	dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+				 0x74);
+	dword &= ~(0x0f<<12);
+	dword |= ((port & 0x0f)<<12);
+	write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
+				 dword);
+	dbgp_printk("set debug port to %d\n", port);
+}
+
+static void __init detect_set_debug_port(void)
+{
+	u32 vendorid;
+
+	vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+		 0x00);
+
+	if ((vendorid & 0xffff) == 0x10de) {
+		dbgp_printk("using nvidia set_debug_port\n");
+		set_debug_port = nvidia_set_debug_port;
+	}
+}
+
+static int __init ehci_setup(void)
+{
+	struct usb_debug_descriptor dbgp_desc;
+	u32 cmd, ctrl, status, portsc, hcs_params;
+	u32 debug_port, new_debug_port = 0, n_ports;
+	u32  devnum;
+	int ret, i;
+	int loop;
+	int port_map_tried;
+	int playtimes = 3;
+
+try_next_time:
+	port_map_tried = 0;
+
+try_next_port:
+
+	hcs_params = readl(&ehci_caps->hcs_params);
+	debug_port = HCS_DEBUG_PORT(hcs_params);
+	n_ports    = HCS_N_PORTS(hcs_params);
+
+	dbgp_printk("debug_port: %d\n", debug_port);
+	dbgp_printk("n_ports:    %d\n", n_ports);
+
+	for (i = 1; i <= n_ports; i++) {
+		portsc = readl(&ehci_regs->port_status[i-1]);
+		dbgp_printk("portstatus%d: %08x\n", i, portsc);
+	}
+
+	if (port_map_tried && (new_debug_port != debug_port)) {
+		if (--playtimes) {
+			set_debug_port(new_debug_port);
+			goto try_next_time;
+		}
+		return -1;
+	}
+
+	loop = 100000;
+	/* Reset the EHCI controller */
+	cmd = readl(&ehci_regs->command);
+	cmd |= CMD_RESET;
+	writel(cmd, &ehci_regs->command);
+	do {
+		cmd = readl(&ehci_regs->command);
+	} while ((cmd & CMD_RESET) && (--loop > 0));
+
+	if (!loop) {
+		dbgp_printk("can not reset ehci\n");
+		return -1;
+	}
+	dbgp_printk("ehci reset done\n");
+
+	/* Claim ownership, but do not enable yet */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_OWNER;
+	ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
+	writel(ctrl, &ehci_debug->control);
+
+	/* Start the ehci running */
+	cmd = readl(&ehci_regs->command);
+	cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
+	cmd |= CMD_RUN;
+	writel(cmd, &ehci_regs->command);
+
+	/* Ensure everything is routed to the EHCI */
+	writel(FLAG_CF, &ehci_regs->configured_flag);
+
+	/* Wait until the controller is no longer halted */
+	loop = 10;
+	do {
+		status = readl(&ehci_regs->status);
+	} while ((status & STS_HALT) && (--loop > 0));
+
+	if (!loop) {
+		dbgp_printk("ehci can be started\n");
+		return -1;
+	}
+	dbgp_printk("ehci started\n");
+
+	/* Wait for a device to show up in the debug port */
+	ret = ehci_wait_for_port(debug_port);
+	if (ret < 0) {
+		dbgp_printk("No device found in debug port\n");
+		goto next_debug_port;
+	}
+	dbgp_printk("ehci wait for port done\n");
+
+	/* Enable the debug port */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_CLAIM;
+	writel(ctrl, &ehci_debug->control);
+	ctrl = readl(&ehci_debug->control);
+	if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
+		dbgp_printk("No device in debug port\n");
+		writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
+		goto err;
+	}
+	dbgp_printk("debug ported enabled\n");
+
+	/* Completely transfer the debug device to the debug controller */
+	portsc = readl(&ehci_regs->port_status[debug_port - 1]);
+	portsc &= ~PORT_PE;
+	writel(portsc, &ehci_regs->port_status[debug_port - 1]);
+
+	dbgp_mdelay(100);
+
+	/* Find the debug device and make it device number 127 */
+	for (devnum = 0; devnum <= 127; devnum++) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
+			&dbgp_desc, sizeof(dbgp_desc));
+		if (ret > 0)
+			break;
+	}
+	if (devnum > 127) {
+		dbgp_printk("Could not find attached debug device\n");
+		goto err;
+	}
+	if (ret < 0) {
+		dbgp_printk("Attached device is not a debug device\n");
+		goto err;
+	}
+	dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
+
+	/* Move the device to 127 if it isn't already there */
+	if (devnum != USB_DEBUG_DEVNUM) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
+		if (ret < 0) {
+			dbgp_printk("Could not move attached device to %d\n",
+				USB_DEBUG_DEVNUM);
+			goto err;
+		}
+		devnum = USB_DEBUG_DEVNUM;
+		dbgp_printk("debug device renamed to 127\n");
+	}
+
+	/* Enable the debug interface */
+	ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
+		USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+		USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
+	if (ret < 0) {
+		dbgp_printk(" Could not enable the debug device\n");
+		goto err;
+	}
+	dbgp_printk("debug interface enabled\n");
+
+	/* Perform a small write to get the even/odd data state in sync
+	 */
+	ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
+	if (ret < 0) {
+		dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
+		goto err;
+	}
+	dbgp_printk("small write doned\n");
+
+	return 0;
+err:
+	/* Things didn't work so remove my claim */
+	ctrl = readl(&ehci_debug->control);
+	ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
+	writel(ctrl, &ehci_debug->control);
+	return -1;
+
+next_debug_port:
+	port_map_tried |= (1<<(debug_port - 1));
+	new_debug_port = ((debug_port-1+1)%n_ports) + 1;
+	if (port_map_tried != ((1<<n_ports) - 1)) {
+		set_debug_port(new_debug_port);
+		goto try_next_port;
+	}
+	if (--playtimes) {
+		set_debug_port(new_debug_port);
+		goto try_next_time;
+	}
+
+	return -1;
+}
+
+int __init early_dbgp_init(char *s)
+{
+	u32 debug_port, bar, offset;
+	u32 bus, slot, func, cap;
+	void __iomem *ehci_bar;
+	u32 dbgp_num;
+	u32 bar_val;
+	char *e;
+	int ret;
+	u8 byte;
+
+	if (!early_pci_allowed())
+		return -1;
+
+	dbgp_num = 0;
+	if (*s)
+		dbgp_num = simple_strtoul(s, &e, 10);
+	dbgp_printk("dbgp_num: %d\n", dbgp_num);
+
+	cap = find_dbgp(dbgp_num, &bus, &slot, &func);
+	if (!cap)
+		return -1;
+
+	dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
+			 func);
+
+	debug_port = read_pci_config(bus, slot, func, cap);
+	bar = (debug_port >> 29) & 0x7;
+	bar = (bar * 4) + 0xc;
+	offset = (debug_port >> 16) & 0xfff;
+	dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
+	if (bar != PCI_BASE_ADDRESS_0) {
+		dbgp_printk("only debug ports on bar 1 handled.\n");
+
+		return -1;
+	}
+
+	bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+	dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
+	if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
+		dbgp_printk("only simple 32bit mmio bars supported\n");
+
+		return -1;
+	}
+
+	/* double check if the mem space is enabled */
+	byte = read_pci_config_byte(bus, slot, func, 0x04);
+	if (!(byte & 0x2)) {
+		byte  |= 0x02;
+		write_pci_config_byte(bus, slot, func, 0x04, byte);
+		dbgp_printk("mmio for ehci enabled\n");
+	}
+
+	/*
+	 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
+	 * than enough.  1K is the biggest I have seen.
+	 */
+	set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
+	ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
+	ehci_bar += bar_val & ~PAGE_MASK;
+	dbgp_printk("ehci_bar: %p\n", ehci_bar);
+
+	ehci_caps  = ehci_bar;
+	ehci_regs  = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
+	ehci_debug = ehci_bar + offset;
+	ehci_dev.bus = bus;
+	ehci_dev.slot = slot;
+	ehci_dev.func = func;
+
+	detect_set_debug_port();
+
+	ret = ehci_setup();
+	if (ret < 0) {
+		dbgp_printk("ehci_setup failed\n");
+		ehci_debug = NULL;
+
+		return -1;
+	}
+
+	return 0;
+}
+
+static void early_dbgp_write(struct console *con, const char *str, u32 n)
+{
+	int chunk, ret;
+
+	if (!ehci_debug)
+		return;
+	while (n > 0) {
+		chunk = n;
+		if (chunk > DBGP_MAX_PACKET)
+			chunk = DBGP_MAX_PACKET;
+		ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
+			dbgp_endpoint_out, str, chunk);
+		str += chunk;
+		n -= chunk;
+	}
+}
+
+struct console early_dbgp_console = {
+	.name =		"earlydbg",
+	.write =	early_dbgp_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+
diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h
index 4c4b701ff265..1deac2a7b12e 100644
--- a/include/linux/usb/ehci_def.h
+++ b/include/linux/usb/ehci_def.h
@@ -170,4 +170,10 @@ struct ehci_dbg_port {
 #define DBGP_EPADDR(dev, ep)	(((dev)<<8)|(ep))
 } __attribute__ ((packed));
 
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+#include <linux/init.h>
+extern int __init early_dbgp_init(char *s);
+extern struct console early_dbgp_console;
+#endif /* CONFIG_EARLY_PRINTK_DBGP */
+
 #endif /* __LINUX_USB_EHCI_DEF_H */
-- 
cgit v1.2.3-71-gd317


From 917778267fbe67703ab7d5c6f0b7a05d4c3df485 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 Aug 2009 15:39:53 -0500
Subject: USB: ehci-dbgp: stability improvements and external re-init

This patch implements several changes:

1) Improve the capability to debug the dbgp driver

   The dbgp_ehci_status() was added in a number of places to report
   the critical ehci registers to diagnose the cause of a failure of
   the ehci-dbgp driver.

2) Capability to survive the host controller initialization

   The dbgp_external_startup(), dbgp_not_safe, and dbgp_phys_port were
   added so as to allow the ehci-dbgp to re-initialize after the ehci
   host controller is reset by the standard host controller driver.
   This same routine is common for the early startup or
   re-initialization.

   This resulted in the need to move some of the initialization code
   out of the __init section because the ehci driver has the
   possibility to be loaded later on as a kernel module.

3) Stability improvements for device initialization

   The device enumeration from 0 to 127 has the possibility to fail
   the first time after a warm reset on some older EHCI debug
   controllers.  The enumeration will be tried up to 3 times to
   account for this failure case.

   The dbg_wait_until_complete() was changed to wait up to 250 ms
   before failing which only comes into play during device
   initialization. The maximum delay will never get hit during the
   course of normal operation of the driver, unless the device got
   unplugged or there was a ehci controller failure, in which case the
   dbgp device driver will shut itself down.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: dbrownell@users.sourceforge.net
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/early/ehci-dbgp.c | 442 ++++++++++++++++++++++++++++--------------
 include/linux/usb/ehci_def.h  |   5 +
 2 files changed, 299 insertions(+), 148 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c
index 6198ebded3a4..06e05ea17871 100644
--- a/drivers/usb/early/ehci-dbgp.c
+++ b/drivers/usb/early/ehci-dbgp.c
@@ -1,5 +1,19 @@
+/*
+ * Standalone EHCI usb debug driver
+ *
+ * Originally written by:
+ *  Eric W. Biederman" <ebiederm@xmission.com> and
+ *  Yinghai Lu <yhlu.kernel@gmail.com>
+ *
+ * Changes for early/late printk and HW errata:
+ *  Jason Wessel <jason.wessel@windriver.com>
+ *  Copyright (C) 2009 Wind River Systems, Inc.
+ *
+ */
+
 #include <linux/console.h>
 #include <linux/errno.h>
+#include <linux/module.h>
 #include <linux/pci_regs.h>
 #include <linux/pci_ids.h>
 #include <linux/usb/ch9.h>
@@ -9,15 +23,37 @@
 #include <asm/pci-direct.h>
 #include <asm/fixmap.h>
 
-#ifdef DBGP_DEBUG
-# define dbgp_printk printk
-#else
-static inline void dbgp_printk(const char *fmt, ...) { }
-#endif
+/* The code here is intended to talk directly to the EHCI debug port
+ * and does not require that you have any kind of USB host controller
+ * drivers or USB device drivers compiled into the kernel.
+ *
+ * If you make a change to anything in here, the following test cases
+ * need to pass where a USB debug device works in the following
+ * configurations.
+ *
+ * 1. boot args:  earlyprintk=dbgp
+ *     o kernel compiled with # CONFIG_USB_EHCI_HCD is not set
+ *     o kernel compiled with CONFIG_USB_EHCI_HCD=y
+ * 2. boot args: earlyprintk=dbgp,keep
+ *     o kernel compiled with # CONFIG_USB_EHCI_HCD is not set
+ *     o kernel compiled with CONFIG_USB_EHCI_HCD=y
+ * 3. boot args: earlyprintk=dbgp console=ttyUSB0
+ *     o kernel has CONFIG_USB_EHCI_HCD=y and
+ *       CONFIG_USB_SERIAL_DEBUG=y
+ * 4. boot args: earlyprintk=vga,dbgp
+ *     o kernel compiled with # CONFIG_USB_EHCI_HCD is not set
+ *     o kernel compiled with CONFIG_USB_EHCI_HCD=y
+ *
+ * For the 4th configuration you can turn on or off the DBGP_DEBUG
+ * such that you can debug the dbgp device's driver code.
+ */
+
+static int dbgp_phys_port = 1;
 
 static struct ehci_caps __iomem *ehci_caps;
 static struct ehci_regs __iomem *ehci_regs;
 static struct ehci_dbg_port __iomem *ehci_debug;
+static int dbgp_not_safe; /* Cannot use debug device during ehci reset */
 static unsigned int dbgp_endpoint_out;
 
 struct ehci_dev {
@@ -32,6 +68,26 @@ static struct ehci_dev ehci_dev;
 
 #define DBGP_DATA_TOGGLE	0x8800
 
+#ifdef DBGP_DEBUG
+#define dbgp_printk printk
+static void dbgp_ehci_status(char *str)
+{
+	if (!ehci_debug)
+		return;
+	dbgp_printk("dbgp: %s\n", str);
+	dbgp_printk("  Debug control: %08x", readl(&ehci_debug->control));
+	dbgp_printk("  ehci cmd     : %08x", readl(&ehci_regs->command));
+	dbgp_printk("  ehci conf flg: %08x\n",
+		    readl(&ehci_regs->configured_flag));
+	dbgp_printk("  ehci status  : %08x", readl(&ehci_regs->status));
+	dbgp_printk("  ehci portsc  : %08x\n",
+		    readl(&ehci_regs->port_status[dbgp_phys_port - 1]));
+}
+#else
+static inline void dbgp_ehci_status(char *str) { }
+static inline void dbgp_printk(const char *fmt, ...) { }
+#endif
+
 static inline u32 dbgp_pid_update(u32 x, u32 tok)
 {
 	return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
@@ -79,21 +135,23 @@ static inline u32 dbgp_len_update(u32 x, u32 len)
 #define HUB_RESET_TIMEOUT	500
 
 #define DBGP_MAX_PACKET		8
+#define DBGP_TIMEOUT		(250 * 1000)
 
 static int dbgp_wait_until_complete(void)
 {
 	u32 ctrl;
-	int loop = 0x100000;
+	int loop = DBGP_TIMEOUT;
 
 	do {
 		ctrl = readl(&ehci_debug->control);
 		/* Stop when the transaction is finished */
 		if (ctrl & DBGP_DONE)
 			break;
+		udelay(1);
 	} while (--loop > 0);
 
 	if (!loop)
-		return -1;
+		return -DBGP_TIMEOUT;
 
 	/*
 	 * Now that we have observed the completed transaction,
@@ -103,7 +161,7 @@ static int dbgp_wait_until_complete(void)
 	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
 }
 
-static void __init dbgp_mdelay(int ms)
+static inline void dbgp_mdelay(int ms)
 {
 	int i;
 
@@ -130,8 +188,17 @@ retry:
 	pids = readl(&ehci_debug->pids);
 	lpid = DBGP_PID_GET(pids);
 
-	if (ret < 0)
+	if (ret < 0) {
+		/* A -DBGP_TIMEOUT failure here means the device has
+		 * failed, perhaps because it was unplugged, in which
+		 * case we do not want to hang the system so the dbgp
+		 * will be marked as unsafe to use.  EHCI reset is the
+		 * only way to recover if you unplug the dbgp device.
+		 */
+		if (ret == -DBGP_TIMEOUT && !dbgp_not_safe)
+			dbgp_not_safe = 1;
 		return ret;
+	}
 
 	/*
 	 * If the port is getting full or it has dropped data
@@ -149,7 +216,7 @@ retry:
 	return ret;
 }
 
-static void dbgp_set_data(const void *buf, int size)
+static inline void dbgp_set_data(const void *buf, int size)
 {
 	const unsigned char *bytes = buf;
 	u32 lo, hi;
@@ -164,7 +231,7 @@ static void dbgp_set_data(const void *buf, int size)
 	writel(hi, &ehci_debug->data47);
 }
 
-static void __init dbgp_get_data(void *buf, int size)
+static inline void dbgp_get_data(void *buf, int size)
 {
 	unsigned char *bytes = buf;
 	u32 lo, hi;
@@ -208,7 +275,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
 	return ret;
 }
 
-static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
 				 int size)
 {
 	u32 pids, addr, ctrl;
@@ -239,7 +306,7 @@ static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
 	return ret;
 }
 
-static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+static int dbgp_control_msg(unsigned devnum, int requesttype,
 	int request, int value, int index, void *data, int size)
 {
 	u32 pids, addr, ctrl;
@@ -342,13 +409,179 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
 	return 0;
 }
 
+static int dbgp_ehci_startup(void)
+{
+	u32 ctrl, cmd, status;
+	int loop;
+
+	/* Claim ownership, but do not enable yet */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_OWNER;
+	ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
+	writel(ctrl, &ehci_debug->control);
+	udelay(1);
+
+	dbgp_ehci_status("EHCI startup");
+	/* Start the ehci running */
+	cmd = readl(&ehci_regs->command);
+	cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
+	cmd |= CMD_RUN;
+	writel(cmd, &ehci_regs->command);
+
+	/* Ensure everything is routed to the EHCI */
+	writel(FLAG_CF, &ehci_regs->configured_flag);
+
+	/* Wait until the controller is no longer halted */
+	loop = 10;
+	do {
+		status = readl(&ehci_regs->status);
+		if (!(status & STS_HALT))
+			break;
+		udelay(1);
+	} while (--loop > 0);
+
+	if (!loop) {
+		dbgp_printk("ehci can not be started\n");
+		return -ENODEV;
+	}
+	dbgp_printk("ehci started\n");
+	return 0;
+}
+
+static int dbgp_ehci_controller_reset(void)
+{
+	int loop = 250 * 1000;
+	u32 cmd;
+
+	/* Reset the EHCI controller */
+	cmd = readl(&ehci_regs->command);
+	cmd |= CMD_RESET;
+	writel(cmd, &ehci_regs->command);
+	do {
+		cmd = readl(&ehci_regs->command);
+	} while ((cmd & CMD_RESET) && (--loop > 0));
+
+	if (!loop) {
+		dbgp_printk("can not reset ehci\n");
+		return -1;
+	}
+	dbgp_ehci_status("ehci reset done");
+	return 0;
+}
+static int ehci_wait_for_port(int port);
+/* Return 0 on success
+ * Return -ENODEV for any general failure
+ * Return -EIO if wait for port fails
+ */
+int dbgp_external_startup(void)
+{
+	int devnum;
+	struct usb_debug_descriptor dbgp_desc;
+	int ret;
+	u32 ctrl, portsc;
+	int dbg_port = dbgp_phys_port;
+	int tries = 3;
+
+	ret = dbgp_ehci_startup();
+	if (ret)
+		return ret;
+
+	/* Wait for a device to show up in the debug port */
+	ret = ehci_wait_for_port(dbg_port);
+	if (ret < 0) {
+		portsc = readl(&ehci_regs->port_status[dbg_port - 1]);
+		dbgp_printk("No device found in debug port\n");
+		return -EIO;
+	}
+	dbgp_ehci_status("wait for port done");
+
+	/* Enable the debug port */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_CLAIM;
+	writel(ctrl, &ehci_debug->control);
+	ctrl = readl(&ehci_debug->control);
+	if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
+		dbgp_printk("No device in debug port\n");
+		writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
+		return -ENODEV;
+	}
+	dbgp_ehci_status("debug ported enabled");
+
+	/* Completely transfer the debug device to the debug controller */
+	portsc = readl(&ehci_regs->port_status[dbg_port - 1]);
+	portsc &= ~PORT_PE;
+	writel(portsc, &ehci_regs->port_status[dbg_port - 1]);
+
+	dbgp_mdelay(100);
+
+try_again:
+	/* Find the debug device and make it device number 127 */
+	for (devnum = 0; devnum <= 127; devnum++) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
+			&dbgp_desc, sizeof(dbgp_desc));
+		if (ret > 0)
+			break;
+	}
+	if (devnum > 127) {
+		dbgp_printk("Could not find attached debug device\n");
+		goto err;
+	}
+	if (ret < 0) {
+		dbgp_printk("Attached device is not a debug device\n");
+		goto err;
+	}
+	dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
+
+	/* Move the device to 127 if it isn't already there */
+	if (devnum != USB_DEBUG_DEVNUM) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
+		if (ret < 0) {
+			dbgp_printk("Could not move attached device to %d\n",
+				USB_DEBUG_DEVNUM);
+			goto err;
+		}
+		devnum = USB_DEBUG_DEVNUM;
+		dbgp_printk("debug device renamed to 127\n");
+	}
+
+	/* Enable the debug interface */
+	ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
+		USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+		USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
+	if (ret < 0) {
+		dbgp_printk(" Could not enable the debug device\n");
+		goto err;
+	}
+	dbgp_printk("debug interface enabled\n");
+	/* Perform a small write to get the even/odd data state in sync
+	 */
+	ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
+	if (ret < 0) {
+		dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
+		goto err;
+	}
+	dbgp_printk("small write doned\n");
+	dbgp_not_safe = 0;
+
+	return 0;
+err:
+	if (tries--)
+		goto try_again;
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(dbgp_external_startup);
+
 static int __init ehci_reset_port(int port)
 {
 	u32 portsc;
 	u32 delay_time, delay;
 	int loop;
 
-	dbgp_printk("ehci_reset_port %i\n", port);
+	dbgp_ehci_status("reset port");
 	/* Reset the usb debug port */
 	portsc = readl(&ehci_regs->port_status[port - 1]);
 	portsc &= ~PORT_PE;
@@ -388,7 +621,7 @@ static int __init ehci_reset_port(int port)
 	return -EBUSY;
 }
 
-static int __init ehci_wait_for_port(int port)
+static int ehci_wait_for_port(int port)
 {
 	u32 status;
 	int ret, reps;
@@ -487,12 +720,9 @@ static void __init early_ehci_bios_handoff(void)
 
 static int __init ehci_setup(void)
 {
-	struct usb_debug_descriptor dbgp_desc;
-	u32 cmd, ctrl, status, portsc, hcs_params;
+	u32 ctrl, portsc, hcs_params;
 	u32 debug_port, new_debug_port = 0, n_ports;
-	u32  devnum;
 	int ret, i;
-	int loop;
 	int port_map_tried;
 	int playtimes = 3;
 
@@ -505,10 +735,12 @@ try_next_port:
 
 	hcs_params = readl(&ehci_caps->hcs_params);
 	debug_port = HCS_DEBUG_PORT(hcs_params);
+	dbgp_phys_port = debug_port;
 	n_ports    = HCS_N_PORTS(hcs_params);
 
 	dbgp_printk("debug_port: %d\n", debug_port);
 	dbgp_printk("n_ports:    %d\n", n_ports);
+	dbgp_ehci_status("");
 
 	for (i = 1; i <= n_ports; i++) {
 		portsc = readl(&ehci_regs->port_status[i-1]);
@@ -523,138 +755,27 @@ try_next_port:
 		return -1;
 	}
 
-	loop = 250 * 1000;
-	/* Reset the EHCI controller */
-	cmd = readl(&ehci_regs->command);
-	cmd |= CMD_RESET;
-	writel(cmd, &ehci_regs->command);
-	do {
-		cmd = readl(&ehci_regs->command);
-	} while ((cmd & CMD_RESET) && (--loop > 0));
-
-	if (!loop) {
-		dbgp_printk("can not reset ehci\n");
-		return -1;
-	}
-	dbgp_printk("ehci reset done\n");
-
-	/* Claim ownership, but do not enable yet */
-	ctrl = readl(&ehci_debug->control);
-	ctrl |= DBGP_OWNER;
-	ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
-	writel(ctrl, &ehci_debug->control);
-	udelay(1);
-
-	/* Start the ehci running */
-	cmd = readl(&ehci_regs->command);
-	cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
-	cmd |= CMD_RUN;
-	writel(cmd, &ehci_regs->command);
-
-	/* Ensure everything is routed to the EHCI */
-	writel(FLAG_CF, &ehci_regs->configured_flag);
-
-	/* Wait until the controller is no longer halted */
-	loop = 10;
-	do {
-		status = readl(&ehci_regs->status);
-		if (!(status & STS_HALT))
-			break;
-		udelay(1);
-	} while (--loop > 0);
-
-	if (!loop) {
-		dbgp_printk("ehci can not be started\n");
-		return -1;
+	/* Only reset the controller if it is not already in the
+	 * configured state */
+	if (!(readl(&ehci_regs->configured_flag) & FLAG_CF)) {
+		if (dbgp_ehci_controller_reset() != 0)
+			return -1;
+	} else {
+		dbgp_ehci_status("ehci skip - already configured");
 	}
-	dbgp_printk("ehci started\n");
 
-	/* Wait for a device to show up in the debug port */
-	ret = ehci_wait_for_port(debug_port);
-	if (ret < 0) {
-		dbgp_printk("No device found in debug port\n");
+	ret = dbgp_external_startup();
+	if (ret == -EIO)
 		goto next_debug_port;
-	}
-	dbgp_printk("ehci wait for port done\n");
-
-	/* Enable the debug port */
-	ctrl = readl(&ehci_debug->control);
-	ctrl |= DBGP_CLAIM;
-	writel(ctrl, &ehci_debug->control);
-	ctrl = readl(&ehci_debug->control);
-	if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
-		dbgp_printk("No device in debug port\n");
-		writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
-		goto err;
-	}
-	dbgp_printk("debug ported enabled\n");
 
-	/* Completely transfer the debug device to the debug controller */
-	portsc = readl(&ehci_regs->port_status[debug_port - 1]);
-	portsc &= ~PORT_PE;
-	writel(portsc, &ehci_regs->port_status[debug_port - 1]);
-
-	dbgp_mdelay(100);
-
-	/* Find the debug device and make it device number 127 */
-	for (devnum = 0; devnum <= 127; devnum++) {
-		ret = dbgp_control_msg(devnum,
-			USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
-			USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
-			&dbgp_desc, sizeof(dbgp_desc));
-		if (ret > 0)
-			break;
-	}
-	if (devnum > 127) {
-		dbgp_printk("Could not find attached debug device\n");
-		goto err;
-	}
 	if (ret < 0) {
-		dbgp_printk("Attached device is not a debug device\n");
-		goto err;
-	}
-	dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
-
-	/* Move the device to 127 if it isn't already there */
-	if (devnum != USB_DEBUG_DEVNUM) {
-		ret = dbgp_control_msg(devnum,
-			USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
-			USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
-		if (ret < 0) {
-			dbgp_printk("Could not move attached device to %d\n",
-				USB_DEBUG_DEVNUM);
-			goto err;
-		}
-		devnum = USB_DEBUG_DEVNUM;
-		dbgp_printk("debug device renamed to 127\n");
-	}
-
-	/* Enable the debug interface */
-	ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
-		USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
-		USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
-	if (ret < 0) {
-		dbgp_printk(" Could not enable the debug device\n");
-		goto err;
-	}
-	dbgp_printk("debug interface enabled\n");
-
-	/* Perform a small write to get the even/odd data state in sync
-	 */
-	ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
-	if (ret < 0) {
-		dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
-		goto err;
+		/* Things didn't work so remove my claim */
+		ctrl = readl(&ehci_debug->control);
+		ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
+		writel(ctrl, &ehci_debug->control);
+		return -1;
 	}
-	dbgp_printk("small write doned\n");
-
 	return 0;
-err:
-	/* Things didn't work so remove my claim */
-	ctrl = readl(&ehci_debug->control);
-	ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
-	writel(ctrl, &ehci_debug->control);
-	return -1;
 
 next_debug_port:
 	port_map_tried |= (1<<(debug_port - 1));
@@ -749,6 +870,7 @@ int __init early_dbgp_init(char *s)
 
 		return -1;
 	}
+	dbgp_ehci_status("early_init_complete");
 
 	return 0;
 }
@@ -758,9 +880,27 @@ static void early_dbgp_write(struct console *con, const char *str, u32 n)
 	int chunk, ret;
 	char buf[DBGP_MAX_PACKET];
 	int use_cr = 0;
+	u32 cmd, ctrl;
+	int reset_run = 0;
 
-	if (!ehci_debug)
+	if (!ehci_debug || dbgp_not_safe)
 		return;
+
+	cmd = readl(&ehci_regs->command);
+	if (unlikely(!(cmd & CMD_RUN))) {
+		/* If the ehci controller is not in the run state do extended
+		 * checks to see if the acpi or some other initialization also
+		 * reset the ehci debug port */
+		ctrl = readl(&ehci_debug->control);
+		if (!(ctrl & DBGP_ENABLED)) {
+			dbgp_not_safe = 1;
+			dbgp_external_startup();
+		} else {
+			cmd |= CMD_RUN;
+			writel(cmd, &ehci_regs->command);
+			reset_run = 1;
+		}
+	}
 	while (n > 0) {
 		for (chunk = 0; chunk < DBGP_MAX_PACKET && n > 0;
 		     str++, chunk++, n--) {
@@ -775,8 +915,15 @@ static void early_dbgp_write(struct console *con, const char *str, u32 n)
 				use_cr = 0;
 			buf[chunk] = *str;
 		}
-		ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
-			dbgp_endpoint_out, buf, chunk);
+		if (chunk > 0) {
+			ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
+				      dbgp_endpoint_out, buf, chunk);
+		}
+	}
+	if (unlikely(reset_run)) {
+		cmd = readl(&ehci_regs->command);
+		cmd &= ~CMD_RUN;
+		writel(cmd, &ehci_regs->command);
 	}
 }
 
@@ -786,4 +933,3 @@ struct console early_dbgp_console = {
 	.flags =	CON_PRINTBUFFER,
 	.index =	-1,
 };
-
diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h
index 1deac2a7b12e..07851e05d763 100644
--- a/include/linux/usb/ehci_def.h
+++ b/include/linux/usb/ehci_def.h
@@ -176,4 +176,9 @@ extern int __init early_dbgp_init(char *s);
 extern struct console early_dbgp_console;
 #endif /* CONFIG_EARLY_PRINTK_DBGP */
 
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+/* Call backs from ehci host driver to ehci debug driver */
+extern int dbgp_external_startup(void);
+#endif
+
 #endif /* __LINUX_USB_EHCI_DEF_H */
-- 
cgit v1.2.3-71-gd317


From 8d053c79f22462f55c02c8083580730b922cf7b4 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 Aug 2009 15:39:54 -0500
Subject: USB: ehci-dbgp,ehci: Allow early or late use of the dbgp device

If the EHCI debug port is initialized and in use, the EHCI host
controller driver must follow two rules.

1) If the EHCI host driver issues a controller reset, the debug
   controller driver re-initialization must get called after the reset
   is completed.

2) The EHCI host driver should ignore any requests to the physical
   EHCI debug port when the EHCI debug port is in use.

The code to check for the debug port was moved from ehci_pci_reinit()
to ehci_pci_setup because it must get called prior to ehci_reset()
which will clear the debug port registers.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: dbrownell@users.sourceforge.net
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/early/ehci-dbgp.c | 23 +++++++++++++++++++++++
 drivers/usb/host/ehci-hcd.c   |  8 ++++++++
 drivers/usb/host/ehci-hub.c   | 10 ++++++++++
 drivers/usb/host/ehci-pci.c   | 39 +++++++++++++++++++--------------------
 include/linux/usb/ehci_def.h  | 10 ++++++++++
 5 files changed, 70 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c
index 06e05ea17871..b88cb65b64e0 100644
--- a/drivers/usb/early/ehci-dbgp.c
+++ b/drivers/usb/early/ehci-dbgp.c
@@ -933,3 +933,26 @@ struct console early_dbgp_console = {
 	.flags =	CON_PRINTBUFFER,
 	.index =	-1,
 };
+
+int dbgp_reset_prep(void)
+{
+	u32 ctrl;
+
+	dbgp_not_safe = 1;
+	if (!ehci_debug)
+		return 0;
+
+	if (early_dbgp_console.index != -1 &&
+		!(early_dbgp_console.flags & CON_BOOT))
+		return 1;
+	/* This means the console is not initialized, or should get
+	 * shutdown so as to allow for reuse of the usb device, which
+	 * means it is time to shutdown the usb debug port. */
+	ctrl = readl(&ehci_debug->control);
+	if (ctrl & DBGP_ENABLED) {
+		ctrl &= ~(DBGP_CLAIM);
+		writel(ctrl, &ehci_debug->control);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dbgp_reset_prep);
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 4f89d7ffd53a..9835e0713943 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -240,6 +240,11 @@ static int ehci_reset (struct ehci_hcd *ehci)
 	int	retval;
 	u32	command = ehci_readl(ehci, &ehci->regs->command);
 
+	/* If the EHCI debug controller is active, special care must be
+	 * taken before and after a host controller reset */
+	if (ehci->debug && !dbgp_reset_prep())
+		ehci->debug = NULL;
+
 	command |= CMD_RESET;
 	dbg_cmd (ehci, "reset", command);
 	ehci_writel(ehci, command, &ehci->regs->command);
@@ -260,6 +265,9 @@ static int ehci_reset (struct ehci_hcd *ehci)
 	if (ehci_is_TDI(ehci))
 		tdi_reset (ehci);
 
+	if (ehci->debug)
+		dbgp_external_startup();
+
 	return retval;
 }
 
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index 818647c33da8..6b5e4d18d4bf 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -855,6 +855,15 @@ static int ehci_hub_control (
 	case SetPortFeature:
 		selector = wIndex >> 8;
 		wIndex &= 0xff;
+		if (unlikely(ehci->debug)) {
+			/* If the debug port is active any port
+			 * feature requests should get denied */
+			if (wIndex == HCS_DEBUG_PORT(ehci->hcs_params) &&
+			    (readl(&ehci->debug->control) & DBGP_ENABLED)) {
+				retval = -ENODEV;
+				goto error_exit;
+			}
+		}
 		if (!wIndex || wIndex > ports)
 			goto error;
 		wIndex--;
@@ -951,6 +960,7 @@ error:
 		/* "stall" on error */
 		retval = -EPIPE;
 	}
+error_exit:
 	spin_unlock_irqrestore (&ehci->lock, flags);
 	return retval;
 }
diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c
index a88ad517ec5c..378861b9d79a 100644
--- a/drivers/usb/host/ehci-pci.c
+++ b/drivers/usb/host/ehci-pci.c
@@ -27,28 +27,8 @@
 /* called after powerup, by probe or system-pm "wakeup" */
 static int ehci_pci_reinit(struct ehci_hcd *ehci, struct pci_dev *pdev)
 {
-	u32			temp;
 	int			retval;
 
-	/* optional debug port, normally in the first BAR */
-	temp = pci_find_capability(pdev, 0x0a);
-	if (temp) {
-		pci_read_config_dword(pdev, temp, &temp);
-		temp >>= 16;
-		if ((temp & (3 << 13)) == (1 << 13)) {
-			temp &= 0x1fff;
-			ehci->debug = ehci_to_hcd(ehci)->regs + temp;
-			temp = ehci_readl(ehci, &ehci->debug->control);
-			ehci_info(ehci, "debug port %d%s\n",
-				HCS_DEBUG_PORT(ehci->hcs_params),
-				(temp & DBGP_ENABLED)
-					? " IN USE"
-					: "");
-			if (!(temp & DBGP_ENABLED))
-				ehci->debug = NULL;
-		}
-	}
-
 	/* we expect static quirk code to handle the "extended capabilities"
 	 * (currently just BIOS handoff) allowed starting with EHCI 0.96
 	 */
@@ -195,6 +175,25 @@ static int ehci_pci_setup(struct usb_hcd *hcd)
 		break;
 	}
 
+	/* optional debug port, normally in the first BAR */
+	temp = pci_find_capability(pdev, 0x0a);
+	if (temp) {
+		pci_read_config_dword(pdev, temp, &temp);
+		temp >>= 16;
+		if ((temp & (3 << 13)) == (1 << 13)) {
+			temp &= 0x1fff;
+			ehci->debug = ehci_to_hcd(ehci)->regs + temp;
+			temp = ehci_readl(ehci, &ehci->debug->control);
+			ehci_info(ehci, "debug port %d%s\n",
+				HCS_DEBUG_PORT(ehci->hcs_params),
+				(temp & DBGP_ENABLED)
+					? " IN USE"
+					: "");
+			if (!(temp & DBGP_ENABLED))
+				ehci->debug = NULL;
+		}
+	}
+
 	ehci_reset(ehci);
 
 	/* at least the Genesys GL880S needs fixup here */
diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h
index 07851e05d763..1909d924f816 100644
--- a/include/linux/usb/ehci_def.h
+++ b/include/linux/usb/ehci_def.h
@@ -179,6 +179,16 @@ extern struct console early_dbgp_console;
 #ifdef CONFIG_EARLY_PRINTK_DBGP
 /* Call backs from ehci host driver to ehci debug driver */
 extern int dbgp_external_startup(void);
+extern int dbgp_reset_prep(void);
+#else
+static inline int dbgp_reset_prep(void)
+{
+	return 1;
+}
+static inline int dbgp_external_startup(void)
+{
+	return -1;
+}
 #endif
 
 #endif /* __LINUX_USB_EHCI_DEF_H */
-- 
cgit v1.2.3-71-gd317


From aab2d4086a1876fcff282aa36e2d4a92aa9935c9 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 Aug 2009 15:39:55 -0500
Subject: USB: ehci-dbgp: errata for EHCI debug controller initialization

On some EHCI usb debug controllers, the EHCI debug device will fail to
be seen after a port reset, after a warm reset.  Two options exist to
get the device to initialize correctly.

Option 1 is to unplug and plug in the device.

Option 2 is to use the EHCI port test to get the usb debug device to
start talking again.  At that point the debug controller port reset
will succeed.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
CC: dbrownell@users.sourceforge.net
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/early/ehci-dbgp.c | 23 ++++++++++++++++++++++-
 include/linux/usb/ehci_def.h  |  1 +
 2 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c
index b88cb65b64e0..f0a41c647bef 100644
--- a/drivers/usb/early/ehci-dbgp.c
+++ b/drivers/usb/early/ehci-dbgp.c
@@ -478,10 +478,13 @@ int dbgp_external_startup(void)
 	int devnum;
 	struct usb_debug_descriptor dbgp_desc;
 	int ret;
-	u32 ctrl, portsc;
+	u32 ctrl, portsc, cmd;
 	int dbg_port = dbgp_phys_port;
 	int tries = 3;
+	int reset_port_tries = 1;
+	int try_hard_once = 1;
 
+try_port_reset_again:
 	ret = dbgp_ehci_startup();
 	if (ret)
 		return ret;
@@ -490,6 +493,24 @@ int dbgp_external_startup(void)
 	ret = ehci_wait_for_port(dbg_port);
 	if (ret < 0) {
 		portsc = readl(&ehci_regs->port_status[dbg_port - 1]);
+		if (!(portsc & PORT_CONNECT) && try_hard_once) {
+			/* Last ditch effort to try to force enable
+			 * the debug device by using the packet test
+			 * ehci command to try and wake it up. */
+			try_hard_once = 0;
+			cmd = readl(&ehci_regs->command);
+			cmd &= ~CMD_RUN;
+			writel(cmd, &ehci_regs->command);
+			portsc = readl(&ehci_regs->port_status[dbg_port - 1]);
+			portsc |= PORT_TEST_PKT;
+			writel(portsc, &ehci_regs->port_status[dbg_port - 1]);
+			dbgp_ehci_status("Trying to force debug port online");
+			mdelay(50);
+			dbgp_ehci_controller_reset();
+			goto try_port_reset_again;
+		} else if (reset_port_tries--) {
+			goto try_port_reset_again;
+		}
 		dbgp_printk("No device found in debug port\n");
 		return -EIO;
 	}
diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h
index 1909d924f816..af4b86f3aca3 100644
--- a/include/linux/usb/ehci_def.h
+++ b/include/linux/usb/ehci_def.h
@@ -105,6 +105,7 @@ struct ehci_regs {
 #define PORT_WKDISC_E	(1<<21)		/* wake on disconnect (enable) */
 #define PORT_WKCONN_E	(1<<20)		/* wake on connect (enable) */
 /* 19:16 for port testing */
+#define PORT_TEST_PKT	(0x4<<16)	/* Port Test Control - packet test */
 #define PORT_LED_OFF	(0<<14)
 #define PORT_LED_AMBER	(1<<14)
 #define PORT_LED_GREEN	(2<<14)
-- 
cgit v1.2.3-71-gd317


From 01c6460f968d7b57fc6f98adb587952628c6e099 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Tue, 1 Sep 2009 11:09:56 -0400
Subject: USB: usbfs: add USBDEVFS_URB_BULK_CONTINUATION flag

This patch (as1283) adds a new flag, USBDEVFS_URB_BULK_CONTINUATION,
to usbfs.  It is intended for userspace libraries such as libusb and
openusb.  When they have to break up a single usbfs bulk transfer into
multiple URBs, they will set the flag on all but the first URB of the
series.

If an error other than an unlink occurs, the kernel will automatically
cancel all the following URBs for the same endpoint and refuse to
accept new submissions, until an URB is encountered that is not marked
as a BULK_CONTINUATION.  Such an URB would indicate the start of a new
transfer or the presence of an older library, so the kernel returns to
normal operation.

This enables libraries to delimit bulk transfers correctly, even in
the presence of early termination as indicated by short packets.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/devio.c     | 78 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/usbdevice_fs.h |  1 +
 2 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 71514be8b715..181f78c84105 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -74,6 +74,7 @@ struct dev_state {
 	void __user *disccontext;
 	unsigned long ifclaimed;
 	u32 secid;
+	u32 disabled_bulk_eps;
 };
 
 struct async {
@@ -88,6 +89,8 @@ struct async {
 	struct urb *urb;
 	int status;
 	u32 secid;
+	u8 bulk_addr;
+	u8 bulk_status;
 };
 
 static int usbfs_snoop;
@@ -343,6 +346,43 @@ static void snoop_urb(struct usb_device *udev,
 	}
 }
 
+#define AS_CONTINUATION	1
+#define AS_UNLINK	2
+
+static void cancel_bulk_urbs(struct dev_state *ps, unsigned bulk_addr)
+__releases(ps->lock)
+__acquires(ps->lock)
+{
+	struct async *as;
+
+	/* Mark all the pending URBs that match bulk_addr, up to but not
+	 * including the first one without AS_CONTINUATION.  If such an
+	 * URB is encountered then a new transfer has already started so
+	 * the endpoint doesn't need to be disabled; otherwise it does.
+	 */
+	list_for_each_entry(as, &ps->async_pending, asynclist) {
+		if (as->bulk_addr == bulk_addr) {
+			if (as->bulk_status != AS_CONTINUATION)
+				goto rescan;
+			as->bulk_status = AS_UNLINK;
+			as->bulk_addr = 0;
+		}
+	}
+	ps->disabled_bulk_eps |= (1 << bulk_addr);
+
+	/* Now carefully unlink all the marked pending URBs */
+ rescan:
+	list_for_each_entry(as, &ps->async_pending, asynclist) {
+		if (as->bulk_status == AS_UNLINK) {
+			as->bulk_status = 0;		/* Only once */
+			spin_unlock(&ps->lock);		/* Allow completions */
+			usb_unlink_urb(as->urb);
+			spin_lock(&ps->lock);
+			goto rescan;
+		}
+	}
+}
+
 static void async_completed(struct urb *urb)
 {
 	struct async *as = urb->context;
@@ -371,6 +411,9 @@ static void async_completed(struct urb *urb)
 	snoop(&urb->dev->dev, "urb complete\n");
 	snoop_urb(urb->dev, as->userurb, urb->pipe, urb->actual_length,
 			as->status, COMPLETE);
+	if (as->status < 0 && as->bulk_addr && as->status != -ECONNRESET &&
+			as->status != -ENOENT)
+		cancel_bulk_urbs(ps, as->bulk_addr);
 	spin_unlock(&ps->lock);
 
 	if (signr)
@@ -993,6 +1036,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 
 	if (uurb->flags & ~(USBDEVFS_URB_ISO_ASAP |
 				USBDEVFS_URB_SHORT_NOT_OK |
+				USBDEVFS_URB_BULK_CONTINUATION |
 				USBDEVFS_URB_NO_FSBR |
 				USBDEVFS_URB_ZERO_PACKET |
 				USBDEVFS_URB_NO_INTERRUPT))
@@ -1194,7 +1238,39 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	snoop_urb(ps->dev, as->userurb, as->urb->pipe,
 			as->urb->transfer_buffer_length, 0, SUBMIT);
 	async_newpending(as);
-	if ((ret = usb_submit_urb(as->urb, GFP_KERNEL))) {
+
+	if (usb_endpoint_xfer_bulk(&ep->desc)) {
+		spin_lock_irq(&ps->lock);
+
+		/* Not exactly the endpoint address; the direction bit is
+		 * shifted to the 0x10 position so that the value will be
+		 * between 0 and 31.
+		 */
+		as->bulk_addr = usb_endpoint_num(&ep->desc) |
+			((ep->desc.bEndpointAddress & USB_ENDPOINT_DIR_MASK)
+				>> 3);
+
+		/* If this bulk URB is the start of a new transfer, re-enable
+		 * the endpoint.  Otherwise mark it as a continuation URB.
+		 */
+		if (uurb->flags & USBDEVFS_URB_BULK_CONTINUATION)
+			as->bulk_status = AS_CONTINUATION;
+		else
+			ps->disabled_bulk_eps &= ~(1 << as->bulk_addr);
+
+		/* Don't accept continuation URBs if the endpoint is
+		 * disabled because of an earlier error.
+		 */
+		if (ps->disabled_bulk_eps & (1 << as->bulk_addr))
+			ret = -EREMOTEIO;
+		else
+			ret = usb_submit_urb(as->urb, GFP_ATOMIC);
+		spin_unlock_irq(&ps->lock);
+	} else {
+		ret = usb_submit_urb(as->urb, GFP_KERNEL);
+	}
+
+	if (ret) {
 		dev_printk(KERN_DEBUG, &ps->dev->dev,
 			   "usbfs: usb_submit_urb returned %d\n", ret);
 		snoop_urb(ps->dev, as->userurb, as->urb->pipe,
diff --git a/include/linux/usbdevice_fs.h b/include/linux/usbdevice_fs.h
index 00ceebeb9e5c..b2a7d8ba6ee3 100644
--- a/include/linux/usbdevice_fs.h
+++ b/include/linux/usbdevice_fs.h
@@ -77,6 +77,7 @@ struct usbdevfs_connectinfo {
 
 #define USBDEVFS_URB_SHORT_NOT_OK	0x01
 #define USBDEVFS_URB_ISO_ASAP		0x02
+#define USBDEVFS_URB_BULK_CONTINUATION	0x04
 #define USBDEVFS_URB_NO_FSBR		0x20
 #define USBDEVFS_URB_ZERO_PACKET	0x40
 #define USBDEVFS_URB_NO_INTERRUPT	0x80
-- 
cgit v1.2.3-71-gd317


From fa081b00a80ef3f4575c99af6e97d29e1628cf51 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 22 Sep 2009 16:43:27 -0700
Subject: include/linux/kmemcheck.h: fix a trillion warnings

of the form

include/net/inet_sock.h:208: warning: ISO C90 forbids mixed declarations and code

Cc: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmemcheck.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
index c8006607f947..136cdcdf92ef 100644
--- a/include/linux/kmemcheck.h
+++ b/include/linux/kmemcheck.h
@@ -145,10 +145,12 @@ static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
 
 #define kmemcheck_annotate_bitfield(ptr, name)				\
 	do {								\
+		int _n;							\
+									\
 		if (!ptr)						\
 			break;						\
 									\
-		int _n = (long) &((ptr)->name##_end)			\
+		_n = (long) &((ptr)->name##_end)			\
 			- (long) &((ptr)->name##_begin);		\
 		BUILD_BUG_ON(_n < 0);					\
 									\
-- 
cgit v1.2.3-71-gd317


From af91322ef3f29ae4114e736e2a72e28b4d619cf9 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Tue, 22 Sep 2009 16:43:33 -0700
Subject: printk: add printk_delay to make messages readable for some scenarios

When syslog is not possible, at the same time there's no serial/net
console available, it will be hard to read the printk messages.  For
example oops/panic/warning messages in shutdown phase.

Add a printk delay feature, we can make each printk message delay some
milliseconds.

Setting the delay by proc/sysctl interface: /proc/sys/kernel/printk_delay

The value range from 0 - 10000, default value is 0

[akpm@linux-foundation.org: fix a few things]
Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt |  8 ++++++++
 include/linux/kernel.h          |  2 ++
 kernel/printk.c                 | 15 +++++++++++++++
 kernel/sysctl.c                 | 14 ++++++++++++++
 4 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 3e5b63ebb821..b3d8b4922740 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -313,6 +313,14 @@ send before ratelimiting kicks in.
 
 ==============================================================
 
+printk_delay:
+
+Delay each printk message in printk_delay milliseconds
+
+Value from 0 - 10000 is allowed.
+
+==============================================================
+
 randomize-va-space:
 
 This option can be used to select the type of process address
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b5b1e0899a8..0a2a19087863 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -246,6 +246,8 @@ extern int printk_ratelimit(void);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
 
+extern int printk_delay_msec;
+
 /*
  * Print a one-time message (analogous to WARN_ONCE() et al):
  */
diff --git a/kernel/printk.c b/kernel/printk.c
index 932ea21feb18..f38b07f78a4e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -653,6 +653,20 @@ static int recursion_bug;
 static int new_text_line = 1;
 static char printk_buf[1024];
 
+int printk_delay_msec __read_mostly;
+
+static inline void printk_delay(void)
+{
+	if (unlikely(printk_delay_msec)) {
+		int m = printk_delay_msec;
+
+		while (m--) {
+			mdelay(1);
+			touch_nmi_watchdog();
+		}
+	}
+}
+
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
 	int printed_len = 0;
@@ -662,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	char *p;
 
 	boot_delay_msec();
+	printk_delay();
 
 	preempt_disable();
 	/* This stops the holder of console_sem just where we want him */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6ba49c7cb128..0dfaa47d7cb6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,6 +106,9 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
+#ifdef CONFIG_PRINTK
+static int ten_thousand = 10000;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -722,6 +725,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "printk_delay",
+		.data		= &printk_delay_msec,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &ten_thousand,
+	},
 #endif
 	{
 		.ctl_name	= KERN_NGROUPS_MAX,
-- 
cgit v1.2.3-71-gd317


From 1fd7317d02ec03c6fdf072317841287933d06d24 Mon Sep 17 00:00:00 2001
From: Nick Black <dank@qemfd.net>
Date: Tue, 22 Sep 2009 16:43:33 -0700
Subject: Move magic numbers into magic.h

Move various magic-number definitions into magic.h.

Signed-off-by: Nick Black <dank@qemfd.net>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c          | 3 +--
 fs/hugetlbfs/inode.c       | 4 +---
 include/linux/magic.h      | 5 +++++
 net/socket.c               | 3 +--
 security/smack/smack_lsm.c | 8 +-------
 5 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 75efb028974b..d5f8c96964be 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,14 +18,13 @@
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/mutex.h>
+#include <linux/magic.h>
 #include <linux/idr.h>
 #include <linux/devpts_fs.h>
 #include <linux/parser.h>
 #include <linux/fsnotify.h>
 #include <linux/seq_file.h>
 
-#define DEVPTS_SUPER_MAGIC 0x1cd1
-
 #define DEVPTS_DEFAULT_MODE 0600
 /*
  * ptmx is a new node in /dev/pts and will be unused in legacy (single-
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 06b7c2623f99..eba6d552d9c9 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,12 +31,10 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/ima.h>
+#include <linux/magic.h>
 
 #include <asm/uaccess.h>
 
-/* some random number */
-#define HUGETLBFS_MAGIC	0x958458f6
-
 static const struct super_operations hugetlbfs_ops;
 static const struct address_space_operations hugetlbfs_aops;
 const struct file_operations hugetlbfs_file_operations;
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 1923327b9869..bce37786a0a5 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -13,6 +13,7 @@
 #define SECURITYFS_MAGIC	0x73636673
 #define SELINUX_MAGIC		0xf97cff8c
 #define TMPFS_MAGIC		0x01021994
+#define HUGETLBFS_MAGIC 	0x958458f6	/* some random number */
 #define SQUASHFS_MAGIC		0x73717368
 #define EFS_SUPER_MAGIC		0x414A53
 #define EXT2_SUPER_MAGIC	0xEF53
@@ -53,4 +54,8 @@
 #define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
 
 #define STACK_END_MAGIC		0x57AC6E9D
+
+#define DEVPTS_SUPER_MAGIC	0x1cd1
+#define SOCKFS_MAGIC		0x534F434B
+
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/net/socket.c b/net/socket.c
index 0ad02ae61a91..49917a1cac7d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -86,6 +86,7 @@
 #include <linux/audit.h>
 #include <linux/wireless.h>
 #include <linux/nsproxy.h>
+#include <linux/magic.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -235,8 +236,6 @@ int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr,
 	return __put_user(klen, ulen);
 }
 
-#define SOCKFS_MAGIC 0x534F434B
-
 static struct kmem_cache *sock_inode_cachep __read_mostly;
 
 static struct inode *sock_alloc_inode(struct super_block *sb)
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index acae7ef4092d..c33b6bb9b6dd 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -30,17 +30,11 @@
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
 #include <linux/audit.h>
+#include <linux/magic.h>
 #include "smack.h"
 
 #define task_security(task)	(task_cred_xxx((task), security))
 
-/*
- * I hope these are the hokeyist lines of code in the module. Casey.
- */
-#define DEVPTS_SUPER_MAGIC	0x1cd1
-#define SOCKFS_MAGIC		0x534F434B
-#define TMPFS_MAGIC		0x01021994
-
 /**
  * smk_fetch - Fetch the smack label from a file.
  * @ip: a pointer to the inode
-- 
cgit v1.2.3-71-gd317


From 54fdade1c3332391948ec43530c02c4794a38172 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 22 Sep 2009 16:43:39 -0700
Subject: generic-ipi: make struct call_function_data lockless

This patch can remove spinlock from struct call_function_data, the
reasons are below:

1: add a new interface for cpumask named cpumask_test_and_clear_cpu(),
   it can atomically test and clear specific cpu, we can use it instead
   of cpumask_test_cpu() and cpumask_clear_cpu() and no need data->lock
   to protect those in generic_smp_call_function_interrupt().

2: in smp_call_function_many(), after csd_lock() return, the current's
   cfd_data is deleted from call_function list, so it not have race
   between other cpus, then cfs_data is only used in
   smp_call_function_many() that must disable preemption and not from
   a hardware interrupthandler or from a bottom half handler to call,
   only the correspond cpu can use it, so it not have race in current
   cpu, no need cfs_data->lock to protect it.

3: after 1 and 2, cfs_data->lock is only use to protect cfs_data->refs in
   generic_smp_call_function_interrupt(), so we can define cfs_data->refs
   to atomic_t, and no need cfs_data->lock any more.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
[akpm@linux-foundation.org: use atomic_dec_return()]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpumask.h | 12 ++++++++++++
 kernel/smp.c            | 29 ++++++++---------------------
 2 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 796df12091b7..9b1d458aac6e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -714,6 +714,18 @@ static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
 	return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask));
 }
 
+/**
+ * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask
+ * @cpu: cpu number (< nr_cpu_ids)
+ * @cpumask: the cpumask pointer
+ *
+ * test_and_clear_bit wrapper for cpumasks.
+ */
+static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
+{
+	return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask));
+}
+
 /**
  * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
  * @dstp: the cpumask pointer
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e218500ab14..fd47a256a24e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,8 +29,7 @@ enum {
 
 struct call_function_data {
 	struct call_single_data	csd;
-	spinlock_t		lock;
-	unsigned int		refs;
+	atomic_t		refs;
 	cpumask_var_t		cpumask;
 };
 
@@ -39,9 +38,7 @@ struct call_single_queue {
 	spinlock_t		lock;
 };
 
-static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
-	.lock			= __SPIN_LOCK_UNLOCKED(cfd_data.lock),
-};
+static DEFINE_PER_CPU(struct call_function_data, cfd_data);
 
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -196,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
 
-		spin_lock(&data->lock);
-		if (!cpumask_test_cpu(cpu, data->cpumask)) {
-			spin_unlock(&data->lock);
+		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
 			continue;
-		}
-		cpumask_clear_cpu(cpu, data->cpumask);
-		spin_unlock(&data->lock);
 
 		data->csd.func(data->csd.info);
 
-		spin_lock(&data->lock);
-		WARN_ON(data->refs == 0);
-		refs = --data->refs;
+		refs = atomic_dec_return(&data->refs);
+		WARN_ON(refs < 0);
 		if (!refs) {
 			spin_lock(&call_function.lock);
 			list_del_rcu(&data->csd.list);
 			spin_unlock(&call_function.lock);
 		}
-		spin_unlock(&data->lock);
 
 		if (refs)
 			continue;
@@ -419,23 +409,20 @@ void smp_call_function_many(const struct cpumask *mask,
 	data = &__get_cpu_var(cfd_data);
 	csd_lock(&data->csd);
 
-	spin_lock_irqsave(&data->lock, flags);
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
 	cpumask_clear_cpu(this_cpu, data->cpumask);
-	data->refs = cpumask_weight(data->cpumask);
+	atomic_set(&data->refs, cpumask_weight(data->cpumask));
 
-	spin_lock(&call_function.lock);
+	spin_lock_irqsave(&call_function.lock, flags);
 	/*
 	 * Place entry at the _HEAD_ of the list, so that any cpu still
 	 * observing the entry in generic_smp_call_function_interrupt()
 	 * will not miss any other list entries:
 	 */
 	list_add_rcu(&data->csd.list, &call_function.queue);
-	spin_unlock(&call_function.lock);
-
-	spin_unlock_irqrestore(&data->lock, flags);
+	spin_unlock_irqrestore(&call_function.lock, flags);
 
 	/*
 	 * Make the list addition visible before sending the ipi.
-- 
cgit v1.2.3-71-gd317


From 88e9d34c727883d7d6f02cf1475b3ec98b8480c7 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Tue, 22 Sep 2009 16:43:43 -0700
Subject: seq_file: constify seq_operations

Make all seq_operations structs const, to help mitigate against
revectoring user-triggerable function pointers.

This is derived from the grsecurity patch, although generated from scratch
because it's simpler than extracting the changes from there.

Signed-off-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mn10300/kernel/setup.c                  |  2 +-
 arch/powerpc/kernel/setup-common.c           |  2 +-
 arch/powerpc/platforms/pseries/hvCall_inst.c |  2 +-
 drivers/block/cciss.c                        |  2 +-
 drivers/char/misc.c                          |  2 +-
 drivers/char/tpm/tpm_bios.c                  |  4 ++--
 drivers/isdn/capi/kcapi_proc.c               | 10 +++++-----
 drivers/scsi/sg.c                            |  6 +++---
 fs/afs/proc.c                                |  8 ++++----
 fs/dlm/debug_fs.c                            | 12 ++++++------
 fs/jbd2/journal.c                            |  4 ++--
 fs/nfs/client.c                              |  4 ++--
 fs/nfsd/export.c                             |  2 +-
 fs/ocfs2/cluster/netdebug.c                  |  4 ++--
 fs/ocfs2/dlm/dlmdebug.c                      |  2 +-
 fs/proc/nommu.c                              |  2 +-
 include/linux/nfsd/nfsd.h                    |  2 +-
 ipc/util.c                                   |  2 +-
 kernel/cgroup.c                              |  2 +-
 kernel/kprobes.c                             |  2 +-
 kernel/lockdep_proc.c                        |  2 +-
 kernel/trace/ftrace.c                        |  4 ++--
 kernel/trace/trace.c                         |  4 ++--
 net/ipv6/ip6mr.c                             |  2 +-
 security/integrity/ima/ima_fs.c              |  4 ++--
 security/smack/smackfs.c                     |  6 +++---
 26 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mn10300/kernel/setup.c b/arch/mn10300/kernel/setup.c
index 79890edfd67a..3f24c298a3af 100644
--- a/arch/mn10300/kernel/setup.c
+++ b/arch/mn10300/kernel/setup.c
@@ -285,7 +285,7 @@ static void c_stop(struct seq_file *m, void *v)
 {
 }
 
-struct seq_operations cpuinfo_op = {
+const struct seq_operations cpuinfo_op = {
 	.start	= c_start,
 	.next	= c_next,
 	.stop	= c_stop,
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 02fed27af7f6..1d5570a1e456 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -328,7 +328,7 @@ static void c_stop(struct seq_file *m, void *v)
 {
 }
 
-struct seq_operations cpuinfo_op = {
+const struct seq_operations cpuinfo_op = {
 	.start =c_start,
 	.next =	c_next,
 	.stop =	c_stop,
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index eae51ef9af24..3631a4f277eb 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -71,7 +71,7 @@ static int hc_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static struct seq_operations hcall_inst_seq_ops = {
+static const struct seq_operations hcall_inst_seq_ops = {
         .start = hc_start,
         .next  = hc_next,
         .stop  = hc_stop,
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 4f19105f755c..24c3e21ab263 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -363,7 +363,7 @@ static void cciss_seq_stop(struct seq_file *seq, void *v)
 	h->busy_configuring = 0;
 }
 
-static struct seq_operations cciss_seq_ops = {
+static const struct seq_operations cciss_seq_ops = {
 	.start = cciss_seq_start,
 	.show  = cciss_seq_show,
 	.next  = cciss_seq_next,
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 1ee27cc23426..07fa612a58d5 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -91,7 +91,7 @@ static int misc_seq_show(struct seq_file *seq, void *v)
 }
 
 
-static struct seq_operations misc_seq_ops = {
+static const struct seq_operations misc_seq_ops = {
 	.start = misc_seq_start,
 	.next  = misc_seq_next,
 	.stop  = misc_seq_stop,
diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c
index 0c2f55a38b95..bf2170fb1cdd 100644
--- a/drivers/char/tpm/tpm_bios.c
+++ b/drivers/char/tpm/tpm_bios.c
@@ -343,14 +343,14 @@ static int tpm_ascii_bios_measurements_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations tpm_ascii_b_measurments_seqops = {
+static const struct seq_operations tpm_ascii_b_measurments_seqops = {
 	.start = tpm_bios_measurements_start,
 	.next = tpm_bios_measurements_next,
 	.stop = tpm_bios_measurements_stop,
 	.show = tpm_ascii_bios_measurements_show,
 };
 
-static struct seq_operations tpm_binary_b_measurments_seqops = {
+static const struct seq_operations tpm_binary_b_measurments_seqops = {
 	.start = tpm_bios_measurements_start,
 	.next = tpm_bios_measurements_next,
 	.stop = tpm_bios_measurements_stop,
diff --git a/drivers/isdn/capi/kcapi_proc.c b/drivers/isdn/capi/kcapi_proc.c
index 50ed778f63fc..09d4db764d22 100644
--- a/drivers/isdn/capi/kcapi_proc.c
+++ b/drivers/isdn/capi/kcapi_proc.c
@@ -89,14 +89,14 @@ static int contrstats_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct seq_operations seq_controller_ops = {
+static const struct seq_operations seq_controller_ops = {
 	.start	= controller_start,
 	.next	= controller_next,
 	.stop	= controller_stop,
 	.show	= controller_show,
 };
 
-static struct seq_operations seq_contrstats_ops = {
+static const struct seq_operations seq_contrstats_ops = {
 	.start	= controller_start,
 	.next	= controller_next,
 	.stop	= controller_stop,
@@ -194,14 +194,14 @@ applstats_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct seq_operations seq_applications_ops = {
+static const struct seq_operations seq_applications_ops = {
 	.start	= applications_start,
 	.next	= applications_next,
 	.stop	= applications_stop,
 	.show	= applications_show,
 };
 
-static struct seq_operations seq_applstats_ops = {
+static const struct seq_operations seq_applstats_ops = {
 	.start	= applications_start,
 	.next	= applications_next,
 	.stop	= applications_stop,
@@ -264,7 +264,7 @@ static int capi_driver_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct seq_operations seq_capi_driver_ops = {
+static const struct seq_operations seq_capi_driver_ops = {
 	.start	= capi_driver_start,
 	.next	= capi_driver_next,
 	.stop	= capi_driver_stop,
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 4968c4ced385..848b59466850 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -2233,7 +2233,7 @@ static struct file_operations dev_fops = {
 	.open = sg_proc_open_dev,
 	.release = seq_release,
 };
-static struct seq_operations dev_seq_ops = {
+static const struct seq_operations dev_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
 	.stop  = dev_seq_stop,
@@ -2246,7 +2246,7 @@ static struct file_operations devstrs_fops = {
 	.open = sg_proc_open_devstrs,
 	.release = seq_release,
 };
-static struct seq_operations devstrs_seq_ops = {
+static const struct seq_operations devstrs_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
 	.stop  = dev_seq_stop,
@@ -2259,7 +2259,7 @@ static struct file_operations debug_fops = {
 	.open = sg_proc_open_debug,
 	.release = seq_release,
 };
-static struct seq_operations debug_seq_ops = {
+static const struct seq_operations debug_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
 	.stop  = dev_seq_stop,
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 8630615e57fe..852739d262a9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -28,7 +28,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v);
 static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 				    size_t size, loff_t *_pos);
 
-static struct seq_operations afs_proc_cells_ops = {
+static const struct seq_operations afs_proc_cells_ops = {
 	.start	= afs_proc_cells_start,
 	.next	= afs_proc_cells_next,
 	.stop	= afs_proc_cells_stop,
@@ -70,7 +70,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v);
 static int afs_proc_cell_volumes_show(struct seq_file *m, void *v);
 
-static struct seq_operations afs_proc_cell_volumes_ops = {
+static const struct seq_operations afs_proc_cell_volumes_ops = {
 	.start	= afs_proc_cell_volumes_start,
 	.next	= afs_proc_cell_volumes_next,
 	.stop	= afs_proc_cell_volumes_stop,
@@ -95,7 +95,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v);
 static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v);
 
-static struct seq_operations afs_proc_cell_vlservers_ops = {
+static const struct seq_operations afs_proc_cell_vlservers_ops = {
 	.start	= afs_proc_cell_vlservers_start,
 	.next	= afs_proc_cell_vlservers_next,
 	.stop	= afs_proc_cell_vlservers_stop,
@@ -119,7 +119,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
 static void afs_proc_cell_servers_stop(struct seq_file *p, void *v);
 static int afs_proc_cell_servers_show(struct seq_file *m, void *v);
 
-static struct seq_operations afs_proc_cell_servers_ops = {
+static const struct seq_operations afs_proc_cell_servers_ops = {
 	.start	= afs_proc_cell_servers_start,
 	.next	= afs_proc_cell_servers_next,
 	.stop	= afs_proc_cell_servers_stop,
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1d1d27442235..1c8bb8c3a82e 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -386,9 +386,9 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
 	return rv;
 }
 
-static struct seq_operations format1_seq_ops;
-static struct seq_operations format2_seq_ops;
-static struct seq_operations format3_seq_ops;
+static const struct seq_operations format1_seq_ops;
+static const struct seq_operations format2_seq_ops;
+static const struct seq_operations format3_seq_ops;
 
 static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 {
@@ -534,21 +534,21 @@ static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
 	}
 }
 
-static struct seq_operations format1_seq_ops = {
+static const struct seq_operations format1_seq_ops = {
 	.start = table_seq_start,
 	.next  = table_seq_next,
 	.stop  = table_seq_stop,
 	.show  = table_seq_show,
 };
 
-static struct seq_operations format2_seq_ops = {
+static const struct seq_operations format2_seq_ops = {
 	.start = table_seq_start,
 	.next  = table_seq_next,
 	.stop  = table_seq_stop,
 	.show  = table_seq_show,
 };
 
-static struct seq_operations format3_seq_ops = {
+static const struct seq_operations format3_seq_ops = {
 	.start = table_seq_start,
 	.next  = table_seq_next,
 	.stop  = table_seq_stop,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a8a358bc0f21..53b86e16e5fe 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -768,7 +768,7 @@ static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations jbd2_seq_history_ops = {
+static const struct seq_operations jbd2_seq_history_ops = {
 	.start  = jbd2_seq_history_start,
 	.next   = jbd2_seq_history_next,
 	.stop   = jbd2_seq_history_stop,
@@ -872,7 +872,7 @@ static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations jbd2_seq_info_ops = {
+static const struct seq_operations jbd2_seq_info_ops = {
 	.start  = jbd2_seq_info_start,
 	.next   = jbd2_seq_info_next,
 	.stop   = jbd2_seq_info_stop,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a7ce15d3c248..152025358dad 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1531,7 +1531,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
 static void nfs_server_list_stop(struct seq_file *p, void *v);
 static int nfs_server_list_show(struct seq_file *m, void *v);
 
-static struct seq_operations nfs_server_list_ops = {
+static const struct seq_operations nfs_server_list_ops = {
 	.start	= nfs_server_list_start,
 	.next	= nfs_server_list_next,
 	.stop	= nfs_server_list_stop,
@@ -1552,7 +1552,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
 static void nfs_volume_list_stop(struct seq_file *p, void *v);
 static int nfs_volume_list_show(struct seq_file *m, void *v);
 
-static struct seq_operations nfs_volume_list_ops = {
+static const struct seq_operations nfs_volume_list_ops = {
 	.start	= nfs_volume_list_start,
 	.next	= nfs_volume_list_next,
 	.stop	= nfs_volume_list_stop,
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 984a5ebcc1d6..c1c9e035d4a4 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1517,7 +1517,7 @@ static int e_show(struct seq_file *m, void *p)
 	return svc_export_show(m, &svc_export_cache, cp);
 }
 
-struct seq_operations nfs_exports_op = {
+const struct seq_operations nfs_exports_op = {
 	.start	= e_start,
 	.next	= e_next,
 	.stop	= e_stop,
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index f8424874fa07..cfb2be708abe 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations nst_seq_ops = {
+static const struct seq_operations nst_seq_ops = {
 	.start = nst_seq_start,
 	.next = nst_seq_next,
 	.stop = nst_seq_stop,
@@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations sc_seq_ops = {
+static const struct seq_operations sc_seq_ops = {
 	.start = sc_seq_start,
 	.next = sc_seq_next,
 	.stop = sc_seq_stop,
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index df52f706f669..c5c88124096d 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -683,7 +683,7 @@ static int lockres_seq_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static struct seq_operations debug_lockres_ops = {
+static const struct seq_operations debug_lockres_ops = {
 	.start =	lockres_seq_start,
 	.stop =		lockres_seq_stop,
 	.next =		lockres_seq_next,
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 7e14d1a04001..9fe7d7ebe115 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -109,7 +109,7 @@ static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
 	return rb_next((struct rb_node *) v);
 }
 
-static struct seq_operations proc_nommu_region_list_seqop = {
+static const struct seq_operations proc_nommu_region_list_seqop = {
 	.start	= nommu_region_list_start,
 	.next	= nommu_region_list_next,
 	.stop	= nommu_region_list_stop,
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 03bbe9039104..510ffdd5020e 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -60,7 +60,7 @@ extern spinlock_t		nfsd_drc_lock;
 extern unsigned int		nfsd_drc_max_mem;
 extern unsigned int		nfsd_drc_mem_used;
 
-extern struct seq_operations nfs_exports_op;
+extern const struct seq_operations nfs_exports_op;
 
 /*
  * Function prototypes.
diff --git a/ipc/util.c b/ipc/util.c
index b8e4ba92f6d1..79ce84e890f7 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -942,7 +942,7 @@ static int sysvipc_proc_show(struct seq_file *s, void *it)
 	return iface->show(s, it);
 }
 
-static struct seq_operations sysvipc_proc_seqops = {
+static const struct seq_operations sysvipc_proc_seqops = {
 	.start = sysvipc_proc_start,
 	.stop  = sysvipc_proc_stop,
 	.next  = sysvipc_proc_next,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 213b7f92fcdd..cd83d9933b6b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2314,7 +2314,7 @@ static int cgroup_tasks_show(struct seq_file *s, void *v)
 	return seq_printf(s, "%d\n", *(int *)v);
 }
 
-static struct seq_operations cgroup_tasks_seq_operations = {
+static const struct seq_operations cgroup_tasks_seq_operations = {
 	.start = cgroup_tasks_start,
 	.stop = cgroup_tasks_stop,
 	.next = cgroup_tasks_next,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ef177d653b2c..cfadc1291d0b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
 	return 0;
 }
 
-static struct seq_operations kprobes_seq_ops = {
+static const struct seq_operations kprobes_seq_ops = {
 	.start = kprobe_seq_start,
 	.next  = kprobe_seq_next,
 	.stop  = kprobe_seq_stop,
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4b3dbc79fdb..d4aba4f3584c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations lockstat_ops = {
+static const struct seq_operations lockstat_ops = {
 	.start	= ls_start,
 	.next	= ls_next,
 	.stop	= ls_stop,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c71e91bf7372..23df7771c937 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1520,7 +1520,7 @@ static int t_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations show_ftrace_seq_ops = {
+static const struct seq_operations show_ftrace_seq_ops = {
 	.start = t_start,
 	.next = t_next,
 	.stop = t_stop,
@@ -2459,7 +2459,7 @@ static int g_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations ftrace_graph_seq_ops = {
+static const struct seq_operations ftrace_graph_seq_ops = {
 	.start = g_start,
 	.next = g_next,
 	.stop = g_stop,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a35925d222ba..6c0f6a8a22eb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1949,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations tracer_seq_ops = {
+static const struct seq_operations tracer_seq_ops = {
 	.start		= s_start,
 	.next		= s_next,
 	.stop		= s_stop,
@@ -2163,7 +2163,7 @@ static int t_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations show_traces_seq_ops = {
+static const struct seq_operations show_traces_seq_ops = {
 	.start		= t_start,
 	.next		= t_next,
 	.stop		= t_stop,
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 3907510c2ce3..090675e269ee 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -324,7 +324,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct seq_operations ipmr_mfc_seq_ops = {
+static const struct seq_operations ipmr_mfc_seq_ops = {
 	.start = ipmr_mfc_seq_start,
 	.next  = ipmr_mfc_seq_next,
 	.stop  = ipmr_mfc_seq_stop,
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index 6bfc7eaebfda..8e9777b76405 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -146,7 +146,7 @@ static int ima_measurements_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations ima_measurments_seqops = {
+static const struct seq_operations ima_measurments_seqops = {
 	.start = ima_measurements_start,
 	.next = ima_measurements_next,
 	.stop = ima_measurements_stop,
@@ -221,7 +221,7 @@ static int ima_ascii_measurements_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations ima_ascii_measurements_seqops = {
+static const struct seq_operations ima_ascii_measurements_seqops = {
 	.start = ima_measurements_start,
 	.next = ima_measurements_next,
 	.stop = ima_measurements_stop,
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index f83a80980726..aeead7585093 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -187,7 +187,7 @@ static void load_seq_stop(struct seq_file *s, void *v)
 	/* No-op */
 }
 
-static struct seq_operations load_seq_ops = {
+static const struct seq_operations load_seq_ops = {
 	.start = load_seq_start,
 	.next  = load_seq_next,
 	.show  = load_seq_show,
@@ -503,7 +503,7 @@ static void cipso_seq_stop(struct seq_file *s, void *v)
 	/* No-op */
 }
 
-static struct seq_operations cipso_seq_ops = {
+static const struct seq_operations cipso_seq_ops = {
 	.start = cipso_seq_start,
 	.stop  = cipso_seq_stop,
 	.next  = cipso_seq_next,
@@ -697,7 +697,7 @@ static void netlbladdr_seq_stop(struct seq_file *s, void *v)
 	/* No-op */
 }
 
-static struct seq_operations netlbladdr_seq_ops = {
+static const struct seq_operations netlbladdr_seq_ops = {
 	.start = netlbladdr_seq_start,
 	.stop  = netlbladdr_seq_stop,
 	.next  = netlbladdr_seq_next,
-- 
cgit v1.2.3-71-gd317


From 02b51df1b07b4e9ca823c89284e704cadb323cd1 Mon Sep 17 00:00:00 2001
From: Scott James Remnant <scott@ubuntu.com>
Date: Tue, 22 Sep 2009 16:43:44 -0700
Subject: proc connector: add event for process becoming session leader

The act of a process becoming a session leader is a useful signal to a
supervising init daemon such as Upstart.

While a daemon will normally do this as part of the process of becoming a
daemon, it is rare for its children to do so.  When the children do, it is
nearly always a sign that the child should be considered detached from the
parent and not supervised along with it.

The poster-child example is OpenSSH; the per-login children call setsid()
so that they may control the pty connected to them.  If the primary daemon
dies or is restarted, we do not want to consider the per-login children
and want to respawn the primary daemon without killing the children.

This patch adds a new PROC_SID_EVENT and associated structure to the
proc_event event_data union, it arranges for this to be emitted when the
special PIDTYPE_SID pid is set.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Scott James Remnant <scott@ubuntu.com>
Acked-by: Matt Helsley <matthltc@us.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Acked-by: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/connector/cn_proc.c | 25 +++++++++++++++++++++++++
 include/linux/cn_proc.h     | 10 ++++++++++
 kernel/exit.c               |  4 +++-
 3 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 85e5dc0431fe..abf4a2529f80 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -139,6 +139,31 @@ void proc_id_connector(struct task_struct *task, int which_id)
 	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
+void proc_sid_connector(struct task_struct *task)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	struct timespec ts;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg *)buffer;
+	ev = (struct proc_event *)msg->data;
+	get_seq(&msg->seq, &ev->cpu);
+	ktime_get_ts(&ts); /* get high res monotonic timestamp */
+	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
+	ev->what = PROC_EVENT_SID;
+	ev->event_data.sid.process_pid = task->pid;
+	ev->event_data.sid.process_tgid = task->tgid;
+
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = 0; /* not used */
+	msg->len = sizeof(*ev);
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
 void proc_exit_connector(struct task_struct *task)
 {
 	struct cn_msg *msg;
diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
index b8125b2eb665..47dac5ea8d3a 100644
--- a/include/linux/cn_proc.h
+++ b/include/linux/cn_proc.h
@@ -52,6 +52,7 @@ struct proc_event {
 		PROC_EVENT_EXEC = 0x00000002,
 		PROC_EVENT_UID  = 0x00000004,
 		PROC_EVENT_GID  = 0x00000040,
+		PROC_EVENT_SID  = 0x00000080,
 		/* "next" should be 0x00000400 */
 		/* "last" is the last process event: exit */
 		PROC_EVENT_EXIT = 0x80000000
@@ -89,6 +90,11 @@ struct proc_event {
 			} e;
 		} id;
 
+		struct sid_proc_event {
+			__kernel_pid_t process_pid;
+			__kernel_pid_t process_tgid;
+		} sid;
+
 		struct exit_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
@@ -102,6 +108,7 @@ struct proc_event {
 void proc_fork_connector(struct task_struct *task);
 void proc_exec_connector(struct task_struct *task);
 void proc_id_connector(struct task_struct *task, int which_id);
+void proc_sid_connector(struct task_struct *task);
 void proc_exit_connector(struct task_struct *task);
 #else
 static inline void proc_fork_connector(struct task_struct *task)
@@ -114,6 +121,9 @@ static inline void proc_id_connector(struct task_struct *task,
 				     int which_id)
 {}
 
+static inline void proc_sid_connector(struct task_struct *task)
+{}
+
 static inline void proc_exit_connector(struct task_struct *task)
 {}
 #endif	/* CONFIG_PROC_EVENTS */
diff --git a/kernel/exit.c b/kernel/exit.c
index e47ee8a06135..61bb1761c7b8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
 
-	if (task_session(curr) != pid)
+	if (task_session(curr) != pid) {
 		change_pid(curr, PIDTYPE_SID, pid);
+		proc_sid_connector(curr);
+	}
 
 	if (task_pgrp(curr) != pid)
 		change_pid(curr, PIDTYPE_PGID, pid);
-- 
cgit v1.2.3-71-gd317


From 70867453092297be9afb2249e712a1f960ec0a09 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rdreier@cisco.com>
Date: Tue, 22 Sep 2009 16:43:46 -0700
Subject: printk_once(): use bool for boolean flag

Using the type bool (instead of int) for the __print_once flag in the
printk_once() macro matches the intent of the code better, and allows the
compiler to generate smaller code; eg a typical callsite with gcc 4.3.3 on
i386:

add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-6 (-6)
function                                     old     new   delta
static.__print_once                            4       1      -3
get_cpu_vendor                               146     143      -3

Saving 6 bytes of object size per callsite by slightly improving the
readability of the source seems like a win to me.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 0a2a19087863..55723afa097b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -252,10 +252,10 @@ extern int printk_delay_msec;
  * Print a one-time message (analogous to WARN_ONCE() et al):
  */
 #define printk_once(x...) ({			\
-	static int __print_once = 1;		\
+	static bool __print_once = true;	\
 						\
 	if (__print_once) {			\
-		__print_once = 0;		\
+		__print_once = false;		\
 		printk(x);			\
 	}					\
 })
-- 
cgit v1.2.3-71-gd317


From 8c87df457cb58fe75b9b893007917cf8095660a0 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Tue, 22 Sep 2009 16:43:52 -0700
Subject: BUILD_BUG_ON(): fix it and a couple of bogus uses of it

gcc permitting variable length arrays makes the current construct used for
BUILD_BUG_ON() useless, as that doesn't produce any diagnostic if the
controlling expression isn't really constant.  Instead, this patch makes
it so that a bit field gets used here.  Consequently, those uses where the
condition isn't really constant now also need fixing.

Note that in the gfp.h, kmemcheck.h, and virtio_config.h cases
MAYBE_BUILD_BUG_ON() really just serves documentation purposes - even if
the expression is compile time constant (__builtin_constant_p() yields
true), the array is still deemed of variable length by gcc, and hence the
whole expression doesn't have the intended effect.

[akpm@linux-foundation.org: make arch/sparc/include/asm/vio.h compile]
[akpm@linux-foundation.org: more nonsensical assertions in tpm.c..]
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Rajiv Andrade <srajiv@linux.vnet.ibm.com>
Cc: Mimi Zohar <zohar@us.ibm.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sparc/include/asm/vio.h  | 2 +-
 drivers/char/tpm/tpm.c        | 4 ++--
 drivers/net/niu.c             | 2 +-
 include/linux/gfp.h           | 2 +-
 include/linux/kernel.h        | 8 ++++++--
 include/linux/kmemcheck.h     | 2 +-
 include/linux/virtio_config.h | 3 +--
 7 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/include/asm/vio.h b/arch/sparc/include/asm/vio.h
index d4de32f0f8af..6cdbf7e7351d 100644
--- a/arch/sparc/include/asm/vio.h
+++ b/arch/sparc/include/asm/vio.h
@@ -258,7 +258,7 @@ static inline void *vio_dring_entry(struct vio_dring_state *dr,
 static inline u32 vio_dring_avail(struct vio_dring_state *dr,
 				  unsigned int ring_size)
 {
-	BUILD_BUG_ON(!is_power_of_2(ring_size));
+	MAYBE_BUILD_BUG_ON(!is_power_of_2(ring_size));
 
 	return (dr->pending -
 		((dr->prod - dr->cons) & (ring_size - 1)));
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index b0603b2e5684..32b957efa420 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -696,7 +696,7 @@ int __tpm_pcr_read(struct tpm_chip *chip, int pcr_idx, u8 *res_buf)
 
 	cmd.header.in = pcrread_header;
 	cmd.params.pcrread_in.pcr_idx = cpu_to_be32(pcr_idx);
-	BUILD_BUG_ON(cmd.header.in.length > READ_PCR_RESULT_SIZE);
+	BUG_ON(cmd.header.in.length > READ_PCR_RESULT_SIZE);
 	rc = transmit_cmd(chip, &cmd, cmd.header.in.length,
 			  "attempting to read a pcr value");
 
@@ -760,7 +760,7 @@ int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash)
 		return -ENODEV;
 
 	cmd.header.in = pcrextend_header;
-	BUILD_BUG_ON(be32_to_cpu(cmd.header.in.length) > EXTEND_PCR_SIZE);
+	BUG_ON(be32_to_cpu(cmd.header.in.length) > EXTEND_PCR_SIZE);
 	cmd.params.pcrextend_in.pcr_idx = cpu_to_be32(pcr_idx);
 	memcpy(cmd.params.pcrextend_in.hash, hash, TPM_DIGEST_SIZE);
 	rc = transmit_cmd(chip, &cmd, cmd.header.in.length,
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index 76cc2614f480..f9364d0678f2 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -5615,7 +5615,7 @@ static void niu_init_tx_mac(struct niu *np)
 	/* The XMAC_MIN register only accepts values for TX min which
 	 * have the low 3 bits cleared.
 	 */
-	BUILD_BUG_ON(min & 0x7);
+	BUG_ON(min & 0x7);
 
 	if (np->flags & NIU_FLAGS_XMAC)
 		niu_init_tx_xmac(np, min, max);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f53e9b868c26..557bdad320b6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -220,7 +220,7 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 					 ((1 << ZONES_SHIFT) - 1);
 
 	if (__builtin_constant_p(bit))
-		BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
+		MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
 	else {
 #ifdef CONFIG_DEBUG_VM
 		BUG_ON((GFP_ZONE_BAD >> bit) & 1);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 55723afa097b..63dcaece1ac5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -678,13 +678,17 @@ struct sysinfo {
 };
 
 /* Force a compilation error if condition is true */
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
+
+/* Force a compilation error if condition is constant and true */
+#define MAYBE_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))
 
 /* Force a compilation error if condition is true, but also produce a
    result (of value 0 and type size_t), so the expression can be used
    e.g. in a structure initializer (or where-ever else comma expressions
    aren't permitted). */
-#define BUILD_BUG_ON_ZERO(e) (sizeof(char[1 - 2 * !!(e)]) - 1)
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
 
 /* Trap pasters of __FUNCTION__ at compile-time */
 #define __FUNCTION__ (__func__)
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
index 136cdcdf92ef..e880d4cf9e22 100644
--- a/include/linux/kmemcheck.h
+++ b/include/linux/kmemcheck.h
@@ -152,7 +152,7 @@ static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
 									\
 		_n = (long) &((ptr)->name##_end)			\
 			- (long) &((ptr)->name##_begin);		\
-		BUILD_BUG_ON(_n < 0);					\
+		MAYBE_BUILD_BUG_ON(_n < 0);				\
 									\
 		kmemcheck_mark_initialized(&((ptr)->name##_begin), _n);	\
 	} while (0)
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index e547e3c8ee9a..0093dd7c1d6f 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -109,8 +109,7 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
 				      unsigned int fbit)
 {
 	/* Did you forget to fix assumptions on max features? */
-	if (__builtin_constant_p(fbit))
-		BUILD_BUG_ON(fbit >= 32);
+	MAYBE_BUILD_BUG_ON(fbit >= 32);
 
 	if (fbit < VIRTIO_TRANSPORT_F_START)
 		virtio_check_driver_offered_feature(vdev, fbit);
-- 
cgit v1.2.3-71-gd317


From 562787a5c32ccdf182de27793a83a9f2ee86cd77 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 22 Sep 2009 16:43:57 -0700
Subject: anonfd: split interface into file creation and install

Split the anonfd interface into a bare file pointer creation one, and a
file pointer creation plus install one.

There are cases, like the usage of eventfds inside other kernel
interfaces, where the file pointer created by anonfd needs to be used
inside the initialization of other structures.

As it is right now, as soon as anon_inode_getfd() returns, the kenrle can
race with userspace closing the newly installed file descriptor.

This patch, while keeping the old anon_inode_getfd(), introduces a new
anon_inode_getfile() (whose services are reused in anon_inode_getfd())
that allows to split the file creation phase and the fd install one.

Once all the kernel structures are initialized, the code can call the
proper fd_install().

Gregory manifested the need for something like this inside KVM.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: James Morris <jmorris@namei.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Gregory Haskins <ghaskins@novell.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/anon_inodes.c            | 68 +++++++++++++++++++++++++++++++++------------
 fs/eventfd.c                | 67 +++++++++++++++++++++++++++++++++++---------
 include/linux/anon_inodes.h |  3 ++
 include/linux/eventfd.h     |  6 ++++
 4 files changed, 114 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47d4a01c5393..d11c51fc2a3f 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -77,28 +77,24 @@ static const struct address_space_operations anon_aops = {
  *
  * Creates a new file by hooking it on a single inode. This is useful for files
  * that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfd() will share a single inode,
+ * All the files created with anon_inode_getfile() will share a single inode,
  * hence saving memory and avoiding code duplication for the file/inode/dentry
- * setup.  Returns new descriptor or -error.
+ * setup.  Returns the newly created file* or an error pointer.
  */
-int anon_inode_getfd(const char *name, const struct file_operations *fops,
-		     void *priv, int flags)
+struct file *anon_inode_getfile(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags)
 {
 	struct qstr this;
 	struct dentry *dentry;
 	struct file *file;
-	int error, fd;
+	int error;
 
 	if (IS_ERR(anon_inode_inode))
-		return -ENODEV;
+		return ERR_PTR(-ENODEV);
 
 	if (fops->owner && !try_module_get(fops->owner))
-		return -ENOENT;
-
-	error = get_unused_fd_flags(flags);
-	if (error < 0)
-		goto err_module;
-	fd = error;
+		return ERR_PTR(-ENOENT);
 
 	/*
 	 * Link the inode to a directory entry by creating a unique name
@@ -110,7 +106,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	this.hash = 0;
 	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
 	if (!dentry)
-		goto err_put_unused_fd;
+		goto err_module;
 
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
@@ -136,16 +132,54 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	file->f_version = 0;
 	file->private_data = priv;
 
+	return file;
+
+err_dput:
+	dput(dentry);
+err_module:
+	module_put(fops->owner);
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile);
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
+ *                    anonymous inode, and a dentry that describe the "class"
+ *                    of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.  Returns new descriptor or an error code.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	int error, fd;
+	struct file *file;
+
+	error = get_unused_fd_flags(flags);
+	if (error < 0)
+		return error;
+	fd = error;
+
+	file = anon_inode_getfile(name, fops, priv, flags);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
 	fd_install(fd, file);
 
 	return fd;
 
-err_dput:
-	dput(dentry);
 err_put_unused_fd:
 	put_unused_fd(fd);
-err_module:
-	module_put(fops->owner);
 	return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 31d12de83a2a..8b47e4200e65 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -68,11 +68,16 @@ int eventfd_signal(struct eventfd_ctx *ctx, int n)
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
+static void eventfd_free_ctx(struct eventfd_ctx *ctx)
+{
+	kfree(ctx);
+}
+
 static void eventfd_free(struct kref *kref)
 {
 	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
 
-	kfree(ctx);
+	eventfd_free_ctx(ctx);
 }
 
 /**
@@ -298,9 +303,23 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
 }
 EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 
-SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
+/**
+ * eventfd_file_create - Creates an eventfd file pointer.
+ * @count: Initial eventfd counter value.
+ * @flags: Flags for the eventfd file.
+ *
+ * This function creates an eventfd file pointer, w/out installing it into
+ * the fd table. This is useful when the eventfd file is used during the
+ * initialization of data structures that require extra setup after the eventfd
+ * creation. So the eventfd creation is split into the file pointer creation
+ * phase, and the file descriptor installation phase.
+ * In this way races with userspace closing the newly installed file descriptor
+ * can be avoided.
+ * Returns an eventfd file pointer, or a proper error pointer.
+ */
+struct file *eventfd_file_create(unsigned int count, int flags)
 {
-	int fd;
+	struct file *file;
 	struct eventfd_ctx *ctx;
 
 	/* Check the EFD_* constants for consistency.  */
@@ -308,26 +327,48 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 
 	if (flags & ~EFD_FLAGS_SET)
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
 
-	/*
-	 * When we call this, the initialization must be complete, since
-	 * anon_inode_getfd() will install the fd.
-	 */
-	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
-			      flags & EFD_SHARED_FCNTL_FLAGS);
-	if (fd < 0)
-		kfree(ctx);
+	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
+				  flags & EFD_SHARED_FCNTL_FLAGS);
+	if (IS_ERR(file))
+		eventfd_free_ctx(ctx);
+
+	return file;
+}
+
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
+{
+	int fd, error;
+	struct file *file;
+
+	error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
+	if (error < 0)
+		return error;
+	fd = error;
+
+	file = eventfd_file_create(count, flags);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	fd_install(fd, file);
+
 	return fd;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+
+	return error;
 }
 
 SYSCALL_DEFINE1(eventfd, unsigned int, count)
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index e0a0cdc2da43..69a21e0ebd33 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -8,6 +8,9 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+struct file *anon_inode_getfile(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags);
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		     void *priv, int flags);
 
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 3b85ba6479f4..94dd10366a78 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -27,6 +27,7 @@
 
 #ifdef CONFIG_EVENTFD
 
+struct file *eventfd_file_create(unsigned int count, int flags);
 struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
 void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
@@ -40,6 +41,11 @@ int eventfd_signal(struct eventfd_ctx *ctx, int n);
  * Ugly ugly ugly error layer to support modules that uses eventfd but
  * pretend to work in !CONFIG_EVENTFD configurations. Namely, AIO.
  */
+static inline struct file *eventfd_file_create(unsigned int count, int flags)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
 static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 {
 	return ERR_PTR(-ENOSYS);
-- 
cgit v1.2.3-71-gd317


From a49c59c042c63b432307c1bbf7dac5a104c786e6 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Tue, 22 Sep 2009 16:44:03 -0700
Subject: Make sure the value in abs() does not get truncated if it is greater
 than 2^32

abs() will truncate the input if is it outside the 2^32 range.  Fix that
by assuming `long' input.

This might generate worse code in the common case.

Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 63dcaece1ac5..d3cd23f30039 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -146,7 +146,7 @@ extern int _cond_resched(void);
 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
 
 #define abs(x) ({				\
-		int __x = (x);			\
+		long __x = (x);			\
 		(__x < 0) ? -__x : __x;		\
 	})
 
-- 
cgit v1.2.3-71-gd317


From 1f10206cf8e945220f7220a809d8bfc15c21f9a5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 22 Sep 2009 16:44:10 -0700
Subject: getrusage: fill ru_maxrss value

Make ->ru_maxrss value in struct rusage filled accordingly to rss hiwater
mark.  This struct is filled as a parameter to getrusage syscall.
->ru_maxrss value is set to KBs which is the way it is done in BSD
systems.  /usr/bin/time (gnu time) application converts ->ru_maxrss to KBs
which seems to be incorrect behavior.  Maintainer of this util was
notified by me with the patch which corrects it and cc'ed.

To make this happen we extend struct signal_struct by two fields.  The
first one is ->maxrss which we use to store rss hiwater of the task.  The
second one is ->cmaxrss which we use to store highest rss hiwater of all
task childs.  These values are used in k_getrusage() to actually fill
->ru_maxrss.  k_getrusage() uses current rss hiwater value directly if mm
struct exists.

Note:
exec() clear mm->hiwater_rss, but doesn't clear sig->maxrss.
it is intetionally behavior. *BSD getrusage have exec() inheriting.

test programs
========================================================

getrusage.c
===========
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <signal.h>
 #include <sys/mman.h>

 #include "common.h"

 #define err(str) perror(str), exit(1)

int main(int argc, char** argv)
{
	int status;

	printf("allocate 100MB\n");
	consume(100);

	printf("testcase1: fork inherit? \n");
	printf("  expect: initial.self ~= child.self\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
	} else {
		show_rusage("fork child");
		_exit(0);
	}
	printf("\n");

	printf("testcase2: fork inherit? (cont.) \n");
	printf("  expect: initial.children ~= 100MB, but child.children = 0\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
	} else {
		show_rusage("child");
		_exit(0);
	}
	printf("\n");

	printf("testcase3: fork + malloc \n");
	printf("  expect: child.self ~= initial.self + 50MB\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
	} else {
		printf("allocate +50MB\n");
		consume(50);
		show_rusage("fork child");
		_exit(0);
	}
	printf("\n");

	printf("testcase4: grandchild maxrss\n");
	printf("  expect: post_wait.children ~= 300MB\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
		show_rusage("post_wait");
	} else {
		system("./child -n 0 -g 300");
		_exit(0);
	}
	printf("\n");

	printf("testcase5: zombie\n");
	printf("  expect: pre_wait ~= initial, IOW the zombie process is not accounted.\n");
	printf("          post_wait ~= 400MB, IOW wait() collect child's max_rss. \n");
	show_rusage("initial");
	if (__fork()) {
		sleep(1); /* children become zombie */
		show_rusage("pre_wait");
		wait(&status);
		show_rusage("post_wait");
	} else {
		system("./child -n 400");
		_exit(0);
	}
	printf("\n");

	printf("testcase6: SIG_IGN\n");
	printf("  expect: initial ~= after_zombie (child's 500MB alloc should be ignored).\n");
	show_rusage("initial");
	signal(SIGCHLD, SIG_IGN);
	if (__fork()) {
		sleep(1); /* children become zombie */
		show_rusage("after_zombie");
	} else {
		system("./child -n 500");
		_exit(0);
	}
	printf("\n");
	signal(SIGCHLD, SIG_DFL);

	printf("testcase7: exec (without fork) \n");
	printf("  expect: initial ~= exec \n");
	show_rusage("initial");
	execl("./child", "child", "-v", NULL);

	return 0;
}

child.c
=======
 #include <sys/types.h>
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>

 #include "common.h"

int main(int argc, char** argv)
{
	int status;
	int c;
	long consume_size = 0;
	long grandchild_consume_size = 0;
	int show = 0;

	while ((c = getopt(argc, argv, "n:g:v")) != -1) {
		switch (c) {
		case 'n':
			consume_size = atol(optarg);
			break;
		case 'v':
			show = 1;
			break;
		case 'g':

			grandchild_consume_size = atol(optarg);
			break;
		default:
			break;
		}
	}

	if (show)
		show_rusage("exec");

	if (consume_size) {
		printf("child alloc %ldMB\n", consume_size);
		consume(consume_size);
	}

	if (grandchild_consume_size) {
		if (fork()) {
			wait(&status);
		} else {
			printf("grandchild alloc %ldMB\n", grandchild_consume_size);
			consume(grandchild_consume_size);

			exit(0);
		}
	}

	return 0;
}

common.c
========
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <signal.h>
 #include <sys/mman.h>

 #include "common.h"
 #define err(str) perror(str), exit(1)

void show_rusage(char *prefix)
{
    	int err, err2;
    	struct rusage rusage_self;
    	struct rusage rusage_children;

    	printf("%s: ", prefix);
    	err = getrusage(RUSAGE_SELF, &rusage_self);
    	if (!err)
    		printf("self %ld ", rusage_self.ru_maxrss);
    	err2 = getrusage(RUSAGE_CHILDREN, &rusage_children);
    	if (!err2)
    		printf("children %ld ", rusage_children.ru_maxrss);

    	printf("\n");
}

/* Some buggy OS need this worthless CPU waste. */
void make_pagefault(void)
{
	void *addr;
	int size = getpagesize();
	int i;

	for (i=0; i<1000; i++) {
		addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
		if (addr == MAP_FAILED)
			err("make_pagefault");
		memset(addr, 0, size);
		munmap(addr, size);
	}
}

void consume(int mega)
{
    	size_t sz = mega * 1024 * 1024;
    	void *ptr;

    	ptr = malloc(sz);
    	memset(ptr, 0, sz);
	make_pagefault();
}

pid_t __fork(void)
{
	pid_t pid;

	pid = fork();
	make_pagefault();

	return pid;
}

common.h
========
void show_rusage(char *prefix);
void make_pagefault(void);
void consume(int mega);
pid_t __fork(void);

FreeBSD result (expected result)
========================================================
allocate 100MB
testcase1: fork inherit?
  expect: initial.self ~= child.self
initial: self 103492 children 0
fork child: self 103540 children 0

testcase2: fork inherit? (cont.)
  expect: initial.children ~= 100MB, but child.children = 0
initial: self 103540 children 103540
child: self 103564 children 0

testcase3: fork + malloc
  expect: child.self ~= initial.self + 50MB
initial: self 103564 children 103564
allocate +50MB
fork child: self 154860 children 0

testcase4: grandchild maxrss
  expect: post_wait.children ~= 300MB
initial: self 103564 children 154860
grandchild alloc 300MB
post_wait: self 103564 children 308720

testcase5: zombie
  expect: pre_wait ~= initial, IOW the zombie process is not accounted.
          post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 103564 children 308720
child alloc 400MB
pre_wait: self 103564 children 308720
post_wait: self 103564 children 411312

testcase6: SIG_IGN
  expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 103564 children 411312
child alloc 500MB
after_zombie: self 103624 children 411312

testcase7: exec (without fork)
  expect: initial ~= exec
initial: self 103624 children 411312
exec: self 103624 children 411312

Linux result (actual test result)
========================================================
allocate 100MB
testcase1: fork inherit?
  expect: initial.self ~= child.self
initial: self 102848 children 0
fork child: self 102572 children 0

testcase2: fork inherit? (cont.)
  expect: initial.children ~= 100MB, but child.children = 0
initial: self 102876 children 102644
child: self 102572 children 0

testcase3: fork + malloc
  expect: child.self ~= initial.self + 50MB
initial: self 102876 children 102644
allocate +50MB
fork child: self 153804 children 0

testcase4: grandchild maxrss
  expect: post_wait.children ~= 300MB
initial: self 102876 children 153864
grandchild alloc 300MB
post_wait: self 102876 children 307536

testcase5: zombie
  expect: pre_wait ~= initial, IOW the zombie process is not accounted.
          post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 102876 children 307536
child alloc 400MB
pre_wait: self 102876 children 307536
post_wait: self 102876 children 410076

testcase6: SIG_IGN
  expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 102876 children 410076
child alloc 500MB
after_zombie: self 102880 children 410076

testcase7: exec (without fork)
  expect: initial ~= exec
initial: self 102880 children 410076
exec: self 102880 children 410076

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c             |  3 +++
 include/linux/sched.h | 10 ++++++++++
 kernel/exit.c         |  6 ++++++
 kernel/fork.c         |  1 +
 kernel/sys.c          | 14 ++++++++++++++
 5 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 434dba778ccc..69bb9d899791 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -845,6 +845,9 @@ static int de_thread(struct task_struct *tsk)
 	sig->notify_count = 0;
 
 no_thread_group:
+	if (current->mm)
+		setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
+
 	exit_itimers(sig);
 	flush_itimer_signals();
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 97b10da0a3ea..6448bbc6406b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -426,6 +426,15 @@ static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 	return max(mm->hiwater_rss, get_mm_rss(mm));
 }
 
+static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
+					 struct mm_struct *mm)
+{
+	unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
+
+	if (*maxrss < hiwater_rss)
+		*maxrss = hiwater_rss;
+}
+
 static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
 {
 	return max(mm->hiwater_vm, mm->total_vm);
@@ -612,6 +621,7 @@ struct signal_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
+	unsigned long maxrss, cmaxrss;
 	struct task_io_accounting ioac;
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 61bb1761c7b8..60d6fdcc9265 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -947,6 +947,8 @@ NORET_TYPE void do_exit(long code)
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
+		if (tsk->mm)
+			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 	}
 	acct_collect(code, group_dead);
 	if (group_dead)
@@ -1210,6 +1212,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	if (likely(!traced) && likely(!task_detached(p))) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
+		unsigned long maxrss;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1258,6 +1261,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
+		maxrss = max(sig->maxrss, sig->cmaxrss);
+		if (psig->cmaxrss < maxrss)
+			psig->cmaxrss = maxrss;
 		task_io_accounting_add(&psig->ioac, &p->ioac);
 		task_io_accounting_add(&psig->ioac, &sig->ioac);
 		spin_unlock_irq(&p->real_parent->sighand->siglock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 1020977b57ca..7cf45812ce84 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -866,6 +866,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+	sig->maxrss = sig->cmaxrss = 0;
 	task_io_accounting_init(&sig->ioac);
 	sig->sum_sched_runtime = 0;
 	taskstats_tgid_init(sig);
diff --git a/kernel/sys.c b/kernel/sys.c
index ea5c3bcac881..ebcb15611728 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	unsigned long flags;
 	cputime_t utime, stime;
 	struct task_cputime cputime;
+	unsigned long maxrss = 0;
 
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
@@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 		utime = task_utime(current);
 		stime = task_stime(current);
 		accumulate_thread_rusage(p, r);
+		maxrss = p->signal->maxrss;
 		goto out;
 	}
 
@@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_majflt = p->signal->cmaj_flt;
 			r->ru_inblock = p->signal->cinblock;
 			r->ru_oublock = p->signal->coublock;
+			maxrss = p->signal->cmaxrss;
 
 			if (who == RUSAGE_CHILDREN)
 				break;
@@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_majflt += p->signal->maj_flt;
 			r->ru_inblock += p->signal->inblock;
 			r->ru_oublock += p->signal->oublock;
+			if (maxrss < p->signal->maxrss)
+				maxrss = p->signal->maxrss;
 			t = p;
 			do {
 				accumulate_thread_rusage(t, r);
@@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 out:
 	cputime_to_timeval(utime, &r->ru_utime);
 	cputime_to_timeval(stime, &r->ru_stime);
+
+	if (who != RUSAGE_CHILDREN) {
+		struct mm_struct *mm = get_task_mm(p);
+		if (mm) {
+			setmax_mm_hiwater_rss(&maxrss, mm);
+			mmput(mm);
+		}
+	}
+	r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
 }
 
 int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
-- 
cgit v1.2.3-71-gd317


From 8ea926b22e2d13238e4d65d8f61c48fe424e6f4f Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 22 Sep 2009 16:44:29 -0700
Subject: mmc: add 'enable' and 'disable' methods to mmc host

MMC hosts that support power saving can use the 'enable' and 'disable'
methods to exit and enter power saving states.  An explanation of their
use is provided in the comments added to include/linux/mmc/host.h.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Matt Fleming <matt@console-pimps.org>
Cc: Ian Molton <ian@mnementh.co.uk>
Cc: "Roberto A. Foglietta" <roberto.foglietta@gmail.com>
Cc: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Cc: Denis Karpov <ext-denis.2.karpov@nokia.com>
Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Philip Langdale <philipl@overt.org>
Cc: "Madhusudhan" <madhu.cr@ti.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/core.c  | 177 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/mmc/core/host.c  |   1 +
 drivers/mmc/core/host.h  |   2 +
 include/linux/mmc/host.h |  47 +++++++++++++
 4 files changed, 221 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index e22d2b5576ec..fb24a096dba8 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -343,6 +343,101 @@ unsigned int mmc_align_data_size(struct mmc_card *card, unsigned int sz)
 }
 EXPORT_SYMBOL(mmc_align_data_size);
 
+/**
+ *	mmc_host_enable - enable a host.
+ *	@host: mmc host to enable
+ *
+ *	Hosts that support power saving can use the 'enable' and 'disable'
+ *	methods to exit and enter power saving states. For more information
+ *	see comments for struct mmc_host_ops.
+ */
+int mmc_host_enable(struct mmc_host *host)
+{
+	if (!(host->caps & MMC_CAP_DISABLE))
+		return 0;
+
+	if (host->en_dis_recurs)
+		return 0;
+
+	if (host->nesting_cnt++)
+		return 0;
+
+	cancel_delayed_work_sync(&host->disable);
+
+	if (host->enabled)
+		return 0;
+
+	if (host->ops->enable) {
+		int err;
+
+		host->en_dis_recurs = 1;
+		err = host->ops->enable(host);
+		host->en_dis_recurs = 0;
+
+		if (err) {
+			pr_debug("%s: enable error %d\n",
+				 mmc_hostname(host), err);
+			return err;
+		}
+	}
+	host->enabled = 1;
+	return 0;
+}
+EXPORT_SYMBOL(mmc_host_enable);
+
+static int mmc_host_do_disable(struct mmc_host *host, int lazy)
+{
+	if (host->ops->disable) {
+		int err;
+
+		host->en_dis_recurs = 1;
+		err = host->ops->disable(host, lazy);
+		host->en_dis_recurs = 0;
+
+		if (err < 0) {
+			pr_debug("%s: disable error %d\n",
+				 mmc_hostname(host), err);
+			return err;
+		}
+		if (err > 0) {
+			unsigned long delay = msecs_to_jiffies(err);
+
+			mmc_schedule_delayed_work(&host->disable, delay);
+		}
+	}
+	host->enabled = 0;
+	return 0;
+}
+
+/**
+ *	mmc_host_disable - disable a host.
+ *	@host: mmc host to disable
+ *
+ *	Hosts that support power saving can use the 'enable' and 'disable'
+ *	methods to exit and enter power saving states. For more information
+ *	see comments for struct mmc_host_ops.
+ */
+int mmc_host_disable(struct mmc_host *host)
+{
+	int err;
+
+	if (!(host->caps & MMC_CAP_DISABLE))
+		return 0;
+
+	if (host->en_dis_recurs)
+		return 0;
+
+	if (--host->nesting_cnt)
+		return 0;
+
+	if (!host->enabled)
+		return 0;
+
+	err = mmc_host_do_disable(host, 0);
+	return err;
+}
+EXPORT_SYMBOL(mmc_host_disable);
+
 /**
  *	__mmc_claim_host - exclusively claim a host
  *	@host: mmc host to claim
@@ -379,11 +474,81 @@ int __mmc_claim_host(struct mmc_host *host, atomic_t *abort)
 		wake_up(&host->wq);
 	spin_unlock_irqrestore(&host->lock, flags);
 	remove_wait_queue(&host->wq, &wait);
+	if (!stop)
+		mmc_host_enable(host);
 	return stop;
 }
 
 EXPORT_SYMBOL(__mmc_claim_host);
 
+static int mmc_try_claim_host(struct mmc_host *host)
+{
+	int claimed_host = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&host->lock, flags);
+	if (!host->claimed) {
+		host->claimed = 1;
+		claimed_host = 1;
+	}
+	spin_unlock_irqrestore(&host->lock, flags);
+	return claimed_host;
+}
+
+static void mmc_do_release_host(struct mmc_host *host)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&host->lock, flags);
+	host->claimed = 0;
+	spin_unlock_irqrestore(&host->lock, flags);
+
+	wake_up(&host->wq);
+}
+
+void mmc_host_deeper_disable(struct work_struct *work)
+{
+	struct mmc_host *host =
+		container_of(work, struct mmc_host, disable.work);
+
+	/* If the host is claimed then we do not want to disable it anymore */
+	if (!mmc_try_claim_host(host))
+		return;
+	mmc_host_do_disable(host, 1);
+	mmc_do_release_host(host);
+}
+
+/**
+ *	mmc_host_lazy_disable - lazily disable a host.
+ *	@host: mmc host to disable
+ *
+ *	Hosts that support power saving can use the 'enable' and 'disable'
+ *	methods to exit and enter power saving states. For more information
+ *	see comments for struct mmc_host_ops.
+ */
+int mmc_host_lazy_disable(struct mmc_host *host)
+{
+	if (!(host->caps & MMC_CAP_DISABLE))
+		return 0;
+
+	if (host->en_dis_recurs)
+		return 0;
+
+	if (--host->nesting_cnt)
+		return 0;
+
+	if (!host->enabled)
+		return 0;
+
+	if (host->disable_delay) {
+		mmc_schedule_delayed_work(&host->disable,
+				msecs_to_jiffies(host->disable_delay));
+		return 0;
+	} else
+		return mmc_host_do_disable(host, 1);
+}
+EXPORT_SYMBOL(mmc_host_lazy_disable);
+
 /**
  *	mmc_release_host - release a host
  *	@host: mmc host to release
@@ -393,15 +558,11 @@ EXPORT_SYMBOL(__mmc_claim_host);
  */
 void mmc_release_host(struct mmc_host *host)
 {
-	unsigned long flags;
-
 	WARN_ON(!host->claimed);
 
-	spin_lock_irqsave(&host->lock, flags);
-	host->claimed = 0;
-	spin_unlock_irqrestore(&host->lock, flags);
+	mmc_host_lazy_disable(host);
 
-	wake_up(&host->wq);
+	mmc_do_release_host(host);
 }
 
 EXPORT_SYMBOL(mmc_release_host);
@@ -953,6 +1114,8 @@ void mmc_stop_host(struct mmc_host *host)
 	spin_unlock_irqrestore(&host->lock, flags);
 #endif
 
+	if (host->caps & MMC_CAP_DISABLE)
+		cancel_delayed_work(&host->disable);
 	cancel_delayed_work(&host->detect);
 	mmc_flush_scheduled_work();
 
@@ -981,6 +1144,8 @@ void mmc_stop_host(struct mmc_host *host)
  */
 int mmc_suspend_host(struct mmc_host *host, pm_message_t state)
 {
+	if (host->caps & MMC_CAP_DISABLE)
+		cancel_delayed_work(&host->disable);
 	cancel_delayed_work(&host->detect);
 	mmc_flush_scheduled_work();
 
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 5e945e64ead7..a268d12f1af0 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -83,6 +83,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 	spin_lock_init(&host->lock);
 	init_waitqueue_head(&host->wq);
 	INIT_DELAYED_WORK(&host->detect, mmc_rescan);
+	INIT_DELAYED_WORK_DEFERRABLE(&host->disable, mmc_host_deeper_disable);
 
 	/*
 	 * By default, hosts do not support SGIO or large requests.
diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h
index c2dc3d2d9f9a..8c87e1109a34 100644
--- a/drivers/mmc/core/host.h
+++ b/drivers/mmc/core/host.h
@@ -14,5 +14,7 @@
 int mmc_register_host_class(void);
 void mmc_unregister_host_class(void);
 
+void mmc_host_deeper_disable(struct work_struct *work);
+
 #endif
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 3e7615e9087e..338a9b3d51e4 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -51,6 +51,35 @@ struct mmc_ios {
 };
 
 struct mmc_host_ops {
+	/*
+	 * Hosts that support power saving can use the 'enable' and 'disable'
+	 * methods to exit and enter power saving states. 'enable' is called
+	 * when the host is claimed and 'disable' is called (or scheduled with
+	 * a delay) when the host is released. The 'disable' is scheduled if
+	 * the disable delay set by 'mmc_set_disable_delay()' is non-zero,
+	 * otherwise 'disable' is called immediately. 'disable' may be
+	 * scheduled repeatedly, to permit ever greater power saving at the
+	 * expense of ever greater latency to re-enable. Rescheduling is
+	 * determined by the return value of the 'disable' method. A positive
+	 * value gives the delay in milliseconds.
+	 *
+	 * In the case where a host function (like set_ios) may be called
+	 * with or without the host claimed, enabling and disabling can be
+	 * done directly and will nest correctly. Call 'mmc_host_enable()' and
+	 * 'mmc_host_lazy_disable()' for this purpose, but note that these
+	 * functions must be paired.
+	 *
+	 * Alternatively, 'mmc_host_enable()' may be paired with
+	 * 'mmc_host_disable()' which calls 'disable' immediately.  In this
+	 * case the 'disable' method will be called with 'lazy' set to 0.
+	 * This is mainly useful for error paths.
+	 *
+	 * Because lazy disable may be called from a work queue, the 'disable'
+	 * method must claim the host when 'lazy' != 0, which will work
+	 * correctly because recursion is detected and handled.
+	 */
+	int (*enable)(struct mmc_host *host);
+	int (*disable)(struct mmc_host *host, int lazy);
 	void	(*request)(struct mmc_host *host, struct mmc_request *req);
 	/*
 	 * Avoid calling these three functions too often or in a "fast path",
@@ -118,6 +147,7 @@ struct mmc_host {
 #define MMC_CAP_SPI		(1 << 4)	/* Talks only SPI protocols */
 #define MMC_CAP_NEEDS_POLL	(1 << 5)	/* Needs polling for card-detection */
 #define MMC_CAP_8_BIT_DATA	(1 << 6)	/* Can the host do 8 bit transfers */
+#define MMC_CAP_DISABLE		(1 << 7)	/* Can the host be disabled */
 
 	/* host specific block data */
 	unsigned int		max_seg_size;	/* see blk_queue_max_segment_size */
@@ -142,6 +172,13 @@ struct mmc_host {
 	unsigned int		removed:1;	/* host is being removed */
 #endif
 
+	/* Only used with MMC_CAP_DISABLE */
+	int			enabled;	/* host is enabled */
+	int			nesting_cnt;	/* "enable" nesting count */
+	int			en_dis_recurs;	/* detect recursion */
+	unsigned int		disable_delay;	/* disable delay in msecs */
+	struct delayed_work	disable;	/* disabling work */
+
 	struct mmc_card		*card;		/* device attached to this host */
 
 	wait_queue_head_t	wq;
@@ -197,5 +234,15 @@ struct regulator;
 int mmc_regulator_get_ocrmask(struct regulator *supply);
 int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit);
 
+int mmc_host_enable(struct mmc_host *host);
+int mmc_host_disable(struct mmc_host *host);
+int mmc_host_lazy_disable(struct mmc_host *host);
+
+static inline void mmc_set_disable_delay(struct mmc_host *host,
+					 unsigned int disable_delay)
+{
+	host->disable_delay = disable_delay;
+}
+
 #endif
 
-- 
cgit v1.2.3-71-gd317


From 319a3f1429c91147058ac26c5f5bac8ec1730bc6 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 22 Sep 2009 16:44:30 -0700
Subject: mmc: allow host claim / release nesting

This change allows the MMC host to be claimed in situations where the host
may or may not have already been claimed.  Also 'mmc_try_claim_host()' is
now exported.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Matt Fleming <matt@console-pimps.org>
Cc: Ian Molton <ian@mnementh.co.uk>
Cc: "Roberto A. Foglietta" <roberto.foglietta@gmail.com>
Cc: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Cc: Denis Karpov <ext-denis.2.karpov@nokia.com>
Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Philip Langdale <philipl@overt.org>
Cc: "Madhusudhan" <madhu.cr@ti.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/core.c  | 34 +++++++++++++++++++++++++---------
 include/linux/mmc/core.h |  1 +
 include/linux/mmc/host.h |  2 ++
 3 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index fb24a096dba8..02f2b1871a38 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -461,16 +461,18 @@ int __mmc_claim_host(struct mmc_host *host, atomic_t *abort)
 	while (1) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		stop = abort ? atomic_read(abort) : 0;
-		if (stop || !host->claimed)
+		if (stop || !host->claimed || host->claimer == current)
 			break;
 		spin_unlock_irqrestore(&host->lock, flags);
 		schedule();
 		spin_lock_irqsave(&host->lock, flags);
 	}
 	set_current_state(TASK_RUNNING);
-	if (!stop)
+	if (!stop) {
 		host->claimed = 1;
-	else
+		host->claimer = current;
+		host->claim_cnt += 1;
+	} else
 		wake_up(&host->wq);
 	spin_unlock_irqrestore(&host->lock, flags);
 	remove_wait_queue(&host->wq, &wait);
@@ -481,29 +483,43 @@ int __mmc_claim_host(struct mmc_host *host, atomic_t *abort)
 
 EXPORT_SYMBOL(__mmc_claim_host);
 
-static int mmc_try_claim_host(struct mmc_host *host)
+/**
+ *	mmc_try_claim_host - try exclusively to claim a host
+ *	@host: mmc host to claim
+ *
+ *	Returns %1 if the host is claimed, %0 otherwise.
+ */
+int mmc_try_claim_host(struct mmc_host *host)
 {
 	int claimed_host = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&host->lock, flags);
-	if (!host->claimed) {
+	if (!host->claimed || host->claimer == current) {
 		host->claimed = 1;
+		host->claimer = current;
+		host->claim_cnt += 1;
 		claimed_host = 1;
 	}
 	spin_unlock_irqrestore(&host->lock, flags);
 	return claimed_host;
 }
+EXPORT_SYMBOL(mmc_try_claim_host);
 
 static void mmc_do_release_host(struct mmc_host *host)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&host->lock, flags);
-	host->claimed = 0;
-	spin_unlock_irqrestore(&host->lock, flags);
-
-	wake_up(&host->wq);
+	if (--host->claim_cnt) {
+		/* Release for nested claim */
+		spin_unlock_irqrestore(&host->lock, flags);
+	} else {
+		host->claimed = 0;
+		host->claimer = NULL;
+		spin_unlock_irqrestore(&host->lock, flags);
+		wake_up(&host->wq);
+	}
 }
 
 void mmc_host_deeper_disable(struct work_struct *work)
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 7ac8b500d55c..e4898e9eeb59 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -139,6 +139,7 @@ extern unsigned int mmc_align_data_size(struct mmc_card *, unsigned int);
 
 extern int __mmc_claim_host(struct mmc_host *host, atomic_t *abort);
 extern void mmc_release_host(struct mmc_host *host);
+extern int mmc_try_claim_host(struct mmc_host *host);
 
 /**
  *	mmc_claim_host - exclusively claim a host
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 338a9b3d51e4..631a2fea5264 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -182,6 +182,8 @@ struct mmc_host {
 	struct mmc_card		*card;		/* device attached to this host */
 
 	wait_queue_head_t	wq;
+	struct task_struct	*claimer;	/* task that has host claimed */
+	int			claim_cnt;	/* "claim" nesting count */
 
 	struct delayed_work	detect;
 
-- 
cgit v1.2.3-71-gd317


From 9feae246963c648b212abad0f0eb8938de5f5fe5 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 22 Sep 2009 16:44:32 -0700
Subject: mmc: add MMC_CAP_NONREMOVABLE host capability

eMMC's are not removable, so unsafe resume is OK always.

To permit this a new host capability MMC_CAP_NONREMOVABLE has been added
and suspend / resume updated accordingly.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Matt Fleming <matt@console-pimps.org>
Cc: Ian Molton <ian@mnementh.co.uk>
Cc: "Roberto A. Foglietta" <roberto.foglietta@gmail.com>
Cc: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Cc: Denis Karpov <ext-denis.2.karpov@nokia.com>
Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Philip Langdale <philipl@overt.org>
Cc: "Madhusudhan" <madhu.cr@ti.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/mmc.c   | 41 ++++++++++++++++++++++++++++++++++-------
 drivers/mmc/core/sd.c    | 41 ++++++++++++++++++++++++++++++++++-------
 include/linux/mmc/host.h |  1 +
 3 files changed, 69 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 2fb9d5f271ea..995db1853a81 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -507,8 +507,6 @@ static void mmc_detect(struct mmc_host *host)
 	}
 }
 
-#ifdef CONFIG_MMC_UNSAFE_RESUME
-
 /*
  * Suspend callback from host.
  */
@@ -551,20 +549,49 @@ static void mmc_resume(struct mmc_host *host)
 
 }
 
-#else
+#ifdef CONFIG_MMC_UNSAFE_RESUME
 
-#define mmc_suspend NULL
-#define mmc_resume NULL
+static const struct mmc_bus_ops mmc_ops = {
+	.remove = mmc_remove,
+	.detect = mmc_detect,
+	.suspend = mmc_suspend,
+	.resume = mmc_resume,
+};
 
-#endif
+static void mmc_attach_bus_ops(struct mmc_host *host)
+{
+	mmc_attach_bus(host, &mmc_ops);
+}
+
+#else
 
 static const struct mmc_bus_ops mmc_ops = {
+	.remove = mmc_remove,
+	.detect = mmc_detect,
+	.suspend = NULL,
+	.resume = NULL,
+};
+
+static const struct mmc_bus_ops mmc_ops_unsafe = {
 	.remove = mmc_remove,
 	.detect = mmc_detect,
 	.suspend = mmc_suspend,
 	.resume = mmc_resume,
 };
 
+static void mmc_attach_bus_ops(struct mmc_host *host)
+{
+	const struct mmc_bus_ops *bus_ops;
+
+	if (host->caps & MMC_CAP_NONREMOVABLE)
+		bus_ops = &mmc_ops_unsafe;
+	else
+		bus_ops = &mmc_ops;
+	mmc_attach_bus(host, bus_ops);
+}
+
+#endif
+
 /*
  * Starting point for MMC card init.
  */
@@ -575,7 +602,7 @@ int mmc_attach_mmc(struct mmc_host *host, u32 ocr)
 	BUG_ON(!host);
 	WARN_ON(!host->claimed);
 
-	mmc_attach_bus(host, &mmc_ops);
+	mmc_attach_bus_ops(host);
 
 	/*
 	 * We need to get OCR a different way for SPI.
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 7ad646fe077e..92fa9dceca79 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -561,8 +561,6 @@ static void mmc_sd_detect(struct mmc_host *host)
 	}
 }
 
-#ifdef CONFIG_MMC_UNSAFE_RESUME
-
 /*
  * Suspend callback from host.
  */
@@ -605,20 +603,49 @@ static void mmc_sd_resume(struct mmc_host *host)
 
 }
 
-#else
+#ifdef CONFIG_MMC_UNSAFE_RESUME
 
-#define mmc_sd_suspend NULL
-#define mmc_sd_resume NULL
+static const struct mmc_bus_ops mmc_sd_ops = {
+	.remove = mmc_sd_remove,
+	.detect = mmc_sd_detect,
+	.suspend = mmc_sd_suspend,
+	.resume = mmc_sd_resume,
+};
 
-#endif
+static void mmc_sd_attach_bus_ops(struct mmc_host *host)
+{
+	mmc_attach_bus(host, &mmc_sd_ops);
+}
+
+#else
 
 static const struct mmc_bus_ops mmc_sd_ops = {
+	.remove = mmc_sd_remove,
+	.detect = mmc_sd_detect,
+	.suspend = NULL,
+	.resume = NULL,
+};
+
+static const struct mmc_bus_ops mmc_sd_ops_unsafe = {
 	.remove = mmc_sd_remove,
 	.detect = mmc_sd_detect,
 	.suspend = mmc_sd_suspend,
 	.resume = mmc_sd_resume,
 };
 
+static void mmc_sd_attach_bus_ops(struct mmc_host *host)
+{
+	const struct mmc_bus_ops *bus_ops;
+
+	if (host->caps & MMC_CAP_NONREMOVABLE)
+		bus_ops = &mmc_sd_ops_unsafe;
+	else
+		bus_ops = &mmc_sd_ops;
+	mmc_attach_bus(host, bus_ops);
+}
+
+#endif
+
 /*
  * Starting point for SD card init.
  */
@@ -629,7 +656,7 @@ int mmc_attach_sd(struct mmc_host *host, u32 ocr)
 	BUG_ON(!host);
 	WARN_ON(!host->claimed);
 
-	mmc_attach_bus(host, &mmc_sd_ops);
+	mmc_sd_attach_bus_ops(host);
 
 	/*
 	 * We need to get OCR a different way for SPI.
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 631a2fea5264..bb867d2c26bd 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -148,6 +148,7 @@ struct mmc_host {
 #define MMC_CAP_NEEDS_POLL	(1 << 5)	/* Needs polling for card-detection */
 #define MMC_CAP_8_BIT_DATA	(1 << 6)	/* Can the host do 8 bit transfers */
 #define MMC_CAP_DISABLE		(1 << 7)	/* Can the host be disabled */
+#define MMC_CAP_NONREMOVABLE	(1 << 8)	/* Nonremovable e.g. eMMC */
 
 	/* host specific block data */
 	unsigned int		max_seg_size;	/* see blk_queue_max_segment_size */
-- 
cgit v1.2.3-71-gd317


From eae1aeeed852aae37621b82a9e7f6c05096a18fd Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 22 Sep 2009 16:44:33 -0700
Subject: mmc: add ability to save power by powering off cards

Power can be saved by powering off cards that are not in use.  This is
similar to suspend / resume except it is under the control of the driver,
and does not require any power management support.  It can only be used
when the driver can monitor whether the card is removed, otherwise it is
unsafe.  This is possible because, unlike suspend, the driver still
receives card detect and / or cover switch interrupts.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Matt Fleming <matt@console-pimps.org>
Cc: Ian Molton <ian@mnementh.co.uk>
Cc: "Roberto A. Foglietta" <roberto.foglietta@gmail.com>
Cc: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Cc: Denis Karpov <ext-denis.2.karpov@nokia.com>
Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Philip Langdale <philipl@overt.org>
Cc: "Madhusudhan" <madhu.cr@ti.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/core.c  | 34 ++++++++++++++++++++++++++++++++++
 drivers/mmc/core/core.h  |  2 ++
 drivers/mmc/core/mmc.c   | 11 +++++++++++
 drivers/mmc/core/sd.c    | 11 +++++++++++
 include/linux/mmc/host.h |  3 +++
 5 files changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 02f2b1871a38..be1fc013fbe9 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1151,6 +1151,40 @@ void mmc_stop_host(struct mmc_host *host)
 	mmc_power_off(host);
 }
 
+void mmc_power_save_host(struct mmc_host *host)
+{
+	mmc_bus_get(host);
+
+	if (!host->bus_ops || host->bus_dead || !host->bus_ops->power_restore) {
+		mmc_bus_put(host);
+		return;
+	}
+
+	if (host->bus_ops->power_save)
+		host->bus_ops->power_save(host);
+
+	mmc_bus_put(host);
+
+	mmc_power_off(host);
+}
+EXPORT_SYMBOL(mmc_power_save_host);
+
+void mmc_power_restore_host(struct mmc_host *host)
+{
+	mmc_bus_get(host);
+
+	if (!host->bus_ops || host->bus_dead || !host->bus_ops->power_restore) {
+		mmc_bus_put(host);
+		return;
+	}
+
+	mmc_power_up(host);
+	host->bus_ops->power_restore(host);
+
+	mmc_bus_put(host);
+}
+EXPORT_SYMBOL(mmc_power_restore_host);
+
 #ifdef CONFIG_PM
 
 /**
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index c819effa1032..f7eb4c4ca014 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -20,6 +20,8 @@ struct mmc_bus_ops {
 	void (*detect)(struct mmc_host *);
 	void (*suspend)(struct mmc_host *);
 	void (*resume)(struct mmc_host *);
+	void (*power_save)(struct mmc_host *);
+	void (*power_restore)(struct mmc_host *);
 };
 
 void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops);
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 995db1853a81..27e842df6a6f 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -549,6 +549,14 @@ static void mmc_resume(struct mmc_host *host)
 
 }
 
+static void mmc_power_restore(struct mmc_host *host)
+{
+	host->card->state &= ~MMC_STATE_HIGHSPEED;
+	mmc_claim_host(host);
+	mmc_init_card(host, host->ocr, host->card);
+	mmc_release_host(host);
+}
+
 #ifdef CONFIG_MMC_UNSAFE_RESUME
 
 static const struct mmc_bus_ops mmc_ops = {
@@ -556,6 +564,7 @@ static const struct mmc_bus_ops mmc_ops = {
 	.detect = mmc_detect,
 	.suspend = mmc_suspend,
 	.resume = mmc_resume,
+	.power_restore = mmc_power_restore,
 };
 
 static void mmc_attach_bus_ops(struct mmc_host *host)
@@ -570,6 +579,7 @@ static const struct mmc_bus_ops mmc_ops = {
 	.detect = mmc_detect,
 	.suspend = NULL,
 	.resume = NULL,
+	.power_restore = mmc_power_restore,
 };
 
 static const struct mmc_bus_ops mmc_ops_unsafe = {
@@ -577,6 +587,7 @@ static const struct mmc_bus_ops mmc_ops_unsafe = {
 	.detect = mmc_detect,
 	.suspend = mmc_suspend,
 	.resume = mmc_resume,
+	.power_restore = mmc_power_restore,
 };
 
 static void mmc_attach_bus_ops(struct mmc_host *host)
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 92fa9dceca79..222a60928cdb 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -603,6 +603,14 @@ static void mmc_sd_resume(struct mmc_host *host)
 
 }
 
+static void mmc_sd_power_restore(struct mmc_host *host)
+{
+	host->card->state &= ~MMC_STATE_HIGHSPEED;
+	mmc_claim_host(host);
+	mmc_sd_init_card(host, host->ocr, host->card);
+	mmc_release_host(host);
+}
+
 #ifdef CONFIG_MMC_UNSAFE_RESUME
 
 static const struct mmc_bus_ops mmc_sd_ops = {
@@ -610,6 +618,7 @@ static const struct mmc_bus_ops mmc_sd_ops = {
 	.detect = mmc_sd_detect,
 	.suspend = mmc_sd_suspend,
 	.resume = mmc_sd_resume,
+	.power_restore = mmc_sd_power_restore,
 };
 
 static void mmc_sd_attach_bus_ops(struct mmc_host *host)
@@ -624,6 +633,7 @@ static const struct mmc_bus_ops mmc_sd_ops = {
 	.detect = mmc_sd_detect,
 	.suspend = NULL,
 	.resume = NULL,
+	.power_restore = mmc_sd_power_restore,
 };
 
 static const struct mmc_bus_ops mmc_sd_ops_unsafe = {
@@ -631,6 +641,7 @@ static const struct mmc_bus_ops mmc_sd_ops_unsafe = {
 	.detect = mmc_sd_detect,
 	.suspend = mmc_sd_suspend,
 	.resume = mmc_sd_resume,
+	.power_restore = mmc_sd_power_restore,
 };
 
 static void mmc_sd_attach_bus_ops(struct mmc_host *host)
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index bb867d2c26bd..c1cbe598d470 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -223,6 +223,9 @@ static inline void *mmc_priv(struct mmc_host *host)
 extern int mmc_suspend_host(struct mmc_host *, pm_message_t);
 extern int mmc_resume_host(struct mmc_host *);
 
+extern void mmc_power_save_host(struct mmc_host *host);
+extern void mmc_power_restore_host(struct mmc_host *host);
+
 extern void mmc_detect_change(struct mmc_host *, unsigned long delay);
 extern void mmc_request_done(struct mmc_host *, struct mmc_request *);
 
-- 
cgit v1.2.3-71-gd317


From b1ebe38456f7fe61a88af2844361e763ac6ea5ae Mon Sep 17 00:00:00 2001
From: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Date: Tue, 22 Sep 2009 16:44:34 -0700
Subject: mmc: add mmc card sleep and awake support

Add support for the new MMC command SLEEP_AWAKE.

Signed-off-by: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Matt Fleming <matt@console-pimps.org>
Cc: Ian Molton <ian@mnementh.co.uk>
Cc: "Roberto A. Foglietta" <roberto.foglietta@gmail.com>
Cc: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Cc: Denis Karpov <ext-denis.2.karpov@nokia.com>
Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Philip Langdale <philipl@overt.org>
Cc: "Madhusudhan" <madhu.cr@ti.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/core.c    | 40 ++++++++++++++++++++++++++++++++++
 drivers/mmc/core/core.h    |  2 ++
 drivers/mmc/core/mmc.c     | 54 +++++++++++++++++++++++++++++++++++++++++-----
 drivers/mmc/core/mmc_ops.c | 36 +++++++++++++++++++++++++++++++
 drivers/mmc/core/mmc_ops.h |  1 +
 include/linux/mmc/card.h   |  2 ++
 include/linux/mmc/host.h   |  5 +++++
 include/linux/mmc/mmc.h    |  2 ++
 8 files changed, 137 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index be1fc013fbe9..828e60ea528c 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1185,6 +1185,46 @@ void mmc_power_restore_host(struct mmc_host *host)
 }
 EXPORT_SYMBOL(mmc_power_restore_host);
 
+int mmc_card_awake(struct mmc_host *host)
+{
+	int err = -ENOSYS;
+
+	mmc_bus_get(host);
+
+	if (host->bus_ops && !host->bus_dead && host->bus_ops->awake)
+		err = host->bus_ops->awake(host);
+
+	mmc_bus_put(host);
+
+	return err;
+}
+EXPORT_SYMBOL(mmc_card_awake);
+
+int mmc_card_sleep(struct mmc_host *host)
+{
+	int err = -ENOSYS;
+
+	mmc_bus_get(host);
+
+	if (host->bus_ops && !host->bus_dead && host->bus_ops->awake)
+		err = host->bus_ops->sleep(host);
+
+	mmc_bus_put(host);
+
+	return err;
+}
+EXPORT_SYMBOL(mmc_card_sleep);
+
+int mmc_card_can_sleep(struct mmc_host *host)
+{
+	struct mmc_card *card = host->card;
+
+	if (card && mmc_card_mmc(card) && card->ext_csd.rev >= 3)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(mmc_card_can_sleep);
+
 #ifdef CONFIG_PM
 
 /**
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index f7eb4c4ca014..c386348f5f73 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -16,6 +16,8 @@
 #define MMC_CMD_RETRIES        3
 
 struct mmc_bus_ops {
+	int (*awake)(struct mmc_host *);
+	int (*sleep)(struct mmc_host *);
 	void (*remove)(struct mmc_host *);
 	void (*detect)(struct mmc_host *);
 	void (*suspend)(struct mmc_host *);
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 27e842df6a6f..e0bfa9515c8a 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -160,7 +160,6 @@ static int mmc_read_ext_csd(struct mmc_card *card)
 {
 	int err;
 	u8 *ext_csd;
-	unsigned int ext_csd_struct;
 
 	BUG_ON(!card);
 
@@ -207,16 +206,16 @@ static int mmc_read_ext_csd(struct mmc_card *card)
 		goto out;
 	}
 
-	ext_csd_struct = ext_csd[EXT_CSD_REV];
-	if (ext_csd_struct > 3) {
+	card->ext_csd.rev = ext_csd[EXT_CSD_REV];
+	if (card->ext_csd.rev > 3) {
 		printk(KERN_ERR "%s: unrecognised EXT_CSD structure "
 			"version %d\n", mmc_hostname(card->host),
-			ext_csd_struct);
+			card->ext_csd.rev);
 		err = -EINVAL;
 		goto out;
 	}
 
-	if (ext_csd_struct >= 2) {
+	if (card->ext_csd.rev >= 2) {
 		card->ext_csd.sectors =
 			ext_csd[EXT_CSD_SEC_CNT + 0] << 0 |
 			ext_csd[EXT_CSD_SEC_CNT + 1] << 8 |
@@ -241,6 +240,15 @@ static int mmc_read_ext_csd(struct mmc_card *card)
 		goto out;
 	}
 
+	if (card->ext_csd.rev >= 3) {
+		u8 sa_shift = ext_csd[EXT_CSD_S_A_TIMEOUT];
+
+		/* Sleep / awake timeout in 100ns units */
+		if (sa_shift > 0 && sa_shift <= 0x17)
+			card->ext_csd.sa_timeout =
+					1 << ext_csd[EXT_CSD_S_A_TIMEOUT];
+	}
+
 out:
 	kfree(ext_csd);
 
@@ -557,9 +565,41 @@ static void mmc_power_restore(struct mmc_host *host)
 	mmc_release_host(host);
 }
 
+static int mmc_sleep(struct mmc_host *host)
+{
+	struct mmc_card *card = host->card;
+	int err = -ENOSYS;
+
+	if (card && card->ext_csd.rev >= 3) {
+		err = mmc_card_sleepawake(host, 1);
+		if (err < 0)
+			pr_debug("%s: Error %d while putting card into sleep",
+				 mmc_hostname(host), err);
+	}
+
+	return err;
+}
+
+static int mmc_awake(struct mmc_host *host)
+{
+	struct mmc_card *card = host->card;
+	int err = -ENOSYS;
+
+	if (card && card->ext_csd.rev >= 3) {
+		err = mmc_card_sleepawake(host, 0);
+		if (err < 0)
+			pr_debug("%s: Error %d while awaking sleeping card",
+				 mmc_hostname(host), err);
+	}
+
+	return err;
+}
+
 #ifdef CONFIG_MMC_UNSAFE_RESUME
 
 static const struct mmc_bus_ops mmc_ops = {
+	.awake = mmc_awake,
+	.sleep = mmc_sleep,
 	.remove = mmc_remove,
 	.detect = mmc_detect,
 	.suspend = mmc_suspend,
@@ -575,6 +615,8 @@ static void mmc_attach_bus_ops(struct mmc_host *host)
 #else
 
 static const struct mmc_bus_ops mmc_ops = {
+	.awake = mmc_awake,
+	.sleep = mmc_sleep,
 	.remove = mmc_remove,
 	.detect = mmc_detect,
 	.suspend = NULL,
@@ -583,6 +625,8 @@ static const struct mmc_bus_ops mmc_ops = {
 };
 
 static const struct mmc_bus_ops mmc_ops_unsafe = {
+	.awake = mmc_awake,
+	.sleep = mmc_sleep,
 	.remove = mmc_remove,
 	.detect = mmc_detect,
 	.suspend = mmc_suspend,
diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c
index 34ce2703d29a..355c6042cf65 100644
--- a/drivers/mmc/core/mmc_ops.c
+++ b/drivers/mmc/core/mmc_ops.c
@@ -57,6 +57,42 @@ int mmc_deselect_cards(struct mmc_host *host)
 	return _mmc_select_card(host, NULL);
 }
 
+int mmc_card_sleepawake(struct mmc_host *host, int sleep)
+{
+	struct mmc_command cmd;
+	struct mmc_card *card = host->card;
+	int err;
+
+	if (sleep)
+		mmc_deselect_cards(host);
+
+	memset(&cmd, 0, sizeof(struct mmc_command));
+
+	cmd.opcode = MMC_SLEEP_AWAKE;
+	cmd.arg = card->rca << 16;
+	if (sleep)
+		cmd.arg |= 1 << 15;
+
+	cmd.flags = MMC_RSP_R1B | MMC_CMD_AC;
+	err = mmc_wait_for_cmd(host, &cmd, 0);
+	if (err)
+		return err;
+
+	/*
+	 * If the host does not wait while the card signals busy, then we will
+	 * will have to wait the sleep/awake timeout.  Note, we cannot use the
+	 * SEND_STATUS command to poll the status because that command (and most
+	 * others) is invalid while the card sleeps.
+	 */
+	if (!(host->caps & MMC_CAP_WAIT_WHILE_BUSY))
+		mmc_delay(DIV_ROUND_UP(card->ext_csd.sa_timeout, 10000));
+
+	if (!sleep)
+		err = mmc_select_card(card);
+
+	return err;
+}
+
 int mmc_go_idle(struct mmc_host *host)
 {
 	int err;
diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h
index 17854bf7cf0d..653eb8e84178 100644
--- a/drivers/mmc/core/mmc_ops.h
+++ b/drivers/mmc/core/mmc_ops.h
@@ -25,6 +25,7 @@ int mmc_send_status(struct mmc_card *card, u32 *status);
 int mmc_send_cid(struct mmc_host *host, u32 *cid);
 int mmc_spi_read_ocr(struct mmc_host *host, int highcap, u32 *ocrp);
 int mmc_spi_set_crc(struct mmc_host *host, int use_crc);
+int mmc_card_sleepawake(struct mmc_host *host, int sleep);
 
 #endif
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 403aa505f27e..58f59174c64b 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -40,6 +40,8 @@ struct mmc_csd {
 };
 
 struct mmc_ext_csd {
+	u8			rev;
+	unsigned int		sa_timeout;		/* Units: 100ns */
 	unsigned int		hs_max_dtr;
 	unsigned int		sectors;
 };
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index c1cbe598d470..81bb42358595 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -149,6 +149,7 @@ struct mmc_host {
 #define MMC_CAP_8_BIT_DATA	(1 << 6)	/* Can the host do 8 bit transfers */
 #define MMC_CAP_DISABLE		(1 << 7)	/* Can the host be disabled */
 #define MMC_CAP_NONREMOVABLE	(1 << 8)	/* Nonremovable e.g. eMMC */
+#define MMC_CAP_WAIT_WHILE_BUSY	(1 << 9)	/* Waits while card is busy */
 
 	/* host specific block data */
 	unsigned int		max_seg_size;	/* see blk_queue_max_segment_size */
@@ -240,6 +241,10 @@ struct regulator;
 int mmc_regulator_get_ocrmask(struct regulator *supply);
 int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit);
 
+int mmc_card_awake(struct mmc_host *host);
+int mmc_card_sleep(struct mmc_host *host);
+int mmc_card_can_sleep(struct mmc_host *host);
+
 int mmc_host_enable(struct mmc_host *host);
 int mmc_host_disable(struct mmc_host *host);
 int mmc_host_lazy_disable(struct mmc_host *host);
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 14b81f3e5232..b2b40951c16c 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -31,6 +31,7 @@
 #define MMC_ALL_SEND_CID          2   /* bcr                     R2  */
 #define MMC_SET_RELATIVE_ADDR     3   /* ac   [31:16] RCA        R1  */
 #define MMC_SET_DSR               4   /* bc   [31:16] RCA            */
+#define MMC_SLEEP_AWAKE		  5   /* ac   [31:16] RCA 15:flg R1b */
 #define MMC_SWITCH                6   /* ac   [31:0] See below   R1b */
 #define MMC_SELECT_CARD           7   /* ac   [31:16] RCA        R1  */
 #define MMC_SEND_EXT_CSD          8   /* adtc                    R1  */
@@ -254,6 +255,7 @@ struct _mmc_csd {
 #define EXT_CSD_CARD_TYPE	196	/* RO */
 #define EXT_CSD_REV		192	/* RO */
 #define EXT_CSD_SEC_CNT		212	/* RO, 4 bytes */
+#define EXT_CSD_S_A_TIMEOUT	217
 
 /*
  * EXT_CSD field definitions
-- 
cgit v1.2.3-71-gd317


From ef0b27d4ccacac32afc3d1c0e8a95e4091dfbc8c Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 22 Sep 2009 16:44:37 -0700
Subject: mmc: check status after MMC SWITCH command

According to the standard, the SWITCH command should be followed by a
SEND_STATUS command to check for errors.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Matt Fleming <matt@console-pimps.org>
Cc: Ian Molton <ian@mnementh.co.uk>
Cc: "Roberto A. Foglietta" <roberto.foglietta@gmail.com>
Cc: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Cc: Denis Karpov <ext-denis.2.karpov@nokia.com>
Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Philip Langdale <philipl@overt.org>
Cc: "Madhusudhan" <madhu.cr@ti.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/mmc.c     | 24 ++++++++++++++++++------
 drivers/mmc/core/mmc_ops.c | 23 +++++++++++++++++++++++
 include/linux/mmc/mmc.h    |  1 +
 3 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index e0bfa9515c8a..a6b5fe9d3969 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -416,12 +416,17 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 		(host->caps & MMC_CAP_MMC_HIGHSPEED)) {
 		err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
 			EXT_CSD_HS_TIMING, 1);
-		if (err)
+		if (err && err != -EBADMSG)
 			goto free_card;
 
-		mmc_card_set_highspeed(card);
-
-		mmc_set_timing(card->host, MMC_TIMING_MMC_HS);
+		if (err) {
+			printk(KERN_WARNING "%s: switch to highspeed failed\n",
+			       mmc_hostname(card->host));
+			err = 0;
+		} else {
+			mmc_card_set_highspeed(card);
+			mmc_set_timing(card->host, MMC_TIMING_MMC_HS);
+		}
 	}
 
 	/*
@@ -456,10 +461,17 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 		err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
 				 EXT_CSD_BUS_WIDTH, ext_csd_bit);
 
-		if (err)
+		if (err && err != -EBADMSG)
 			goto free_card;
 
-		mmc_set_bus_width(card->host, bus_width);
+		if (err) {
+			printk(KERN_WARNING "%s: switch to bus width %d "
+			       "failed\n", mmc_hostname(card->host),
+			       1 << bus_width);
+			err = 0;
+		} else {
+			mmc_set_bus_width(card->host, bus_width);
+		}
 	}
 
 	if (!oldcard)
diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c
index 355c6042cf65..d2cb5c634392 100644
--- a/drivers/mmc/core/mmc_ops.c
+++ b/drivers/mmc/core/mmc_ops.c
@@ -390,6 +390,7 @@ int mmc_switch(struct mmc_card *card, u8 set, u8 index, u8 value)
 {
 	int err;
 	struct mmc_command cmd;
+	u32 status;
 
 	BUG_ON(!card);
 	BUG_ON(!card->host);
@@ -407,6 +408,28 @@ int mmc_switch(struct mmc_card *card, u8 set, u8 index, u8 value)
 	if (err)
 		return err;
 
+	/* Must check status to be sure of no errors */
+	do {
+		err = mmc_send_status(card, &status);
+		if (err)
+			return err;
+		if (card->host->caps & MMC_CAP_WAIT_WHILE_BUSY)
+			break;
+		if (mmc_host_is_spi(card->host))
+			break;
+	} while (R1_CURRENT_STATE(status) == 7);
+
+	if (mmc_host_is_spi(card->host)) {
+		if (status & R1_SPI_ILLEGAL_COMMAND)
+			return -EBADMSG;
+	} else {
+		if (status & 0xFDFFA000)
+			printk(KERN_WARNING "%s: unexpected status %#x after "
+			       "switch", mmc_hostname(card->host), status);
+		if (status & R1_SWITCH_ERROR)
+			return -EBADMSG;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index b2b40951c16c..c02c8db73701 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -128,6 +128,7 @@
 #define R1_STATUS(x)            (x & 0xFFFFE000)
 #define R1_CURRENT_STATE(x)	((x & 0x00001E00) >> 9)	/* sx, b (4 bits) */
 #define R1_READY_FOR_DATA	(1 << 8)	/* sx, a */
+#define R1_SWITCH_ERROR		(1 << 7)	/* sx, c */
 #define R1_APP_CMD		(1 << 5)	/* sr, c */
 
 /*
-- 
cgit v1.2.3-71-gd317


From 006ebd5de13854d6250eecc76866bbfad1ff7daf Mon Sep 17 00:00:00 2001
From: Ohad Ben-Cohen <ohad@wizery.com>
Date: Tue, 22 Sep 2009 16:45:07 -0700
Subject: sdio: add CD disable support

Add support to disconnect the pull-up resistor on CD/DAT[3] (pin 1)
of the card. This may be desired on certain setups of boards,
controllers and embedded sdio devices which do not need the card's
pull-up. As a result, card detection is disabled and power is saved.

[akpm@linux-foundation.org: simplify sdio_disable_cd() a bit]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
Acked-by: Matt Fleming <matt@console-pimps.org>
Cc: Ian Molton <ian@mnementh.co.uk>
Cc: "Roberto A. Foglietta" <roberto.foglietta@gmail.com>
Cc: Philip Langdale <philipl@overt.org>
Cc: Pierre Ossman <pierre@ossman.eu>
Cc: David Vrabel <david.vrabel@csr.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/sdio.c  | 30 ++++++++++++++++++++++++++++++
 include/linux/mmc/card.h |  3 ++-
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 6f221dc029ad..2d24a123f0b0 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -164,6 +164,29 @@ static int sdio_enable_wide(struct mmc_card *card)
 	return 0;
 }
 
+/*
+ * If desired, disconnect the pull-up resistor on CD/DAT[3] (pin 1)
+ * of the card. This may be required on certain setups of boards,
+ * controllers and embedded sdio device which do not need the card's
+ * pull-up. As a result, card detection is disabled and power is saved.
+ */
+static int sdio_disable_cd(struct mmc_card *card)
+{
+	int ret;
+	u8 ctrl;
+
+	if (!card->cccr.disable_cd)
+		return 0;
+
+	ret = mmc_io_rw_direct(card, 0, 0, SDIO_CCCR_IF, 0, &ctrl);
+	if (ret)
+		return ret;
+
+	ctrl |= SDIO_BUS_CD_DISABLE;
+
+	return mmc_io_rw_direct(card, 1, 0, SDIO_CCCR_IF, ctrl, NULL);
+}
+
 /*
  * Test if the card supports high-speed mode and, if so, switch to it.
  */
@@ -384,6 +407,13 @@ int mmc_attach_sdio(struct mmc_host *host, u32 ocr)
 	if (err)
 		goto remove;
 
+	/*
+	 * If needed, disconnect card detection pull-up resistor.
+	 */
+	err = sdio_disable_cd(card);
+	if (err)
+		goto remove;
+
 	/*
 	 * Initialize (but don't add) all present functions.
 	 */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 58f59174c64b..00db39cceadc 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -64,7 +64,8 @@ struct sdio_cccr {
 				low_speed:1,
 				wide_bus:1,
 				high_power:1,
-				high_speed:1;
+				high_speed:1,
+				disable_cd:1;
 };
 
 struct sdio_cis {
-- 
cgit v1.2.3-71-gd317


From 7c979ec7135d96bbff34790bf4b85a8508ede7fc Mon Sep 17 00:00:00 2001
From: Ohad Ben-Cohen <ohad@wizery.com>
Date: Tue, 22 Sep 2009 16:45:18 -0700
Subject: sdio: add MMC_QUIRK_LENIENT_FN0

Normally writes to SDIO function 0 outside the vendor specific CCCR
registers are prohibited.

To support embedded devices that require writes to SDIO function 0 outside
this range (e.g.  TI WL127x embedded sdio wifi device),
MMC_QUIRK_LENIENT_FN0 is introduced.

A card quirks field is added to `struct mmc_card' to support non-standard
devices (e.g.  embedded sdio devices).

[akpm@linux-foundation.org: code in C, not cpp!]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/sdio_io.c | 2 +-
 include/linux/mmc/card.h   | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sdio_io.c b/drivers/mmc/core/sdio_io.c
index f61fc2d4cd0a..f9aa8a7deffa 100644
--- a/drivers/mmc/core/sdio_io.c
+++ b/drivers/mmc/core/sdio_io.c
@@ -624,7 +624,7 @@ void sdio_f0_writeb(struct sdio_func *func, unsigned char b, unsigned int addr,
 
 	BUG_ON(!func);
 
-	if (addr < 0xF0 || addr > 0xFF) {
+	if ((addr < 0xF0 || addr > 0xFF) && (!mmc_card_lenient_fn0(func->card))) {
 		if (err_ret)
 			*err_ret = -EINVAL;
 		return;
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 00db39cceadc..2ee22e8af110 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -97,6 +97,8 @@ struct mmc_card {
 #define MMC_STATE_READONLY	(1<<1)		/* card is read-only */
 #define MMC_STATE_HIGHSPEED	(1<<2)		/* card is in high speed mode */
 #define MMC_STATE_BLOCKADDR	(1<<3)		/* card uses block-addressing */
+	unsigned int		quirks; 	/* card quirks */
+#define MMC_QUIRK_LENIENT_FN0	(1<<0)		/* allow SDIO FN0 writes outside of the VS CCCR range */
 
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
@@ -132,6 +134,11 @@ struct mmc_card {
 #define mmc_card_set_highspeed(c) ((c)->state |= MMC_STATE_HIGHSPEED)
 #define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR)
 
+static inline int mmc_card_lenient_fn0(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_LENIENT_FN0;
+}
+
 #define mmc_card_name(c)	((c)->cid.prod_name)
 #define mmc_card_id(c)		(dev_name(&(c)->dev))
 
-- 
cgit v1.2.3-71-gd317


From 996ad5686c5f868e67557cc1bfcb2cfdde1a18b4 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@fluxnic.net>
Date: Tue, 22 Sep 2009 16:45:30 -0700
Subject: mmc: make SDIO device/driver struct accessors public

Especially with the PM framework, those are quite handy to have in driver
code too.

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/sdio_bus.c   | 3 ---
 include/linux/mmc/sdio_func.h | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 46284b527397..d37464e296a5 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -20,9 +20,6 @@
 #include "sdio_cis.h"
 #include "sdio_bus.h"
 
-#define dev_to_sdio_func(d)	container_of(d, struct sdio_func, dev)
-#define to_sdio_driver(d)      container_of(d, struct sdio_driver, drv)
-
 /* show configuration fields */
 #define sdio_config_attr(field, format_string)				\
 static ssize_t								\
diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 451bdfc85830..ac3ab683fec6 100644
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -67,6 +67,7 @@ struct sdio_func {
 
 #define sdio_get_drvdata(f)	dev_get_drvdata(&(f)->dev)
 #define sdio_set_drvdata(f,d)	dev_set_drvdata(&(f)->dev, d)
+#define dev_to_sdio_func(d)	container_of(d, struct sdio_func, dev)
 
 /*
  * SDIO function device driver
@@ -81,6 +82,8 @@ struct sdio_driver {
 	struct device_driver drv;
 };
 
+#define to_sdio_driver(d)	container_of(d, struct sdio_driver, drv)
+
 /**
  * SDIO_DEVICE - macro used to describe a specific SDIO device
  * @vend: the 16 bit manufacturer code
-- 
cgit v1.2.3-71-gd317


From d899bf7b55f503ba7d3d07ed27c3a37e270fa7db Mon Sep 17 00:00:00 2001
From: Stefani Seibold <stefani@seibold.net>
Date: Tue, 22 Sep 2009 16:45:40 -0700
Subject: procfs: provide stack information for threads

A patch to give a better overview of the userland application stack usage,
especially for embedded linux.

Currently you are only able to dump the main process/thread stack usage
which is showed in /proc/pid/status by the "VmStk" Value.  But you get no
information about the consumed stack memory of the the threads.

There is an enhancement in the /proc/<pid>/{task/*,}/*maps and which marks
the vm mapping where the thread stack pointer reside with "[thread stack
xxxxxxxx]".  xxxxxxxx is the maximum size of stack.  This is a value
information, because libpthread doesn't set the start of the stack to the
top of the mapped area, depending of the pthread usage.

A sample output of /proc/<pid>/task/<tid>/maps looks like:

08048000-08049000 r-xp 00000000 03:00 8312       /opt/z
08049000-0804a000 rw-p 00001000 03:00 8312       /opt/z
0804a000-0806b000 rw-p 00000000 00:00 0          [heap]
a7d12000-a7d13000 ---p 00000000 00:00 0
a7d13000-a7f13000 rw-p 00000000 00:00 0          [thread stack: 001ff4b4]
a7f13000-a7f14000 ---p 00000000 00:00 0
a7f14000-a7f36000 rw-p 00000000 00:00 0
a7f36000-a8069000 r-xp 00000000 03:00 4222       /lib/libc.so.6
a8069000-a806b000 r--p 00133000 03:00 4222       /lib/libc.so.6
a806b000-a806c000 rw-p 00135000 03:00 4222       /lib/libc.so.6
a806c000-a806f000 rw-p 00000000 00:00 0
a806f000-a8083000 r-xp 00000000 03:00 14462      /lib/libpthread.so.0
a8083000-a8084000 r--p 00013000 03:00 14462      /lib/libpthread.so.0
a8084000-a8085000 rw-p 00014000 03:00 14462      /lib/libpthread.so.0
a8085000-a8088000 rw-p 00000000 00:00 0
a8088000-a80a4000 r-xp 00000000 03:00 8317       /lib/ld-linux.so.2
a80a4000-a80a5000 r--p 0001b000 03:00 8317       /lib/ld-linux.so.2
a80a5000-a80a6000 rw-p 0001c000 03:00 8317       /lib/ld-linux.so.2
afaf5000-afb0a000 rw-p 00000000 00:00 0          [stack]
ffffe000-fffff000 r-xp 00000000 00:00 0          [vdso]

Also there is a new entry "stack usage" in /proc/<pid>/{task/*,}/status
which will you give the current stack usage in kb.

A sample output of /proc/self/status looks like:

Name:	cat
State:	R (running)
Tgid:	507
Pid:	507
.
.
.
CapBnd:	fffffffffffffeff
voluntary_ctxt_switches:	0
nonvoluntary_ctxt_switches:	0
Stack usage:	12 kB

I also fixed stack base address in /proc/<pid>/{task/*,}/stat to the base
address of the associated thread stack and not the one of the main
process.  This makes more sense.

[akpm@linux-foundation.org: fs/proc/array.c now needs walk_page_range()]
Signed-off-by: Stefani Seibold <stefani@seibold.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  5 ++-
 fs/exec.c                          |  2 +
 fs/proc/array.c                    | 85 +++++++++++++++++++++++++++++++++++++-
 fs/proc/task_mmu.c                 | 19 +++++++++
 include/linux/sched.h              |  1 +
 kernel/fork.c                      |  2 +
 mm/Makefile                        |  4 +-
 7 files changed, 114 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 75988ba26a51..b5aee7838a00 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -176,6 +176,7 @@ read the file /proc/PID/status:
   CapBnd: ffffffffffffffff
   voluntary_ctxt_switches:        0
   nonvoluntary_ctxt_switches:     1
+  Stack usage:    12 kB
 
 This shows you nearly the same information you would get if you viewed it with
 the ps  command.  In  fact,  ps  uses  the  proc  file  system  to  obtain its
@@ -229,6 +230,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
  Mems_allowed_list           Same as previous, but in "list format"
  voluntary_ctxt_switches     number of voluntary context switches
  nonvoluntary_ctxt_switches  number of non voluntary context switches
+ Stack usage:                stack usage high water mark (round up to page size)
 ..............................................................................
 
 Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
@@ -307,7 +309,7 @@ address           perms offset  dev   inode      pathname
 08049000-0804a000 rw-p 00001000 03:00 8312       /opt/test
 0804a000-0806b000 rw-p 00000000 00:00 0          [heap]
 a7cb1000-a7cb2000 ---p 00000000 00:00 0
-a7cb2000-a7eb2000 rw-p 00000000 00:00 0
+a7cb2000-a7eb2000 rw-p 00000000 00:00 0          [threadstack:001ff4b4]
 a7eb2000-a7eb3000 ---p 00000000 00:00 0
 a7eb3000-a7ed5000 rw-p 00000000 00:00 0
 a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
@@ -343,6 +345,7 @@ is not associated with a file:
  [stack]                  = the stack of the main process
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [threadstack:xxxxxxxx]   = the stack of the thread, xxxxxxxx is the stack size
 
  or if empty, the mapping is anonymous.
 
diff --git a/fs/exec.c b/fs/exec.c
index 69bb9d899791..5c833c18d0d4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1357,6 +1357,8 @@ int do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
+	current->stack_start = current->mm->start_stack;
+
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650bbbb8..0c6bc602e6c4 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -82,6 +82,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
+#include <linux/swapops.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -321,6 +322,87 @@ static inline void task_context_switch_counts(struct seq_file *m,
 			p->nivcsw);
 }
 
+struct stack_stats {
+	struct vm_area_struct *vma;
+	unsigned long	startpage;
+	unsigned long	usage;
+};
+
+static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct stack_stats *ss = walk->private;
+	struct vm_area_struct *vma = ss->vma;
+	pte_t *pte, ptent;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+#ifdef CONFIG_STACK_GROWSUP
+		if (pte_present(ptent) || is_swap_pte(ptent))
+			ss->usage = addr - ss->startpage + PAGE_SIZE;
+#else
+		if (pte_present(ptent) || is_swap_pte(ptent)) {
+			ss->usage = ss->startpage - addr + PAGE_SIZE;
+			pte++;
+			ret = 1;
+			break;
+		}
+#endif
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return ret;
+}
+
+static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
+				struct task_struct *task)
+{
+	struct stack_stats ss;
+	struct mm_walk stack_walk = {
+		.pmd_entry = stack_usage_pte_range,
+		.mm = vma->vm_mm,
+		.private = &ss,
+	};
+
+	if (!vma->vm_mm || is_vm_hugetlb_page(vma))
+		return 0;
+
+	ss.vma = vma;
+	ss.startpage = task->stack_start & PAGE_MASK;
+	ss.usage = 0;
+
+#ifdef CONFIG_STACK_GROWSUP
+	walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
+		&stack_walk);
+#else
+	walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
+		&stack_walk);
+#endif
+	return ss.usage;
+}
+
+static inline void task_show_stack_usage(struct seq_file *m,
+						struct task_struct *task)
+{
+	struct vm_area_struct	*vma;
+	struct mm_struct	*mm = get_task_mm(task);
+
+	if (mm) {
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, task->stack_start);
+		if (vma)
+			seq_printf(m, "Stack usage:\t%lu kB\n",
+				get_stack_usage_in_bytes(vma, task) >> 10);
+
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -340,6 +422,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_show_regs(m, task);
 #endif
 	task_context_switch_counts(m, task);
+	task_show_stack_usage(m, task);
 	return 0;
 }
 
@@ -481,7 +564,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
-		(permitted && mm) ? mm->start_stack : 0,
+		(permitted) ? task->stack_start : 0,
 		esp,
 		eip,
 		/* The signal information here is obsolete.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 366b1017a4f1..2a1bef9203c6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -243,6 +243,25 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 				} else if (vma->vm_start <= mm->start_stack &&
 					   vma->vm_end >= mm->start_stack) {
 					name = "[stack]";
+				} else {
+					unsigned long stack_start;
+					struct proc_maps_private *pmp;
+
+					pmp = m->private;
+					stack_start = pmp->task->stack_start;
+
+					if (vma->vm_start <= stack_start &&
+					    vma->vm_end >= stack_start) {
+						pad_len_spaces(m, len);
+						seq_printf(m,
+						 "[threadstack:%08lx]",
+#ifdef CONFIG_STACK_GROWSUP
+						 vma->vm_end - stack_start
+#else
+						 stack_start - vma->vm_start
+#endif
+						);
+					}
 				}
 			} else {
 				name = "[vdso]";
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6448bbc6406b..3cbc6c0be666 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1529,6 +1529,7 @@ struct task_struct {
 	/* bitmask of trace recursion */
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
+	unsigned long stack_start;
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7cf45812ce84..8f45b0ebdda7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,6 +1095,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->bts = NULL;
 
+	p->stack_start = stack_start;
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
diff --git a/mm/Makefile b/mm/Makefile
index 728a9fde49d1..88193d73cd1a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,10 +11,10 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o mm_init.o mmu_context.o $(mmu-y)
+			   page_isolation.o mm_init.o mmu_context.o \
+			   pagewalk.o $(mmu-y)
 obj-y += init-mm.o
 
-obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
-- 
cgit v1.2.3-71-gd317


From 2ef43ec772551e975a6ea7cf22b59c84955aadf9 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 22 Sep 2009 16:45:41 -0700
Subject: kcore: use usual list for kclist

This patchset is for /proc/kcore.  With this,

 - many per-arch hooks are removed.

 - /proc/kcore will know really valid physical memory area.

 - /proc/kcore will be aware of memory hotplug.

 - /proc/kcore will be architecture independent i.e.
   if an arch supports CONFIG_MMU, it can use /proc/kcore.
   (if the arch uses usual memory layout.)

This patch:

/proc/kcore uses its own list handling codes. It's better to use
generic list codes.

No changes in logic. just clean up.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c         | 12 ++++++------
 include/linux/proc_fs.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0cf8a24cf6c3..f9327e51ce99 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
+#include <linux/list.h>
 
 #define CORE_STR "CORE"
 
@@ -57,7 +58,7 @@ struct memelfnote
 	void *data;
 };
 
-static struct kcore_list *kclist;
+static LIST_HEAD(kclist_head);
 static DEFINE_RWLOCK(kclist_lock);
 
 void
@@ -67,8 +68,7 @@ kclist_add(struct kcore_list *new, void *addr, size_t size)
 	new->size = size;
 
 	write_lock(&kclist_lock);
-	new->next = kclist;
-	kclist = new;
+	list_add_tail(&new->list, &kclist_head);
 	write_unlock(&kclist_lock);
 }
 
@@ -80,7 +80,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 	*nphdr = 1; /* PT_NOTE */
 	size = 0;
 
-	for (m=kclist; m; m=m->next) {
+	list_for_each_entry(m, &kclist_head, list) {
 		try = kc_vaddr_to_offset((size_t)m->addr + m->size);
 		if (try > size)
 			size = try;
@@ -192,7 +192,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	nhdr->p_align	= 0;
 
 	/* setup ELF PT_LOAD program header for every area */
-	for (m=kclist; m; m=m->next) {
+	list_for_each_entry(m, &kclist_head, list) {
 		phdr = (struct elf_phdr *) bufp;
 		bufp += sizeof(struct elf_phdr);
 		offset += sizeof(struct elf_phdr);
@@ -317,7 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		struct kcore_list *m;
 
 		read_lock(&kclist_lock);
-		for (m=kclist; m; m=m->next) {
+		list_for_each_entry(m, &kclist_head, list) {
 			if (start >= m->addr && start < (m->addr+m->size))
 				break;
 		}
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index e6e77d31c418..0aff2a62eba9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -79,7 +79,7 @@ struct proc_dir_entry {
 };
 
 struct kcore_list {
-	struct kcore_list *next;
+	struct list_head list;
 	unsigned long addr;
 	size_t size;
 };
-- 
cgit v1.2.3-71-gd317


From c30bb2a25fcfde6157e6154a32c14686fb0bedbe Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 22 Sep 2009 16:45:43 -0700
Subject: kcore: add kclist types

Presently, kclist_add() only eats start address and size as its arguments.
Considering to make kclist dynamically reconfigulable, it's necessary to
know which kclists are for System RAM and which are not.

This patch add kclist types as
  KCORE_RAM
  KCORE_VMALLOC
  KCORE_TEXT
  KCORE_OTHER

This "type" is used in a patch following this for detecting KCORE_RAM.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c       |  7 ++++---
 arch/mips/mm/init.c       |  7 ++++---
 arch/powerpc/mm/init_32.c |  4 ++--
 arch/powerpc/mm/init_64.c |  5 +++--
 arch/sh/mm/init.c         |  4 ++--
 arch/x86/mm/init_32.c     |  4 ++--
 arch/x86/mm/init_64.c     | 11 ++++++-----
 fs/proc/kcore.c           |  3 ++-
 include/linux/proc_fs.h   | 13 +++++++++++--
 9 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 1d286244a562..f6a3c21a2826 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -639,9 +639,10 @@ mem_init (void)
 
 	high_memory = __va(max_low_pfn * PAGE_SIZE);
 
-	kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
-	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
-	kclist_add(&kcore_kernel, _stext, _end - _stext);
+	kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE, KCORE_RAM);
+	kclist_add(&kcore_vmem, (void *)VMALLOC_START,
+			VMALLOC_END-VMALLOC_START, KCORE_VMALLOC);
+	kclist_add(&kcore_kernel, _stext, _end - _stext, KCORE_TEXT);
 
 	for_each_online_pgdat(pgdat)
 		if (pgdat->bdata->node_bootmem_map)
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 1f4ee4797a6e..f8661985bff6 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -409,11 +409,12 @@ void __init mem_init(void)
 	if ((unsigned long) &_text > (unsigned long) CKSEG0)
 		/* The -4 is a hack so that user tools don't have to handle
 		   the overflow.  */
-		kclist_add(&kcore_kseg0, (void *) CKSEG0, 0x80000000 - 4);
+		kclist_add(&kcore_kseg0, (void *) CKSEG0,
+				0x80000000 - 4, KCORE_TEXT);
 #endif
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT, KCORE_RAM);
 	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
-		   VMALLOC_END-VMALLOC_START);
+		   VMALLOC_END-VMALLOC_START, KCORE_VMALLOC);
 
 	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
 	       "%ldk reserved, %ldk data, %ldk init, %ldk highmem)\n",
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3ef5084b90ca..e91add90ec54 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -268,11 +268,11 @@ static int __init setup_kcore(void)
 						size);
 		}
 
-		kclist_add(kcore_mem, __va(base), size);
+		kclist_add(kcore_mem, __va(base), size, KCORE_RAM);
 	}
 
 	kclist_add(&kcore_vmem, (void *)VMALLOC_START,
-		VMALLOC_END-VMALLOC_START);
+		VMALLOC_END-VMALLOC_START, KCORE_VMALLOC);
 
 	return 0;
 }
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 31582329cd67..9ee563101b56 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -128,10 +128,11 @@ static int __init setup_kcore(void)
 		if (!kcore_mem)
 			panic("%s: kmalloc failed\n", __func__);
 
-		kclist_add(kcore_mem, __va(base), size);
+		kclist_add(kcore_mem, __va(base), size, KCORE_RAM);
 	}
 
-	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
+	kclist_add(&kcore_vmem, (void *)VMALLOC_START,
+		VMALLOC_END-VMALLOC_START, KCORE_VMALLOC);
 
 	return 0;
 }
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index fabb7c6f48d2..ef56c9f9d7ba 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -226,9 +226,9 @@ void __init mem_init(void)
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT, KCORE_RAM);
 	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
-		   VMALLOC_END - VMALLOC_START);
+		   VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
 
 	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
 	       "%dk data, %dk init)\n",
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b49b4f67453d..2cbc40112932 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -886,9 +886,9 @@ void __init mem_init(void)
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT, KCORE_RAM);
 	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
-		   VMALLOC_END-VMALLOC_START);
+		   VMALLOC_END-VMALLOC_START, KCORE_VMALLOC);
 
 	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
 			"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 810bd31e7f5f..c05810b614fe 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -677,13 +677,14 @@ void __init mem_init(void)
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT, KCORE_RAM);
 	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
-		   VMALLOC_END-VMALLOC_START);
-	kclist_add(&kcore_kernel, &_stext, _end - _stext);
-	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
+		   VMALLOC_END-VMALLOC_START, KCORE_VMALLOC);
+	kclist_add(&kcore_kernel, &_stext, _end - _stext, KCORE_TEXT);
+	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN,
+			KCORE_OTHER);
 	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
-				 VSYSCALL_END - VSYSCALL_START);
+			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
 
 	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
 			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index f9327e51ce99..659c1635db81 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -62,10 +62,11 @@ static LIST_HEAD(kclist_head);
 static DEFINE_RWLOCK(kclist_lock);
 
 void
-kclist_add(struct kcore_list *new, void *addr, size_t size)
+kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
 {
 	new->addr = (unsigned long)addr;
 	new->size = size;
+	new->type = type;
 
 	write_lock(&kclist_lock);
 	list_add_tail(&new->list, &kclist_head);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 0aff2a62eba9..bd7b840765a0 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -78,10 +78,18 @@ struct proc_dir_entry {
 	struct list_head pde_openers;	/* who did ->open, but not ->release */
 };
 
+enum kcore_type {
+	KCORE_TEXT,
+	KCORE_VMALLOC,
+	KCORE_RAM,
+	KCORE_OTHER,
+};
+
 struct kcore_list {
 	struct list_head list;
 	unsigned long addr;
 	size_t size;
+	int type;
 };
 
 struct vmcore {
@@ -233,11 +241,12 @@ static inline void dup_mm_exe_file(struct mm_struct *oldmm,
 #endif /* CONFIG_PROC_FS */
 
 #if !defined(CONFIG_PROC_KCORE)
-static inline void kclist_add(struct kcore_list *new, void *addr, size_t size)
+static inline void
+kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
 {
 }
 #else
-extern void kclist_add(struct kcore_list *, void *, size_t);
+extern void kclist_add(struct kcore_list *, void *, size_t, int type);
 #endif
 
 union proc_op {
-- 
cgit v1.2.3-71-gd317


From 908eedc6168bd92e89f90d89fa389065a36358fa Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 22 Sep 2009 16:45:46 -0700
Subject: walk system ram range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Originally, walk_memory_resource() was introduced to traverse all memory
of "System RAM" for detecting memory hotplug/unplug range.  For doing so,
flags of IORESOUCE_MEM|IORESOURCE_BUSY was used and this was enough for
memory hotplug.

But for using other purpose, /proc/kcore, this may includes some firmware
area marked as IORESOURCE_BUSY | IORESOUCE_MEM.  This patch makes the
check strict to find out busy "System RAM".

Note: PPC64 keeps their own walk_memory_resouce(), which walk through
ppc64's lmb informaton.  Because old kclist_add() is called per lmb, this
patch makes no difference in behavior, finally.

And this patch removes CONFIG_MEMORY_HOTPLUG check from this function.
Because pfn_valid() just show "there is memmap or not* and cannot be used
for "there is physical memory or not", this function is useful in generic
to scan physical memory range.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Américo Wang <xiyou.wangcong@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/mem.c                  |  6 +++---
 drivers/infiniband/hw/ehca/ehca_mrmw.c |  2 +-
 drivers/net/ehea/ehea_qmr.c            |  2 +-
 include/linux/ioport.h                 |  4 ++++
 include/linux/memory_hotplug.h         |  8 --------
 kernel/resource.c                      | 23 ++++++++++++++++-------
 mm/memory_hotplug.c                    |  6 +++---
 7 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0e5c59b995ef..59736317bf0e 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -143,8 +143,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
  * memory regions, find holes and callback for contiguous regions.
  */
 int
-walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
-			int (*func)(unsigned long, unsigned long, void *))
+walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+		void *arg, int (*func)(unsigned long, unsigned long, void *))
 {
 	struct lmb_property res;
 	unsigned long pfn, len;
@@ -166,7 +166,7 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
 	}
 	return ret;
 }
-EXPORT_SYMBOL_GPL(walk_memory_resource);
+EXPORT_SYMBOL_GPL(walk_system_ram_range);
 
 /*
  * Initialize the bootmem system and give it all the memory we
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index 7663a2a9f130..7550a534005c 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -2463,7 +2463,7 @@ int ehca_create_busmap(void)
 	int ret;
 
 	ehca_mr_len = 0;
-	ret = walk_memory_resource(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+	ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
 				   ehca_create_busmap_callback);
 	return ret;
 }
diff --git a/drivers/net/ehea/ehea_qmr.c b/drivers/net/ehea/ehea_qmr.c
index 3747457f5e69..bc7c5b7abb88 100644
--- a/drivers/net/ehea/ehea_qmr.c
+++ b/drivers/net/ehea/ehea_qmr.c
@@ -751,7 +751,7 @@ int ehea_create_busmap(void)
 
 	mutex_lock(&ehea_busmap_mutex);
 	ehea_mr_len = 0;
-	ret = walk_memory_resource(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+	ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
 				   ehea_create_busmap_callback);
 	mutex_unlock(&ehea_busmap_mutex);
 	return ret;
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 786e7b8cece9..83aa81297ea3 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -184,5 +184,9 @@ extern void __devm_release_region(struct device *dev, struct resource *parent,
 extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size);
 extern int iomem_is_exclusive(u64 addr);
 
+extern int
+walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+		void *arg, int (*func)(unsigned long, unsigned long, void *));
+
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d95f72e79b82..fed969281a41 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -191,14 +191,6 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
 
-/*
- * Walk through all memory which is registered as resource.
- * arg is (start_pfn, nr_pages, private_arg_pointer)
- */
-extern int walk_memory_resource(unsigned long start_pfn,
-			unsigned long nr_pages, void *arg,
-			int (*func)(unsigned long, unsigned long, void *));
-
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
 extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
diff --git a/kernel/resource.c b/kernel/resource.c
index 78b087221c15..fb11a58b9594 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -223,13 +223,13 @@ int release_resource(struct resource *old)
 
 EXPORT_SYMBOL(release_resource);
 
-#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
 /*
  * Finds the lowest memory reosurce exists within [res->start.res->end)
- * the caller must specify res->start, res->end, res->flags.
+ * the caller must specify res->start, res->end, res->flags and "name".
  * If found, returns 0, res is overwritten, if not found, returns -1.
  */
-static int find_next_system_ram(struct resource *res)
+static int find_next_system_ram(struct resource *res, char *name)
 {
 	resource_size_t start, end;
 	struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
 		/* system ram is just marked as IORESOURCE_MEM */
 		if (p->flags != res->flags)
 			continue;
+		if (name && strcmp(p->name, name))
+			continue;
 		if (p->start > end) {
 			p = NULL;
 			break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
 		res->end = p->end;
 	return 0;
 }
-int
-walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
-			int (*func)(unsigned long, unsigned long, void *))
+
+/*
+ * This function calls callback against all memory range of "System RAM"
+ * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ * Now, this function is only for "System RAM".
+ */
+int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+		void *arg, int (*func)(unsigned long, unsigned long, void *))
 {
 	struct resource res;
 	unsigned long pfn, len;
 	u64 orig_end;
 	int ret = -1;
+
 	res.start = (u64) start_pfn << PAGE_SHIFT;
 	res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
 	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	orig_end = res.end;
-	while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
+	while ((res.start < res.end) &&
+		(find_next_system_ram(&res, "System RAM") >= 0)) {
 		pfn = (unsigned long)(res.start >> PAGE_SHIFT);
 		len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
 		ret = (*func)(pfn, len, arg);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index efe3e0ec2e61..821dee596377 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -413,7 +413,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	if (!populated_zone(zone))
 		need_zonelists_rebuild = 1;
 
-	ret = walk_memory_resource(pfn, nr_pages, &onlined_pages,
+	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
 		online_pages_range);
 	if (ret) {
 		printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
@@ -705,7 +705,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
 static void
 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
-	walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
+	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
 				offline_isolated_pages_cb);
 }
 
@@ -731,7 +731,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 	long offlined = 0;
 	int ret;
 
-	ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
+	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
 			check_pages_isolated_cb);
 	if (ret < 0)
 		offlined = (long)ret;
-- 
cgit v1.2.3-71-gd317


From 26562c59fa9111ae3ea7b78045889662aac9e5ac Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 22 Sep 2009 16:45:49 -0700
Subject: kcore: register vmemmap range

Benjamin Herrenschmidt <benh@kernel.crashing.org> pointed out that vmemmap
range is not included in KCORE_RAM, KCORE_VMALLOC ....

This adds KCORE_VMEMMAP if SPARSEMEM_VMEMMAP is used.  By this, vmemmap
can be readable via /proc/kcore

Because it's not vmalloc area, vread/vwrite cannot be used.  But the range
is static against the memory layout, this patch handles vmemmap area by
the same scheme with physical memory.

This patch assumes SPARSEMEM_VMEMMAP range is not in VMALLOC range.  It's
correct now.

[akpm@linux-foundation.org: fix typo]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c         | 52 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/proc_fs.h |  1 +
 2 files changed, 51 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 802de33d6341..78970e6f715c 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -103,7 +103,7 @@ static void free_kclist_ents(struct list_head *head)
 	}
 }
 /*
- * Replace all KCORE_RAM information with passed list.
+ * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
  */
 static void __kcore_update_ram(struct list_head *list)
 {
@@ -113,7 +113,8 @@ static void __kcore_update_ram(struct list_head *list)
 	write_lock(&kclist_lock);
 	if (kcore_need_update) {
 		list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
-			if (pos->type == KCORE_RAM)
+			if (pos->type == KCORE_RAM
+				|| pos->type == KCORE_VMEMMAP)
 				list_move(&pos->list, &garbage);
 		}
 		list_splice_tail(list, &kclist_head);
@@ -151,6 +152,47 @@ static int kcore_update_ram(void)
 
 #else /* !CONFIG_HIGHMEM */
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/* calculate vmemmap's address from given system ram pfn and register it */
+int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+	unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
+	unsigned long nr_pages = ent->size >> PAGE_SHIFT;
+	unsigned long start, end;
+	struct kcore_list *vmm, *tmp;
+
+
+	start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
+	end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
+	end = ALIGN(end, PAGE_SIZE);
+	/* overlap check (because we have to align page */
+	list_for_each_entry(tmp, head, list) {
+		if (tmp->type != KCORE_VMEMMAP)
+			continue;
+		if (start < tmp->addr + tmp->size)
+			if (end > tmp->addr)
+				end = tmp->addr;
+	}
+	if (start < end) {
+		vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
+		if (!vmm)
+			return 0;
+		vmm->addr = start;
+		vmm->size = end - start;
+		vmm->type = KCORE_VMEMMAP;
+		list_add_tail(&vmm->list, head);
+	}
+	return 1;
+
+}
+#else
+int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+	return 1;
+}
+
+#endif
+
 static int
 kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
 {
@@ -181,6 +223,12 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
 
 	ent->type = KCORE_RAM;
 	list_add_tail(&ent->list, head);
+
+	if (!get_sparsemem_vmemmap_info(ent, head)) {
+		list_del(&ent->list);
+		goto free_out;
+	}
+
 	return 0;
 free_out:
 	kfree(ent);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index bd7b840765a0..379eaed72d4b 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -82,6 +82,7 @@ enum kcore_type {
 	KCORE_TEXT,
 	KCORE_VMALLOC,
 	KCORE_RAM,
+	KCORE_VMEMMAP,
 	KCORE_OTHER,
 };
 
-- 
cgit v1.2.3-71-gd317


From 81ac3ad9061dd9cd490ee92f0c5316a14d77ce18 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 22 Sep 2009 16:45:49 -0700
Subject: kcore: register module area in generic way

Some archs define MODULED_VADDR/MODULES_END which is not in VMALLOC area.
This is handled only in x86-64.  This patch make it more generic.  And we
can use vread/vwrite to access the area.  Fix it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/init_64.c |  4 +---
 fs/proc/kcore.c       | 19 ++++++++++++++++++-
 include/linux/mm.h    |  8 ++++++++
 mm/vmalloc.c          |  2 +-
 4 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d5d23cc24076..5a4398a6006b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -647,7 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-static struct kcore_list kcore_modules, kcore_vsyscall;
+static struct kcore_list kcore_vsyscall;
 
 void __init mem_init(void)
 {
@@ -676,8 +676,6 @@ void __init mem_init(void)
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN,
-			KCORE_OTHER);
 	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
 			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
 
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 78970e6f715c..c6a5ec731972 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -490,7 +490,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		if (m == NULL) {
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
-		} else if (is_vmalloc_addr((void *)start)) {
+		} else if (is_vmalloc_or_module_addr((void *)start)) {
 			char * elf_buf;
 
 			elf_buf = kzalloc(tsz, GFP_KERNEL);
@@ -586,6 +586,22 @@ static void __init proc_kcore_text_init(void)
 }
 #endif
 
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+/*
+ * MODULES_VADDR has no intersection with VMALLOC_ADDR.
+ */
+struct kcore_list kcore_modules;
+static void __init add_modules_range(void)
+{
+	kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+			MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+}
+#else
+static void __init add_modules_range(void)
+{
+}
+#endif
+
 static int __init proc_kcore_init(void)
 {
 	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
@@ -595,6 +611,7 @@ static int __init proc_kcore_init(void)
 	/* Store vmalloc area */
 	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
 		VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
+	add_modules_range();
 	/* Store direct-map area from physical memory map */
 	kcore_update_ram();
 	hotplug_memory_notifier(kcore_callback, 0);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5946e2ff9fe8..b6eae5e3144b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,6 +285,14 @@ static inline int is_vmalloc_addr(const void *x)
 	return 0;
 #endif
 }
+#ifdef CONFIG_MMU
+extern int is_vmalloc_or_module_addr(const void *x);
+#else
+static int is_vmalloc_or_module_addr(const void *x)
+{
+	return 0;
+}
+#endif
 
 static inline struct page *compound_head(struct page *page)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5535da1d6961..69511e663234 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -184,7 +184,7 @@ static int vmap_page_range(unsigned long start, unsigned long end,
 	return ret;
 }
 
-static inline int is_vmalloc_or_module_addr(const void *x)
+int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
 	 * ARM, x86-64 and sparc64 put modules in a special place,
-- 
cgit v1.2.3-71-gd317


From a7e3108cca54c105f496919040f00df56767ec00 Mon Sep 17 00:00:00 2001
From: maximilian attems <max@stro.at>
Date: Tue, 22 Sep 2009 16:45:53 -0700
Subject: ramfs: move RAMFS_MAGIC to include/linux/magic.h

initramfs userspace likes to use this magic number.

Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: maximilian attems <max@stro.at>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ramfs/inode.c      | 4 +---
 include/linux/magic.h | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a7f0110fca4c..a6090aa1a7c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,12 +34,10 @@
 #include <linux/ramfs.h>
 #include <linux/sched.h>
 #include <linux/parser.h>
+#include <linux/magic.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
-/* some random number */
-#define RAMFS_MAGIC	0x858458f6
-
 #define RAMFS_DEFAULT_MODE	0755
 
 static const struct super_operations ramfs_ops;
diff --git a/include/linux/magic.h b/include/linux/magic.h
index bce37786a0a5..76285e01b39e 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -12,6 +12,7 @@
 #define SYSFS_MAGIC		0x62656572
 #define SECURITYFS_MAGIC	0x73636673
 #define SELINUX_MAGIC		0xf97cff8c
+#define RAMFS_MAGIC		0x858458f6	/* some random number */
 #define TMPFS_MAGIC		0x01021994
 #define HUGETLBFS_MAGIC 	0x958458f6	/* some random number */
 #define SQUASHFS_MAGIC		0x73717368
-- 
cgit v1.2.3-71-gd317


From b73b255956119111dc18fa063d1e3a0bb3f06328 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 22 Sep 2009 16:46:00 -0700
Subject: spi.h: add missing kernel-doc for struct spi_master

Add missing kernel-doc notation in spi.h for struct spi_master:

Warning(include/linux/spi/spi.h:289): No description found for parameter 'mode_bits'
Warning(include/linux/spi/spi.h:289): No description found for parameter 'flags'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/spi/spi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index c47c4b4da97e..eb25cedb995b 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -207,6 +207,8 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
  *	each slave has a chipselect signal, but it's common that not
  *	every chipselect is connected to a slave.
  * @dma_alignment: SPI controller constraint on DMA buffers alignment.
+ * @mode_bits: flags understood by this controller driver
+ * @flags: other constraints relevant to this driver
  * @setup: updates the device mode and clocking records used by a
  *	device's SPI controller; protocol code may call this.  This
  *	must fail if an unrecognized or unsupported mode is requested.
-- 
cgit v1.2.3-71-gd317


From 75368bf6c2876d8f33abfe77aa3864869a3893eb Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 22 Sep 2009 16:46:04 -0700
Subject: spi: add support for device table matching

With this patch spi drivers can use standard spi_driver.id_table and
MODULE_DEVICE_TABLE() mechanisms to bind against the devices.  Just like
we do with I2C drivers.

This is useful when a single driver supports several variants of devices
but it is not possible to detect them in run-time (like non-JEDEC chips
probing in drivers/mtd/devices/m25p80.c), and when platform_data usage is
overkill.

This patch also makes life a lot easier on OpenFirmware platforms, since
with OF we extensively use proper device IDs in modaliases.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/spi.c               | 23 +++++++++++++++++++++++
 include/linux/mod_devicetable.h | 10 ++++++++++
 include/linux/spi/spi.h         | 10 ++++++++--
 scripts/mod/file2alias.c        | 13 +++++++++++++
 4 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 70845ccd85c3..8518a6eb63f3 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -59,9 +59,32 @@ static struct device_attribute spi_dev_attrs[] = {
  * and the sysfs version makes coldplug work too.
  */
 
+static const struct spi_device_id *spi_match_id(const struct spi_device_id *id,
+						const struct spi_device *sdev)
+{
+	while (id->name[0]) {
+		if (!strcmp(sdev->modalias, id->name))
+			return id;
+		id++;
+	}
+	return NULL;
+}
+
+const struct spi_device_id *spi_get_device_id(const struct spi_device *sdev)
+{
+	const struct spi_driver *sdrv = to_spi_driver(sdev->dev.driver);
+
+	return spi_match_id(sdrv->id_table, sdev);
+}
+EXPORT_SYMBOL_GPL(spi_get_device_id);
+
 static int spi_match_device(struct device *dev, struct device_driver *drv)
 {
 	const struct spi_device	*spi = to_spi_device(dev);
+	const struct spi_driver	*sdrv = to_spi_driver(drv);
+
+	if (sdrv->id_table)
+		return !!spi_match_id(sdrv->id_table, spi);
 
 	return strcmp(spi->modalias, drv->name) == 0;
 }
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 1bf5900ffe43..b34f1ef2f1fe 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -399,6 +399,16 @@ struct i2c_device_id {
 			__attribute__((aligned(sizeof(kernel_ulong_t))));
 };
 
+/* spi */
+
+#define SPI_NAME_SIZE	32
+
+struct spi_device_id {
+	char name[SPI_NAME_SIZE];
+	kernel_ulong_t driver_data	/* Data private to the driver */
+			__attribute__((aligned(sizeof(kernel_ulong_t))));
+};
+
 /* dmi */
 enum dmi_field {
 	DMI_NONE,
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index eb25cedb995b..e2051f39f6a8 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -20,6 +20,7 @@
 #define __LINUX_SPI_H
 
 #include <linux/device.h>
+#include <linux/mod_devicetable.h>
 
 /*
  * INTERFACES between SPI master-side drivers and SPI infrastructure.
@@ -86,7 +87,7 @@ struct spi_device {
 	int			irq;
 	void			*controller_state;
 	void			*controller_data;
-	char			modalias[32];
+	char			modalias[SPI_NAME_SIZE];
 
 	/*
 	 * likely need more hooks for more protocol options affecting how
@@ -145,6 +146,7 @@ struct spi_message;
 
 /**
  * struct spi_driver - Host side "protocol" driver
+ * @id_table: List of SPI devices supported by this driver
  * @probe: Binds this driver to the spi device.  Drivers can verify
  *	that the device is actually present, and may need to configure
  *	characteristics (such as bits_per_word) which weren't needed for
@@ -170,6 +172,7 @@ struct spi_message;
  * MMC, RTC, filesystem character device nodes, and hardware monitoring.
  */
 struct spi_driver {
+	const struct spi_device_id *id_table;
 	int			(*probe)(struct spi_device *spi);
 	int			(*remove)(struct spi_device *spi);
 	void			(*shutdown)(struct spi_device *spi);
@@ -734,7 +737,7 @@ struct spi_board_info {
 	 * controller_data goes to spi_device.controller_data,
 	 * irq is copied too
 	 */
-	char		modalias[32];
+	char		modalias[SPI_NAME_SIZE];
 	const void	*platform_data;
 	void		*controller_data;
 	int		irq;
@@ -802,4 +805,7 @@ spi_unregister_device(struct spi_device *spi)
 		device_unregister(&spi->dev);
 }
 
+extern const struct spi_device_id *
+spi_get_device_id(const struct spi_device *sdev);
+
 #endif /* __LINUX_SPI_H */
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 40e0045876ee..9d446e34519c 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -657,6 +657,15 @@ static int do_i2c_entry(const char *filename, struct i2c_device_id *id,
 	return 1;
 }
 
+/* Looks like: S */
+static int do_spi_entry(const char *filename, struct spi_device_id *id,
+			char *alias)
+{
+	sprintf(alias, "%s", id->name);
+
+	return 1;
+}
+
 static const struct dmifield {
 	const char *prefix;
 	int field;
@@ -853,6 +862,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
 		do_table(symval, sym->st_size,
 			 sizeof(struct i2c_device_id), "i2c",
 			 do_i2c_entry, mod);
+	else if (sym_is(symname, "__mod_spi_device_table"))
+		do_table(symval, sym->st_size,
+			 sizeof(struct spi_device_id), "spi",
+			 do_spi_entry, mod);
 	else if (sym_is(symname, "__mod_dmi_device_table"))
 		do_table(symval, sym->st_size,
 			 sizeof(struct dmi_system_id), "dmi",
-- 
cgit v1.2.3-71-gd317


From e0626e3844e8f430fc1a4417f523a00797df7ca6 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 22 Sep 2009 16:46:08 -0700
Subject: spi: prefix modalias with "spi:"

This makes it consistent with other buses (platform, i2c, vio, ...).  I'm
not sure why we use the prefixes, but there must be a reason.

This was easy enough to do it, and I did it.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Samuel Ortiz <sameo@openedhand.com>
Cc: "John W. Linville" <linville@tuxdriver.com>
Acked-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/max7301.c                    | 1 +
 drivers/gpio/mcp23s08.c                   | 1 +
 drivers/hwmon/lis3lv02d_spi.c             | 2 +-
 drivers/hwmon/max1111.c                   | 1 +
 drivers/input/touchscreen/ad7877.c        | 1 +
 drivers/input/touchscreen/ad7879.c        | 1 +
 drivers/input/touchscreen/ads7846.c       | 1 +
 drivers/leds/leds-dac124s085.c            | 1 +
 drivers/mfd/ezx-pcap.c                    | 1 +
 drivers/misc/eeprom/at25.c                | 2 +-
 drivers/mmc/host/mmc_spi.c                | 1 +
 drivers/mtd/devices/mtd_dataflash.c       | 1 +
 drivers/net/enc28j60.c                    | 1 +
 drivers/net/ks8851.c                      | 1 +
 drivers/net/wireless/libertas/if_spi.c    | 1 +
 drivers/net/wireless/p54/p54spi.c         | 1 +
 drivers/net/wireless/wl12xx/wl1251_main.c | 1 +
 drivers/rtc/rtc-ds1305.c                  | 1 +
 drivers/rtc/rtc-ds1390.c                  | 1 +
 drivers/rtc/rtc-ds3234.c                  | 1 +
 drivers/rtc/rtc-m41t94.c                  | 1 +
 drivers/rtc/rtc-max6902.c                 | 1 +
 drivers/rtc/rtc-r9701.c                   | 1 +
 drivers/rtc/rtc-rs5c348.c                 | 1 +
 drivers/serial/max3100.c                  | 1 +
 drivers/spi/spi.c                         | 3 ++-
 drivers/spi/spidev.c                      | 1 +
 drivers/spi/tle62x0.c                     | 1 +
 drivers/staging/stlc45xx/stlc45xx.c       | 1 +
 drivers/video/backlight/corgi_lcd.c       | 1 +
 drivers/video/backlight/ltv350qv.c        | 1 +
 drivers/video/backlight/tdo24m.c          | 1 +
 drivers/video/backlight/tosa_lcd.c        | 2 +-
 drivers/video/backlight/vgg2432a4.c       | 3 +--
 include/linux/mod_devicetable.h           | 1 +
 scripts/mod/file2alias.c                  | 4 ++--
 36 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/max7301.c b/drivers/gpio/max7301.c
index 7b82eaae2621..480956f1ca50 100644
--- a/drivers/gpio/max7301.c
+++ b/drivers/gpio/max7301.c
@@ -339,3 +339,4 @@ module_exit(max7301_exit);
 MODULE_AUTHOR("Juergen Beisert");
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("MAX7301 SPI based GPIO-Expander");
+MODULE_ALIAS("spi:" DRIVER_NAME);
diff --git a/drivers/gpio/mcp23s08.c b/drivers/gpio/mcp23s08.c
index f6fae0e50e65..c6c7aa15f5da 100644
--- a/drivers/gpio/mcp23s08.c
+++ b/drivers/gpio/mcp23s08.c
@@ -433,3 +433,4 @@ static void __exit mcp23s08_exit(void)
 module_exit(mcp23s08_exit);
 
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:mcp23s08");
diff --git a/drivers/hwmon/lis3lv02d_spi.c b/drivers/hwmon/lis3lv02d_spi.c
index 82ebca5a699c..ecd739534f6a 100644
--- a/drivers/hwmon/lis3lv02d_spi.c
+++ b/drivers/hwmon/lis3lv02d_spi.c
@@ -139,4 +139,4 @@ module_exit(lis302dl_exit);
 MODULE_AUTHOR("Daniel Mack <daniel@caiaq.de>");
 MODULE_DESCRIPTION("lis3lv02d SPI glue layer");
 MODULE_LICENSE("GPL");
-
+MODULE_ALIAS("spi:" DRV_NAME);
diff --git a/drivers/hwmon/max1111.c b/drivers/hwmon/max1111.c
index bfaa665ccf32..9ac497271adf 100644
--- a/drivers/hwmon/max1111.c
+++ b/drivers/hwmon/max1111.c
@@ -242,3 +242,4 @@ module_exit(max1111_exit);
 MODULE_AUTHOR("Eric Miao <eric.miao@marvell.com>");
 MODULE_DESCRIPTION("MAX1111 ADC Driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:max1111");
diff --git a/drivers/input/touchscreen/ad7877.c b/drivers/input/touchscreen/ad7877.c
index ecaeb7e8e75e..eb83939c705e 100644
--- a/drivers/input/touchscreen/ad7877.c
+++ b/drivers/input/touchscreen/ad7877.c
@@ -842,3 +842,4 @@ module_exit(ad7877_exit);
 MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
 MODULE_DESCRIPTION("AD7877 touchscreen Driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:ad7877");
diff --git a/drivers/input/touchscreen/ad7879.c b/drivers/input/touchscreen/ad7879.c
index 5d8a70398807..19b4db7e974d 100644
--- a/drivers/input/touchscreen/ad7879.c
+++ b/drivers/input/touchscreen/ad7879.c
@@ -779,3 +779,4 @@ module_exit(ad7879_exit);
 MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
 MODULE_DESCRIPTION("AD7879(-1) touchscreen Driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:ad7879");
diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index ba9d38c3f412..09c810999b92 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -1256,3 +1256,4 @@ module_exit(ads7846_exit);
 
 MODULE_DESCRIPTION("ADS7846 TouchScreen Driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:ads7846");
diff --git a/drivers/leds/leds-dac124s085.c b/drivers/leds/leds-dac124s085.c
index 098d9aae7259..2913d76ad3d2 100644
--- a/drivers/leds/leds-dac124s085.c
+++ b/drivers/leds/leds-dac124s085.c
@@ -148,3 +148,4 @@ module_exit(dac124s085_leds_exit);
 MODULE_AUTHOR("Guennadi Liakhovetski <lg@denx.de>");
 MODULE_DESCRIPTION("DAC124S085 LED driver");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("spi:dac124s085");
diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c
index 016be4938e4c..876288917976 100644
--- a/drivers/mfd/ezx-pcap.c
+++ b/drivers/mfd/ezx-pcap.c
@@ -548,3 +548,4 @@ module_exit(ezx_pcap_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Daniel Ribeiro / Harald Welte");
 MODULE_DESCRIPTION("Motorola PCAP2 ASIC Driver");
+MODULE_ALIAS("spi:ezx-pcap");
diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c
index 2e535a0ccd5e..d902d81dde39 100644
--- a/drivers/misc/eeprom/at25.c
+++ b/drivers/misc/eeprom/at25.c
@@ -417,4 +417,4 @@ module_exit(at25_exit);
 MODULE_DESCRIPTION("Driver for most SPI EEPROMs");
 MODULE_AUTHOR("David Brownell");
 MODULE_LICENSE("GPL");
-
+MODULE_ALIAS("spi:at25");
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index a461017ce5ce..d55fe4fb7935 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1562,3 +1562,4 @@ MODULE_AUTHOR("Mike Lavender, David Brownell, "
 		"Hans-Peter Nilsson, Jan Nikitenko");
 MODULE_DESCRIPTION("SPI SD/MMC host driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:mmc_spi");
diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c
index 43976aa4dbb1..211c27acd01e 100644
--- a/drivers/mtd/devices/mtd_dataflash.c
+++ b/drivers/mtd/devices/mtd_dataflash.c
@@ -966,3 +966,4 @@ module_exit(dataflash_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andrew Victor, David Brownell");
 MODULE_DESCRIPTION("MTD DataFlash driver");
+MODULE_ALIAS("spi:mtd_dataflash");
diff --git a/drivers/net/enc28j60.c b/drivers/net/enc28j60.c
index 117fc6c12e34..66813c91a720 100644
--- a/drivers/net/enc28j60.c
+++ b/drivers/net/enc28j60.c
@@ -1666,3 +1666,4 @@ MODULE_AUTHOR("Claudio Lanconelli <lanconelli.claudio@eptar.com>");
 MODULE_LICENSE("GPL");
 module_param_named(debug, debug.msg_enable, int, 0);
 MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., ffff=all)");
+MODULE_ALIAS("spi:" DRV_NAME);
diff --git a/drivers/net/ks8851.c b/drivers/net/ks8851.c
index 547ac7c7479c..237835864357 100644
--- a/drivers/net/ks8851.c
+++ b/drivers/net/ks8851.c
@@ -1321,3 +1321,4 @@ MODULE_LICENSE("GPL");
 
 module_param_named(message, msg_enable, int, 0);
 MODULE_PARM_DESC(message, "Message verbosity level (0=none, 31=all)");
+MODULE_ALIAS("spi:ks8851");
diff --git a/drivers/net/wireless/libertas/if_spi.c b/drivers/net/wireless/libertas/if_spi.c
index 446e327180f8..cb8be8d7abc1 100644
--- a/drivers/net/wireless/libertas/if_spi.c
+++ b/drivers/net/wireless/libertas/if_spi.c
@@ -1222,3 +1222,4 @@ MODULE_DESCRIPTION("Libertas SPI WLAN Driver");
 MODULE_AUTHOR("Andrey Yurovsky <andrey@cozybit.com>, "
 	      "Colin McCabe <colin@cozybit.com>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:libertas_spi");
diff --git a/drivers/net/wireless/p54/p54spi.c b/drivers/net/wireless/p54/p54spi.c
index 05458d9249ce..afd26bf06649 100644
--- a/drivers/net/wireless/p54/p54spi.c
+++ b/drivers/net/wireless/p54/p54spi.c
@@ -731,3 +731,4 @@ module_exit(p54spi_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Christian Lamparter <chunkeey@web.de>");
+MODULE_ALIAS("spi:cx3110x");
diff --git a/drivers/net/wireless/wl12xx/wl1251_main.c b/drivers/net/wireless/wl12xx/wl1251_main.c
index 5809ef5b18f8..1103256ad989 100644
--- a/drivers/net/wireless/wl12xx/wl1251_main.c
+++ b/drivers/net/wireless/wl12xx/wl1251_main.c
@@ -1426,3 +1426,4 @@ EXPORT_SYMBOL_GPL(wl1251_free_hw);
 MODULE_DESCRIPTION("TI wl1251 Wireles LAN Driver Core");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kalle Valo <kalle.valo@nokia.com>");
+MODULE_ALIAS("spi:wl12xx");
diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c
index 8f410e59d9f5..2736b11a1b1e 100644
--- a/drivers/rtc/rtc-ds1305.c
+++ b/drivers/rtc/rtc-ds1305.c
@@ -841,3 +841,4 @@ module_exit(ds1305_exit);
 
 MODULE_DESCRIPTION("RTC driver for DS1305 and DS1306 chips");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:rtc-ds1305");
diff --git a/drivers/rtc/rtc-ds1390.c b/drivers/rtc/rtc-ds1390.c
index e01b955db077..cdb705057091 100644
--- a/drivers/rtc/rtc-ds1390.c
+++ b/drivers/rtc/rtc-ds1390.c
@@ -189,3 +189,4 @@ module_exit(ds1390_exit);
 MODULE_DESCRIPTION("Dallas/Maxim DS1390/93/94 SPI RTC driver");
 MODULE_AUTHOR("Mark Jackson <mpfj@mimc.co.uk>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:rtc-ds1390");
diff --git a/drivers/rtc/rtc-ds3234.c b/drivers/rtc/rtc-ds3234.c
index c51589ede5b7..a774ca35b5f7 100644
--- a/drivers/rtc/rtc-ds3234.c
+++ b/drivers/rtc/rtc-ds3234.c
@@ -188,3 +188,4 @@ module_exit(ds3234_exit);
 MODULE_DESCRIPTION("DS3234 SPI RTC driver");
 MODULE_AUTHOR("Dennis Aberilla <denzzzhome@yahoo.com>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:ds3234");
diff --git a/drivers/rtc/rtc-m41t94.c b/drivers/rtc/rtc-m41t94.c
index c3a18c58daf6..c8c97a4169d4 100644
--- a/drivers/rtc/rtc-m41t94.c
+++ b/drivers/rtc/rtc-m41t94.c
@@ -171,3 +171,4 @@ module_exit(m41t94_exit);
 MODULE_AUTHOR("Kim B. Heino <Kim.Heino@bluegiga.com>");
 MODULE_DESCRIPTION("Driver for ST M41T94 SPI RTC");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:rtc-m41t94");
diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c
index 36a8ea9ed8ba..657403ebd54a 100644
--- a/drivers/rtc/rtc-max6902.c
+++ b/drivers/rtc/rtc-max6902.c
@@ -175,3 +175,4 @@ module_exit(max6902_exit);
 MODULE_DESCRIPTION ("max6902 spi RTC driver");
 MODULE_AUTHOR ("Raphael Assenat");
 MODULE_LICENSE ("GPL");
+MODULE_ALIAS("spi:rtc-max6902");
diff --git a/drivers/rtc/rtc-r9701.c b/drivers/rtc/rtc-r9701.c
index 42028f233bef..9beba49c3c5b 100644
--- a/drivers/rtc/rtc-r9701.c
+++ b/drivers/rtc/rtc-r9701.c
@@ -174,3 +174,4 @@ module_exit(r9701_exit);
 MODULE_DESCRIPTION("r9701 spi RTC driver");
 MODULE_AUTHOR("Magnus Damm <damm@opensource.se>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:rtc-r9701");
diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c
index dd1e2bc7a472..2099037cb3ea 100644
--- a/drivers/rtc/rtc-rs5c348.c
+++ b/drivers/rtc/rtc-rs5c348.c
@@ -251,3 +251,4 @@ MODULE_AUTHOR("Atsushi Nemoto <anemo@mba.ocn.ne.jp>");
 MODULE_DESCRIPTION("Ricoh RS5C348 RTC driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_VERSION);
+MODULE_ALIAS("spi:rtc-rs5c348");
diff --git a/drivers/serial/max3100.c b/drivers/serial/max3100.c
index 75ab00631c41..3c30c56aa2e1 100644
--- a/drivers/serial/max3100.c
+++ b/drivers/serial/max3100.c
@@ -925,3 +925,4 @@ module_exit(max3100_exit);
 MODULE_DESCRIPTION("MAX3100 driver");
 MODULE_AUTHOR("Christian Pellegrin <chripell@evolware.org>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:max3100");
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 8518a6eb63f3..49e84860c8da 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/cache.h>
 #include <linux/mutex.h>
+#include <linux/mod_devicetable.h>
 #include <linux/spi/spi.h>
 
 
@@ -93,7 +94,7 @@ static int spi_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
 	const struct spi_device		*spi = to_spi_device(dev);
 
-	add_uevent_var(env, "MODALIAS=%s", spi->modalias);
+	add_uevent_var(env, "MODALIAS=%s%s", SPI_MODULE_PREFIX, spi->modalias);
 	return 0;
 }
 
diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index 606e7a40a8da..f921bd1109e1 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -688,3 +688,4 @@ module_exit(spidev_exit);
 MODULE_AUTHOR("Andrea Paterniani, <a.paterniani@swapp-eng.it>");
 MODULE_DESCRIPTION("User mode SPI device interface");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:spidev");
diff --git a/drivers/spi/tle62x0.c b/drivers/spi/tle62x0.c
index 455991fbe28f..bf9540f5fb98 100644
--- a/drivers/spi/tle62x0.c
+++ b/drivers/spi/tle62x0.c
@@ -329,3 +329,4 @@ module_exit(tle62x0_exit);
 MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
 MODULE_DESCRIPTION("TLE62x0 SPI driver");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("spi:tle62x0");
diff --git a/drivers/staging/stlc45xx/stlc45xx.c b/drivers/staging/stlc45xx/stlc45xx.c
index 12d414deaad6..be99eb33d817 100644
--- a/drivers/staging/stlc45xx/stlc45xx.c
+++ b/drivers/staging/stlc45xx/stlc45xx.c
@@ -2591,3 +2591,4 @@ module_exit(stlc45xx_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kalle Valo <kalle.valo@nokia.com>");
+MODULE_ALIAS("spi:cx3110x");
diff --git a/drivers/video/backlight/corgi_lcd.c b/drivers/video/backlight/corgi_lcd.c
index f8a4bb20f41a..2211a852af9c 100644
--- a/drivers/video/backlight/corgi_lcd.c
+++ b/drivers/video/backlight/corgi_lcd.c
@@ -639,3 +639,4 @@ module_exit(corgi_lcd_exit);
 MODULE_DESCRIPTION("LCD and backlight driver for SHARP C7x0/Cxx00");
 MODULE_AUTHOR("Eric Miao <eric.miao@marvell.com>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:corgi-lcd");
diff --git a/drivers/video/backlight/ltv350qv.c b/drivers/video/backlight/ltv350qv.c
index 2eb206bf73e6..4631ca8fa4a4 100644
--- a/drivers/video/backlight/ltv350qv.c
+++ b/drivers/video/backlight/ltv350qv.c
@@ -328,3 +328,4 @@ module_exit(ltv350qv_exit);
 MODULE_AUTHOR("Haavard Skinnemoen <hskinnemoen@atmel.com>");
 MODULE_DESCRIPTION("Samsung LTV350QV LCD Driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:ltv350qv");
diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c
index 51422fc4f606..bbfb502add67 100644
--- a/drivers/video/backlight/tdo24m.c
+++ b/drivers/video/backlight/tdo24m.c
@@ -472,3 +472,4 @@ module_exit(tdo24m_exit);
 MODULE_AUTHOR("Eric Miao <eric.miao@marvell.com>");
 MODULE_DESCRIPTION("Driver for Toppoly TDO24M LCD Panel");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("spi:tdo24m");
diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c
index b7fbc75a62fc..50ec17dfc517 100644
--- a/drivers/video/backlight/tosa_lcd.c
+++ b/drivers/video/backlight/tosa_lcd.c
@@ -300,4 +300,4 @@ module_exit(tosa_lcd_exit);
 MODULE_AUTHOR("Dmitry Baryshkov");
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("LCD/Backlight control for Sharp SL-6000 PDA");
-
+MODULE_ALIAS("spi:tosa-lcd");
diff --git a/drivers/video/backlight/vgg2432a4.c b/drivers/video/backlight/vgg2432a4.c
index 8e653b8a6f17..b49063c831e7 100644
--- a/drivers/video/backlight/vgg2432a4.c
+++ b/drivers/video/backlight/vgg2432a4.c
@@ -280,5 +280,4 @@ module_exit(vgg2432a4_exit);
 MODULE_AUTHOR("Ben Dooks <ben-linux@fluff.org>");
 MODULE_DESCRIPTION("VGG2432A4 LCD Driver");
 MODULE_LICENSE("GPL v2");
-
-
+MODULE_ALIAS("spi:VGG2432A4");
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b34f1ef2f1fe..f58e9d836f32 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -402,6 +402,7 @@ struct i2c_device_id {
 /* spi */
 
 #define SPI_NAME_SIZE	32
+#define SPI_MODULE_PREFIX "spi:"
 
 struct spi_device_id {
 	char name[SPI_NAME_SIZE];
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 9d446e34519c..62a9025cdcc7 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -657,11 +657,11 @@ static int do_i2c_entry(const char *filename, struct i2c_device_id *id,
 	return 1;
 }
 
-/* Looks like: S */
+/* Looks like: spi:S */
 static int do_spi_entry(const char *filename, struct spi_device_id *id,
 			char *alias)
 {
-	sprintf(alias, "%s", id->name);
+	sprintf(alias, SPI_MODULE_PREFIX "%s", id->name);
 
 	return 1;
 }
-- 
cgit v1.2.3-71-gd317


From 568d0697f42771425ae9f1e9a3db769fef7e10b6 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Tue, 22 Sep 2009 16:46:18 -0700
Subject: spi: handle TX-only/RX-only

Support two new half-duplex SPI implementation restrictions, for links
that talk to TX-only or RX-only devices.  (Existing half-duplex flavors
support both transfer directions, just not at the same time.)

Move spi_async() into the spi.c core, and stop inlining it.  Then make
that function perform error checks and reject messages that demand more
than the underlying controller can support.

Based on a patch from Marek Szyprowski which did this only for the
bitbanged GPIO driver.

Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/spi.c       | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/spi.h | 39 +++-----------------------------
 2 files changed, 62 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 49e84860c8da..b76f2468a84a 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -663,6 +663,65 @@ int spi_setup(struct spi_device *spi)
 }
 EXPORT_SYMBOL_GPL(spi_setup);
 
+/**
+ * spi_async - asynchronous SPI transfer
+ * @spi: device with which data will be exchanged
+ * @message: describes the data transfers, including completion callback
+ * Context: any (irqs may be blocked, etc)
+ *
+ * This call may be used in_irq and other contexts which can't sleep,
+ * as well as from task contexts which can sleep.
+ *
+ * The completion callback is invoked in a context which can't sleep.
+ * Before that invocation, the value of message->status is undefined.
+ * When the callback is issued, message->status holds either zero (to
+ * indicate complete success) or a negative error code.  After that
+ * callback returns, the driver which issued the transfer request may
+ * deallocate the associated memory; it's no longer in use by any SPI
+ * core or controller driver code.
+ *
+ * Note that although all messages to a spi_device are handled in
+ * FIFO order, messages may go to different devices in other orders.
+ * Some device might be higher priority, or have various "hard" access
+ * time requirements, for example.
+ *
+ * On detection of any fault during the transfer, processing of
+ * the entire message is aborted, and the device is deselected.
+ * Until returning from the associated message completion callback,
+ * no other spi_message queued to that device will be processed.
+ * (This rule applies equally to all the synchronous transfer calls,
+ * which are wrappers around this core asynchronous primitive.)
+ */
+int spi_async(struct spi_device *spi, struct spi_message *message)
+{
+	struct spi_master *master = spi->master;
+
+	/* Half-duplex links include original MicroWire, and ones with
+	 * only one data pin like SPI_3WIRE (switches direction) or where
+	 * either MOSI or MISO is missing.  They can also be caused by
+	 * software limitations.
+	 */
+	if ((master->flags & SPI_MASTER_HALF_DUPLEX)
+			|| (spi->mode & SPI_3WIRE)) {
+		struct spi_transfer *xfer;
+		unsigned flags = master->flags;
+
+		list_for_each_entry(xfer, &message->transfers, transfer_list) {
+			if (xfer->rx_buf && xfer->tx_buf)
+				return -EINVAL;
+			if ((flags & SPI_MASTER_NO_TX) && xfer->tx_buf)
+				return -EINVAL;
+			if ((flags & SPI_MASTER_NO_RX) && xfer->rx_buf)
+				return -EINVAL;
+		}
+	}
+
+	message->spi = spi;
+	message->status = -EINPROGRESS;
+	return master->transfer(spi, message);
+}
+EXPORT_SYMBOL_GPL(spi_async);
+
 
 /*-------------------------------------------------------------------------*/
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index e2051f39f6a8..97b60b37f445 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -258,6 +258,8 @@ struct spi_master {
 	/* other constraints relevant to this driver */
 	u16			flags;
 #define SPI_MASTER_HALF_DUPLEX	BIT(0)		/* can't do full duplex */
+#define SPI_MASTER_NO_RX	BIT(1)		/* can't do buffer read */
+#define SPI_MASTER_NO_TX	BIT(2)		/* can't do buffer write */
 
 	/* Setup mode and clock, etc (spi driver may call many times).
 	 *
@@ -538,42 +540,7 @@ static inline void spi_message_free(struct spi_message *m)
 }
 
 extern int spi_setup(struct spi_device *spi);
-
-/**
- * spi_async - asynchronous SPI transfer
- * @spi: device with which data will be exchanged
- * @message: describes the data transfers, including completion callback
- * Context: any (irqs may be blocked, etc)
- *
- * This call may be used in_irq and other contexts which can't sleep,
- * as well as from task contexts which can sleep.
- *
- * The completion callback is invoked in a context which can't sleep.
- * Before that invocation, the value of message->status is undefined.
- * When the callback is issued, message->status holds either zero (to
- * indicate complete success) or a negative error code.  After that
- * callback returns, the driver which issued the transfer request may
- * deallocate the associated memory; it's no longer in use by any SPI
- * core or controller driver code.
- *
- * Note that although all messages to a spi_device are handled in
- * FIFO order, messages may go to different devices in other orders.
- * Some device might be higher priority, or have various "hard" access
- * time requirements, for example.
- *
- * On detection of any fault during the transfer, processing of
- * the entire message is aborted, and the device is deselected.
- * Until returning from the associated message completion callback,
- * no other spi_message queued to that device will be processed.
- * (This rule applies equally to all the synchronous transfer calls,
- * which are wrappers around this core asynchronous primitive.)
- */
-static inline int
-spi_async(struct spi_device *spi, struct spi_message *message)
-{
-	message->spi = spi;
-	return spi->master->transfer(spi, message);
-}
+extern int spi_async(struct spi_device *spi, struct spi_message *message);
 
 /*---------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3-71-gd317


From a4177ee7f1a83eecb1d75e85d32664b023ef65e9 Mon Sep 17 00:00:00 2001
From: Jani Nikula <ext-jani.1.nikula@nokia.com>
Date: Tue, 22 Sep 2009 16:46:33 -0700
Subject: gpiolib: allow exported GPIO nodes to be named using sysfs links

Commit 926b663ce8215ba448960e1ff6e58b67a2c3b99b (gpiolib: allow GPIOs to
be named) already provides naming on the chip level. This patch provides
more flexibility by allowing multiple names where ever in sysfs on a per
GPIO basis.

Adapted from David Brownell's comments on a similar concept:
http://lkml.org/lkml/2009/4/20/203.

[randy.dunlap@oracle.com: fix build for CONFIG_GENERIC_GPIO=n]
Signed-off-by: Jani Nikula <ext-jani.1.nikula@nokia.com>
Acked-by: David Brownell <david-b@pacbell.net>
Cc: Daniel Silverstone <dsilvers@simtec.co.uk>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/gpio.txt     | 10 ++++++++++
 drivers/gpio/gpiolib.c     | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/asm-generic/gpio.h |  8 ++++++++
 include/linux/gpio.h       | 11 +++++++++++
 4 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index e4b6985044a2..566edaa56a53 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -555,6 +555,11 @@ requested using gpio_request():
 	/* reverse gpio_export() */
 	void gpio_unexport();
 
+	/* create a sysfs link to an exported GPIO node */
+	int gpio_export_link(struct device *dev, const char *name,
+		unsigned gpio)
+
+
 After a kernel driver requests a GPIO, it may only be made available in
 the sysfs interface by gpio_export().  The driver can control whether the
 signal direction may change.  This helps drivers prevent userspace code
@@ -563,3 +568,8 @@ from accidentally clobbering important system state.
 This explicit exporting can help with debugging (by making some kinds
 of experiments easier), or can provide an always-there interface that's
 suitable for documenting as part of a board support package.
+
+After the GPIO has been exported, gpio_export_link() allows creating
+symlinks from elsewhere in sysfs to the GPIO sysfs node.  Drivers can
+use this to provide the interface under their own device in sysfs with
+a descriptive name.
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 51a8d4103be5..aef6b3d8e2cf 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -504,6 +504,51 @@ static int match_export(struct device *dev, void *data)
 	return dev_get_drvdata(dev) == data;
 }
 
+/**
+ * gpio_export_link - create a sysfs link to an exported GPIO node
+ * @dev: device under which to create symlink
+ * @name: name of the symlink
+ * @gpio: gpio to create symlink to, already exported
+ *
+ * Set up a symlink from /sys/.../dev/name to /sys/class/gpio/gpioN
+ * node. Caller is responsible for unlinking.
+ *
+ * Returns zero on success, else an error.
+ */
+int gpio_export_link(struct device *dev, const char *name, unsigned gpio)
+{
+	struct gpio_desc	*desc;
+	int			status = -EINVAL;
+
+	if (!gpio_is_valid(gpio))
+		goto done;
+
+	mutex_lock(&sysfs_lock);
+
+	desc = &gpio_desc[gpio];
+
+	if (test_bit(FLAG_EXPORT, &desc->flags)) {
+		struct device *tdev;
+
+		tdev = class_find_device(&gpio_class, NULL, desc, match_export);
+		if (tdev != NULL) {
+			status = sysfs_create_link(&dev->kobj, &tdev->kobj,
+						name);
+		} else {
+			status = -ENODEV;
+		}
+	}
+
+	mutex_unlock(&sysfs_lock);
+
+done:
+	if (status)
+		pr_debug("%s: gpio%d status %d\n", __func__, gpio, status);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_export_link);
+
 /**
  * gpio_unexport - reverse effect of gpio_export()
  * @gpio: gpio to make unavailable
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index d6c379dc64fa..9cca3785cab8 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -141,6 +141,8 @@ extern int __gpio_to_irq(unsigned gpio);
  * but more typically is configured entirely from userspace.
  */
 extern int gpio_export(unsigned gpio, bool direction_may_change);
+extern int gpio_export_link(struct device *dev, const char *name,
+			unsigned gpio);
 extern void gpio_unexport(unsigned gpio);
 
 #endif	/* CONFIG_GPIO_SYSFS */
@@ -185,6 +187,12 @@ static inline int gpio_export(unsigned gpio, bool direction_may_change)
 	return -ENOSYS;
 }
 
+static inline int gpio_export_link(struct device *dev, const char *name,
+				unsigned gpio)
+{
+	return -ENOSYS;
+}
+
 static inline void gpio_unexport(unsigned gpio)
 {
 }
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index e10c49a5b96e..059bd189d35d 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -12,6 +12,8 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 
+struct device;
+
 /*
  * Some platforms don't support the GPIO programming interface.
  *
@@ -89,6 +91,15 @@ static inline int gpio_export(unsigned gpio, bool direction_may_change)
 	return -EINVAL;
 }
 
+static inline int gpio_export_link(struct device *dev, const char *name,
+				unsigned gpio)
+{
+	/* GPIO can never have been exported */
+	WARN_ON(1);
+	return -EINVAL;
+}
+
+
 static inline void gpio_unexport(unsigned gpio)
 {
 	/* GPIO can never have been exported */
-- 
cgit v1.2.3-71-gd317


From 1e5db00687c1ebd93a902caf1d3694209013cb3e Mon Sep 17 00:00:00 2001
From: Richard Röjfors <richard.rojfors.ext@mocean-labs.com>
Date: Tue, 22 Sep 2009 16:46:34 -0700
Subject: gpio: add MC33880 driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A GPIO driver for the Freescale MC33880 High/Low side switch

Signed-off-by: Richard Röjfors <richard.rojfors.ext@mocean-labs.com>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig        |   7 ++
 drivers/gpio/Makefile       |   1 +
 drivers/gpio/mc33880.c      | 196 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/mc33880.h |  10 +++
 4 files changed, 214 insertions(+)
 create mode 100644 drivers/gpio/mc33880.c
 create mode 100644 include/linux/spi/mc33880.h

(limited to 'include/linux')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 6b4c484a699a..223e7c92fd54 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -195,4 +195,11 @@ config GPIO_MCP23S08
 	  SPI driver for Microchip MCP23S08 I/O expander.  This provides
 	  a GPIO interface supporting inputs and outputs.
 
+config GPIO_MC33880
+	tristate "Freescale MC33880 high-side/low-side switch"
+	depends on SPI_MASTER
+	help
+	  SPI driver for Freescale MC33880 high-side/low-side switch.
+	  This provides GPIO interface supporting inputs and outputs.
+
 endif
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index ea7c745f26a8..f35de16c7c49 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_GPIOLIB)		+= gpiolib.o
 
 obj-$(CONFIG_GPIO_MAX7301)	+= max7301.o
 obj-$(CONFIG_GPIO_MAX732X)	+= max732x.o
+obj-$(CONFIG_GPIO_MC33880)	+= mc33880.o
 obj-$(CONFIG_GPIO_MCP23S08)	+= mcp23s08.o
 obj-$(CONFIG_GPIO_PCA953X)	+= pca953x.o
 obj-$(CONFIG_GPIO_PCF857X)	+= pcf857x.o
diff --git a/drivers/gpio/mc33880.c b/drivers/gpio/mc33880.c
new file mode 100644
index 000000000000..e7d01bd8fdb3
--- /dev/null
+++ b/drivers/gpio/mc33880.c
@@ -0,0 +1,196 @@
+/*
+ * mc33880.c MC33880 high-side/low-side switch GPIO driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Freescale MC33880 high-side/low-side switch
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/mc33880.h>
+#include <linux/gpio.h>
+
+#define DRIVER_NAME "mc33880"
+
+/*
+ * Pin configurations, see MAX7301 datasheet page 6
+ */
+#define PIN_CONFIG_MASK 0x03
+#define PIN_CONFIG_IN_PULLUP 0x03
+#define PIN_CONFIG_IN_WO_PULLUP 0x02
+#define PIN_CONFIG_OUT 0x01
+
+#define PIN_NUMBER 8
+
+
+/*
+ * Some registers must be read back to modify.
+ * To save time we cache them here in memory
+ */
+struct mc33880 {
+	struct mutex	lock;	/* protect from simultanous accesses */
+	u8		port_config;
+	struct gpio_chip chip;
+	struct spi_device *spi;
+};
+
+static int mc33880_write_config(struct mc33880 *mc)
+{
+	return spi_write(mc->spi, &mc->port_config, sizeof(mc->port_config));
+}
+
+
+static int __mc33880_set(struct mc33880 *mc, unsigned offset, int value)
+{
+	if (value)
+		mc->port_config |= 1 << offset;
+	else
+		mc->port_config &= ~(1 << offset);
+
+	return mc33880_write_config(mc);
+}
+
+
+static void mc33880_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct mc33880 *mc = container_of(chip, struct mc33880, chip);
+
+	mutex_lock(&mc->lock);
+
+	__mc33880_set(mc, offset, value);
+
+	mutex_unlock(&mc->lock);
+}
+
+static int __devinit mc33880_probe(struct spi_device *spi)
+{
+	struct mc33880 *mc;
+	struct mc33880_platform_data *pdata;
+	int ret;
+
+	pdata = spi->dev.platform_data;
+	if (!pdata || !pdata->base) {
+		dev_dbg(&spi->dev, "incorrect or missing platform data\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * bits_per_word cannot be configured in platform data
+	 */
+	spi->bits_per_word = 8;
+
+	ret = spi_setup(spi);
+	if (ret < 0)
+		return ret;
+
+	mc = kzalloc(sizeof(struct mc33880), GFP_KERNEL);
+	if (!mc)
+		return -ENOMEM;
+
+	mutex_init(&mc->lock);
+
+	dev_set_drvdata(&spi->dev, mc);
+
+	mc->spi = spi;
+
+	mc->chip.label = DRIVER_NAME,
+	mc->chip.set = mc33880_set;
+	mc->chip.base = pdata->base;
+	mc->chip.ngpio = PIN_NUMBER;
+	mc->chip.can_sleep = 1;
+	mc->chip.dev = &spi->dev;
+	mc->chip.owner = THIS_MODULE;
+
+	mc->port_config = 0x00;
+	/* write twice, because during initialisation the first setting
+	 * is just for testing SPI communication, and the second is the
+	 * "real" configuration
+	 */
+	ret = mc33880_write_config(mc);
+	mc->port_config = 0x00;
+	if (!ret)
+		ret = mc33880_write_config(mc);
+
+	if (ret) {
+		printk(KERN_ERR "Failed writing to " DRIVER_NAME ": %d\n", ret);
+		goto exit_destroy;
+	}
+
+	ret = gpiochip_add(&mc->chip);
+	if (ret)
+		goto exit_destroy;
+
+	return ret;
+
+exit_destroy:
+	dev_set_drvdata(&spi->dev, NULL);
+	mutex_destroy(&mc->lock);
+	kfree(mc);
+	return ret;
+}
+
+static int mc33880_remove(struct spi_device *spi)
+{
+	struct mc33880 *mc;
+	int ret;
+
+	mc = dev_get_drvdata(&spi->dev);
+	if (mc == NULL)
+		return -ENODEV;
+
+	dev_set_drvdata(&spi->dev, NULL);
+
+	ret = gpiochip_remove(&mc->chip);
+	if (!ret) {
+		mutex_destroy(&mc->lock);
+		kfree(mc);
+	} else
+		dev_err(&spi->dev, "Failed to remove the GPIO controller: %d\n",
+			ret);
+
+	return ret;
+}
+
+static struct spi_driver mc33880_driver = {
+	.driver = {
+		.name		= DRIVER_NAME,
+		.owner		= THIS_MODULE,
+	},
+	.probe		= mc33880_probe,
+	.remove		= __devexit_p(mc33880_remove),
+};
+
+static int __init mc33880_init(void)
+{
+	return spi_register_driver(&mc33880_driver);
+}
+/* register after spi postcore initcall and before
+ * subsys initcalls that may rely on these GPIOs
+ */
+subsys_initcall(mc33880_init);
+
+static void __exit mc33880_exit(void)
+{
+	spi_unregister_driver(&mc33880_driver);
+}
+module_exit(mc33880_exit);
+
+MODULE_AUTHOR("Mocean Laboratories <info@mocean-labs.com>");
+MODULE_LICENSE("GPL v2");
+
diff --git a/include/linux/spi/mc33880.h b/include/linux/spi/mc33880.h
new file mode 100644
index 000000000000..82ffccd6fbe5
--- /dev/null
+++ b/include/linux/spi/mc33880.h
@@ -0,0 +1,10 @@
+#ifndef LINUX_SPI_MC33880_H
+#define LINUX_SPI_MC33880_H
+
+struct mc33880_platform_data {
+	/* number assigned to the first GPIO */
+	unsigned	base;
+};
+
+#endif
+
-- 
cgit v1.2.3-71-gd317


From 4cf8e53b3b55fa2f9b2a6b9c3e557b649adf7c6a Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut@gmail.com>
Date: Tue, 22 Sep 2009 16:46:35 -0700
Subject: mfd/gpio: add a GPIO interface to the UCB1400 MFD chip driver via
 gpiolib

Cc: Eric Miao <eric.y.miao@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Brownell <david-b@pacbell.net>
Cc: Samuel Ortiz <sameo@openedhand.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig        |  12 +++++
 drivers/gpio/Makefile       |   1 +
 drivers/gpio/ucb1400_gpio.c | 125 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/mfd/ucb1400_core.c  |  31 +++++++++--
 include/linux/ucb1400.h     |  19 +++++++
 5 files changed, 184 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpio/ucb1400_gpio.c

(limited to 'include/linux')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 223e7c92fd54..ccca08e0b595 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -202,4 +202,16 @@ config GPIO_MC33880
 	  SPI driver for Freescale MC33880 high-side/low-side switch.
 	  This provides GPIO interface supporting inputs and outputs.
 
+comment "AC97 GPIO expanders:"
+
+config GPIO_UCB1400
+	bool "Philips UCB1400 GPIO"
+	depends on UCB1400_CORE
+	help
+	  This enables support for the Philips UCB1400 GPIO pins.
+	  The UCB1400 is an AC97 audio codec.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ucb1400_gpio.
+
 endif
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index f35de16c7c49..c1ac034698c5 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_GPIO_PCA953X)	+= pca953x.o
 obj-$(CONFIG_GPIO_PCF857X)	+= pcf857x.o
 obj-$(CONFIG_GPIO_PL061)	+= pl061.o
 obj-$(CONFIG_GPIO_TWL4030)	+= twl4030-gpio.o
+obj-$(CONFIG_GPIO_UCB1400)	+= ucb1400_gpio.o
 obj-$(CONFIG_GPIO_XILINX)	+= xilinx_gpio.o
 obj-$(CONFIG_GPIO_BT8XX)	+= bt8xxgpio.o
 obj-$(CONFIG_GPIO_VR41XX)	+= vr41xx_giu.o
diff --git a/drivers/gpio/ucb1400_gpio.c b/drivers/gpio/ucb1400_gpio.c
new file mode 100644
index 000000000000..50e6bd1392ce
--- /dev/null
+++ b/drivers/gpio/ucb1400_gpio.c
@@ -0,0 +1,125 @@
+/*
+ * Philips UCB1400 GPIO driver
+ *
+ * Author: Marek Vasut <marek.vasut@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/ucb1400.h>
+
+struct ucb1400_gpio_data *ucbdata;
+
+static int ucb1400_gpio_dir_in(struct gpio_chip *gc, unsigned off)
+{
+	struct ucb1400_gpio *gpio;
+	gpio = container_of(gc, struct ucb1400_gpio, gc);
+	ucb1400_gpio_set_direction(gpio->ac97, off, 0);
+	return 0;
+}
+
+static int ucb1400_gpio_dir_out(struct gpio_chip *gc, unsigned off, int val)
+{
+	struct ucb1400_gpio *gpio;
+	gpio = container_of(gc, struct ucb1400_gpio, gc);
+	ucb1400_gpio_set_direction(gpio->ac97, off, 1);
+	ucb1400_gpio_set_value(gpio->ac97, off, val);
+	return 0;
+}
+
+static int ucb1400_gpio_get(struct gpio_chip *gc, unsigned off)
+{
+	struct ucb1400_gpio *gpio;
+	gpio = container_of(gc, struct ucb1400_gpio, gc);
+	return ucb1400_gpio_get_value(gpio->ac97, off);
+}
+
+static void ucb1400_gpio_set(struct gpio_chip *gc, unsigned off, int val)
+{
+	struct ucb1400_gpio *gpio;
+	gpio = container_of(gc, struct ucb1400_gpio, gc);
+	ucb1400_gpio_set_value(gpio->ac97, off, val);
+}
+
+static int ucb1400_gpio_probe(struct platform_device *dev)
+{
+	struct ucb1400_gpio *ucb = dev->dev.platform_data;
+	int err = 0;
+
+	if (!(ucbdata && ucbdata->gpio_offset)) {
+		err = -EINVAL;
+		goto err;
+	}
+
+	platform_set_drvdata(dev, ucb);
+
+	ucb->gc.label = "ucb1400_gpio";
+	ucb->gc.base = ucbdata->gpio_offset;
+	ucb->gc.ngpio = 10;
+	ucb->gc.owner = THIS_MODULE;
+
+	ucb->gc.direction_input = ucb1400_gpio_dir_in;
+	ucb->gc.direction_output = ucb1400_gpio_dir_out;
+	ucb->gc.get = ucb1400_gpio_get;
+	ucb->gc.set = ucb1400_gpio_set;
+	ucb->gc.can_sleep = 1;
+
+	err = gpiochip_add(&ucb->gc);
+	if (err)
+		goto err;
+
+	if (ucbdata && ucbdata->gpio_setup)
+		err = ucbdata->gpio_setup(&dev->dev, ucb->gc.ngpio);
+
+err:
+	return err;
+
+}
+
+static int ucb1400_gpio_remove(struct platform_device *dev)
+{
+	int err = 0;
+	struct ucb1400_gpio *ucb = platform_get_drvdata(dev);
+
+	if (ucbdata && ucbdata->gpio_teardown) {
+		err = ucbdata->gpio_teardown(&dev->dev, ucb->gc.ngpio);
+		if (err)
+			return err;
+	}
+
+	err = gpiochip_remove(&ucb->gc);
+	return err;
+}
+
+static struct platform_driver ucb1400_gpio_driver = {
+	.probe	= ucb1400_gpio_probe,
+	.remove	= ucb1400_gpio_remove,
+	.driver	= {
+		.name	= "ucb1400_gpio"
+	},
+};
+
+static int __init ucb1400_gpio_init(void)
+{
+	return platform_driver_register(&ucb1400_gpio_driver);
+}
+
+static void __exit ucb1400_gpio_exit(void)
+{
+	platform_driver_unregister(&ucb1400_gpio_driver);
+}
+
+void __init ucb1400_gpio_set_data(struct ucb1400_gpio_data *data)
+{
+	ucbdata = data;
+}
+
+module_init(ucb1400_gpio_init);
+module_exit(ucb1400_gpio_exit);
+
+MODULE_DESCRIPTION("Philips UCB1400 GPIO driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/ucb1400_core.c b/drivers/mfd/ucb1400_core.c
index 78c2135c5de6..2afc08006e6d 100644
--- a/drivers/mfd/ucb1400_core.c
+++ b/drivers/mfd/ucb1400_core.c
@@ -48,9 +48,11 @@ static int ucb1400_core_probe(struct device *dev)
 	int err;
 	struct ucb1400 *ucb;
 	struct ucb1400_ts ucb_ts;
+	struct ucb1400_gpio ucb_gpio;
 	struct snd_ac97 *ac97;
 
 	memset(&ucb_ts, 0, sizeof(ucb_ts));
+	memset(&ucb_gpio, 0, sizeof(ucb_gpio));
 
 	ucb = kzalloc(sizeof(struct ucb1400), GFP_KERNEL);
 	if (!ucb) {
@@ -68,25 +70,44 @@ static int ucb1400_core_probe(struct device *dev)
 		goto err0;
 	}
 
+	/* GPIO */
+	ucb_gpio.ac97 = ac97;
+	ucb->ucb1400_gpio = platform_device_alloc("ucb1400_gpio", -1);
+	if (!ucb->ucb1400_gpio) {
+		err = -ENOMEM;
+		goto err0;
+	}
+	err = platform_device_add_data(ucb->ucb1400_gpio, &ucb_gpio,
+					sizeof(ucb_gpio));
+	if (err)
+		goto err1;
+	err = platform_device_add(ucb->ucb1400_gpio);
+	if (err)
+		goto err1;
+
 	/* TOUCHSCREEN */
 	ucb_ts.ac97 = ac97;
 	ucb->ucb1400_ts = platform_device_alloc("ucb1400_ts", -1);
 	if (!ucb->ucb1400_ts) {
 		err = -ENOMEM;
-		goto err0;
+		goto err2;
 	}
 	err = platform_device_add_data(ucb->ucb1400_ts, &ucb_ts,
 					sizeof(ucb_ts));
 	if (err)
-		goto err1;
+		goto err3;
 	err = platform_device_add(ucb->ucb1400_ts);
 	if (err)
-		goto err1;
+		goto err3;
 
 	return 0;
 
-err1:
+err3:
 	platform_device_put(ucb->ucb1400_ts);
+err2:
+	platform_device_unregister(ucb->ucb1400_gpio);
+err1:
+	platform_device_put(ucb->ucb1400_gpio);
 err0:
 	kfree(ucb);
 err:
@@ -98,6 +119,8 @@ static int ucb1400_core_remove(struct device *dev)
 	struct ucb1400 *ucb = dev_get_drvdata(dev);
 
 	platform_device_unregister(ucb->ucb1400_ts);
+	platform_device_unregister(ucb->ucb1400_gpio);
+
 	kfree(ucb);
 	return 0;
 }
diff --git a/include/linux/ucb1400.h b/include/linux/ucb1400.h
index ae779bb8cc0f..adb44066680c 100644
--- a/include/linux/ucb1400.h
+++ b/include/linux/ucb1400.h
@@ -26,6 +26,7 @@
 #include <sound/ac97_codec.h>
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
+#include <linux/gpio.h>
 
 /*
  * UCB1400 AC-link registers
@@ -82,6 +83,17 @@
 #define UCB_ID			0x7e
 #define UCB_ID_1400             0x4304
 
+struct ucb1400_gpio_data {
+	int gpio_offset;
+	int (*gpio_setup)(struct device *dev, int ngpio);
+	int (*gpio_teardown)(struct device *dev, int ngpio);
+};
+
+struct ucb1400_gpio {
+	struct gpio_chip	gc;
+	struct snd_ac97		*ac97;
+};
+
 struct ucb1400_ts {
 	struct input_dev	*ts_idev;
 	struct task_struct	*ts_task;
@@ -95,6 +107,7 @@ struct ucb1400_ts {
 
 struct ucb1400 {
 	struct platform_device	*ucb1400_ts;
+	struct platform_device	*ucb1400_gpio;
 };
 
 static inline u16 ucb1400_reg_read(struct snd_ac97 *ac97, u16 reg)
@@ -147,4 +160,10 @@ static inline void ucb1400_adc_disable(struct snd_ac97 *ac97)
 unsigned int ucb1400_adc_read(struct snd_ac97 *ac97, u16 adc_channel,
 			      int adcsync);
 
+#ifdef CONFIG_GPIO_UCB1400
+void __init ucb1400_gpio_set_data(struct ucb1400_gpio_data *data);
+#else
+static inline void ucb1400_gpio_set_data(struct ucb1400_gpio_data *data) {}
+#endif
+
 #endif
-- 
cgit v1.2.3-71-gd317


From 4fefce9abaeef0d6ec45e06a882db23a65135272 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 22 Sep 2009 16:48:02 -0700
Subject: jbd.h: bitfields should be unsigned

bitfields should be unsigned.

This fixes sparse noise:
  error: dubious one-bit signed bitfield

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Cc: Jan Kara <jack@ucw.cz>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/jbd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index a1187a0c99b4..331530cd3cc6 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -556,7 +556,7 @@ struct transaction_s
 	 * This transaction is being forced and some process is
 	 * waiting for it to finish.
 	 */
-	int t_synchronous_commit:1;
+	unsigned int t_synchronous_commit:1;
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From 4fd8da8d62416d0dae05603ab5990a498d9aeb12 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 23 Sep 2009 17:49:55 +0200
Subject: fs: change sys_truncate length parameter type

For this system call user space passes a signed long length parameter,
while the kernel side takes an unsigned long parameter and converts it
later to signed long again.

This has led to bugs in compat wrappers see e.g.  dd90bbd5 "powerpc: Add
compat_sys_truncate".  The s390 compat wrapper for this functions is
broken as well since it also performs zero extension instead of sign
extension for the length parameter.

In addition if hpa comes up with an automated way of generating
compat wrappers it would generate a wrong one here.

So change the length parameter from unsigned long to long.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/open.c                | 5 ++---
 include/linux/syscalls.h | 3 +--
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 31191bf513e4..4f01e06227c6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -290,10 +290,9 @@ out:
 	return error;
 }
 
-SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length)
+SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 {
-	/* on 32-bit boxen it will cut the range 2^31--2^32-1 off */
-	return do_sys_truncate(path, (long)length);
+	return do_sys_truncate(path, length);
 }
 
 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 8d8285a10db9..a990ace1a838 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -460,8 +460,7 @@ asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name,
 				void __user *data);
 asmlinkage long sys_umount(char __user *name, int flags);
 asmlinkage long sys_oldumount(char __user *name);
-asmlinkage long sys_truncate(const char __user *path,
-				unsigned long length);
+asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 asmlinkage long sys_stat(char __user *filename,
 			struct __old_kernel_stat __user *statbuf);
-- 
cgit v1.2.3-71-gd317


From 74908a0009eb36054190ab80deb9671014efed96 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 17 Sep 2009 17:47:12 -0700
Subject: include/linux/cred.h: fix build

mips allmodconfig:

include/linux/cred.h: In function `creds_are_invalid':
include/linux/cred.h:187: error: `PAGE_SIZE' undeclared (first use in this function)
include/linux/cred.h:187: error: (Each undeclared identifier is reported only once
include/linux/cred.h:187: error: for each function it appears in.)

Fixes

commit b6dff3ec5e116e3af6f537d4caedcad6b9e5082a
Author:     David Howells <dhowells@redhat.com>
AuthorDate: Fri Nov 14 10:39:16 2008 +1100
Commit:     James Morris <jmorris@namei.org>
CommitDate: Fri Nov 14 10:39:16 2008 +1100

    CRED: Separate task security context from task_struct

I think.

It's way too large to be inlined anyway.

Dunno if this needs an EXPORT_SYMBOL() yet.

Cc: David Howells <dhowells@redhat.com>
Cc: James Morris <jmorris@namei.org>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/cred.h | 18 +-----------------
 kernel/cred.c        | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index fb371601a3b4..4e3387a89cb9 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -176,23 +176,7 @@ extern void __invalid_creds(const struct cred *, const char *, unsigned);
 extern void __validate_process_creds(struct task_struct *,
 				     const char *, unsigned);
 
-static inline bool creds_are_invalid(const struct cred *cred)
-{
-	if (cred->magic != CRED_MAGIC)
-		return true;
-	if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
-		return true;
-#ifdef CONFIG_SECURITY_SELINUX
-	if (selinux_is_enabled()) {
-		if ((unsigned long) cred->security < PAGE_SIZE)
-			return true;
-		if ((*(u32 *)cred->security & 0xffffff00) ==
-		    (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
-			return true;
-	}
-#endif
-	return false;
-}
+extern bool creds_are_invalid(const struct cred *cred);
 
 static inline void __validate_creds(const struct cred *cred,
 				    const char *file, unsigned line)
diff --git a/kernel/cred.c b/kernel/cred.c
index d7f7a01082eb..70bda79fae24 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -782,6 +782,24 @@ EXPORT_SYMBOL(set_create_files_as);
 
 #ifdef CONFIG_DEBUG_CREDENTIALS
 
+bool creds_are_invalid(const struct cred *cred)
+{
+	if (cred->magic != CRED_MAGIC)
+		return true;
+	if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
+		return true;
+#ifdef CONFIG_SECURITY_SELINUX
+	if (selinux_is_enabled()) {
+		if ((unsigned long) cred->security < PAGE_SIZE)
+			return true;
+		if ((*(u32 *)cred->security & 0xffffff00) ==
+		    (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
+			return true;
+	}
+#endif
+	return false;
+}
+
 /*
  * dump invalid credentials
  */
-- 
cgit v1.2.3-71-gd317


From 1b9894f342a39601bb0420b7b8c7e445670c1b51 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 21 Sep 2009 11:12:03 -0700
Subject: serial core: fix new kernel-doc warnings

Fix new kernel-doc warnings in serial_core.[hc] files.

  Warning(include/linux/serial_core.h:485): No description found for parameter 'uport'
  Warning(include/linux/serial_core.h:485): Excess function parameter 'port' description in 'uart_handle_dcd_change'
  Warning(include/linux/serial_core.h:511): No description found for parameter 'uport'
  Warning(include/linux/serial_core.h:511): Excess function parameter 'port' description in 'uart_handle_cts_change'
  Warning(drivers/serial/serial_core.c:2437): No description found for parameter 'uport'
  Warning(drivers/serial/serial_core.c:2437): Excess function parameter 'port' description in 'uart_add_one_port'
  Warning(drivers/serial/serial_core.c:2509): No description found for parameter 'uport'
  Warning(drivers/serial/serial_core.c:2509): Excess function parameter 'port' description in 'uart_remove_one_port'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/serial_core.c | 4 ++--
 include/linux/serial_core.h  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 2514d00c0f6f..1689bda1d13b 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -2426,7 +2426,7 @@ struct tty_driver *uart_console_device(struct console *co, int *index)
 /**
  *	uart_add_one_port - attach a driver-defined port structure
  *	@drv: pointer to the uart low level driver structure for this port
- *	@port: uart port structure to use for this port.
+ *	@uport: uart port structure to use for this port.
  *
  *	This allows the driver to register its own uart_port structure
  *	with the core driver.  The main purpose is to allow the low
@@ -2499,7 +2499,7 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport)
 /**
  *	uart_remove_one_port - detach a driver defined port structure
  *	@drv: pointer to the uart low level driver structure for this port
- *	@port: uart port structure for this port
+ *	@uport: uart port structure for this port
  *
  *	This unhooks (and hangs up) the specified port structure from the
  *	core driver.  No further calls will be made to the low-level code
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index d58e460844dd..fe661afe0713 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -477,7 +477,7 @@ static inline int uart_handle_break(struct uart_port *port)
 
 /**
  *	uart_handle_dcd_change - handle a change of carrier detect state
- *	@port: uart_port structure for the open port
+ *	@uport: uart_port structure for the open port
  *	@status: new carrier detect status, nonzero if active
  */
 static inline void
@@ -503,7 +503,7 @@ uart_handle_dcd_change(struct uart_port *uport, unsigned int status)
 
 /**
  *	uart_handle_cts_change - handle a change of clear-to-send state
- *	@port: uart_port structure for the open port
+ *	@uport: uart_port structure for the open port
  *	@status: new clear to send status, nonzero if active
  */
 static inline void
-- 
cgit v1.2.3-71-gd317


From 97363c6a4f93a20380b4a9e11f35e27fed68a517 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 23 Sep 2009 14:36:38 -0400
Subject: sunrpc: xdr_xcode_hyper helpers cannot presume 64-bit alignment

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 7da466ba4b0d..f5cc0898bc53 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -11,6 +11,7 @@
 
 #include <linux/uio.h>
 #include <asm/byteorder.h>
+#include <asm/unaligned.h>
 #include <linux/scatterlist.h>
 
 /*
@@ -117,14 +118,14 @@ static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int le
 static inline __be32 *
 xdr_encode_hyper(__be32 *p, __u64 val)
 {
-	*(__be64 *)p = cpu_to_be64(val);
+	put_unaligned_be64(val, p);
 	return p + 2;
 }
 
 static inline __be32 *
 xdr_decode_hyper(__be32 *p, __u64 *valp)
 {
-	*valp = be64_to_cpup((__be64 *)p);
+	*valp = get_unaligned_be64(p);
 	return p + 2;
 }
 
-- 
cgit v1.2.3-71-gd317


From a0219d948dd712561817b0d7c95fd2f10b698203 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:35 -0600
Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR

(Thanks to Al Viro for reminding me of this, via Ingo)

CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so:

	#define CPU_MASK_ALL (cpumask_t) { { ... } }

Taking the address of such a temporary is questionable at best,
unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added
CPU_MASK_ALL_PTR:

	#define CPU_MASK_ALL_PTR (&CPU_MASK_ALL)

Which formalizes this practice.  One day gcc could bite us over this
usage (though we seem to have gotten away with it so far).

Now all callers are removed, we kill it.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Mike Travis <travis@sgi.com>
---
 include/linux/cpumask.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 9b1d458aac6e..c0ab3588129d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -324,8 +324,6 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
 } }
 
-#define CPU_MASK_ALL_PTR	(&CPU_MASK_ALL)
-
 #else
 
 #define CPU_MASK_ALL							\
@@ -336,7 +334,6 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 
 /* cpu_mask_all is in init/main.c */
 extern cpumask_t cpu_mask_all;
-#define CPU_MASK_ALL_PTR	(&cpu_mask_all)
 
 #endif
 
-- 
cgit v1.2.3-71-gd317


From 72d78d05cbaa69f2a32f5f9d65a4551ba0da571f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:36 -0600
Subject: cpumask: remove unused cpu_mask_all

It's only defined for NR_CPUS > BITS_PER_LONG; cpu_all_mask is always
defined (and const).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/cpumask.h | 3 ---
 init/main.c             | 5 -----
 2 files changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c0ab3588129d..dbb8367ecf56 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -332,9 +332,6 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
 } }
 
-/* cpu_mask_all is in init/main.c */
-extern cpumask_t cpu_mask_all;
-
 #endif
 
 #define CPU_MASK_NONE							\
diff --git a/init/main.c b/init/main.c
index 6107223124e4..51695cee1864 100644
--- a/init/main.c
+++ b/init/main.c
@@ -360,11 +360,6 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 
 #else
 
-#if NR_CPUS > BITS_PER_LONG
-cpumask_t cpu_mask_all __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_mask_all);
-#endif
-
 /* Setup number of possible processor ids */
 int nr_cpu_ids __read_mostly = NR_CPUS;
 EXPORT_SYMBOL(nr_cpu_ids);
-- 
cgit v1.2.3-71-gd317


From ef79f8e191722dbc1fc33bdfc448f572266c37e9 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:37 -0600
Subject: cpumask: remove unused mask field from struct irqaction.

Up until 1.1.83, the primitive human tribes used struct sigaction for
interrupts.  The sa_mask field was overloaded to hold a pointer to the
name.

When someone created the new "struct irqaction" they carried across
the "mask" field as a kind of ancestor worship: the fact that it was
unused makes clear its spiritual significance.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/interrupt.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 8e9e151f811e..894ed7180bff 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -97,7 +97,6 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
 struct irqaction {
 	irq_handler_t handler;
 	unsigned long flags;
-	cpumask_t mask;
 	const char *name;
 	void *dev_id;
 	struct irqaction *next;
-- 
cgit v1.2.3-71-gd317


From 144e2ce6115c0a1ee4cb5c935360ea4e2966b0ce Mon Sep 17 00:00:00 2001
From: Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
Date: Mon, 15 Jun 2009 12:16:54 +0900
Subject: cpumask: Remove mask field from comments

By 7be23e278f, mask field was deleted by irqaction. However, it was not
deleted from comment.

Signed-off-by: Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
CC: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/interrupt.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 894ed7180bff..b78cf8194957 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -84,7 +84,6 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
  * struct irqaction - per interrupt action descriptor
  * @handler:	interrupt handler function
  * @flags:	flags (see IRQF_* above)
- * @mask:	no comment as it is useless and about to be removed
  * @name:	name of the device
  * @dev_id:	cookie to identify the device
  * @next:	pointer to the next irqaction for shared interrupts
-- 
cgit v1.2.3-71-gd317


From e0ad955680878998ff7dc51ce06ddad12260423a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:38 -0600
Subject: cpumask: don't define set_cpus_allowed() if CONFIG_CPUMASK_OFFSTACK=y

You're not supposed to pass cpumasks on the stack in that case.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/sched.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbf2a3b46280..848d1f20086e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1817,10 +1817,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 	return 0;
 }
 #endif
+
+#ifndef CONFIG_CPUMASK_OFFSTACK
 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 {
 	return set_cpus_allowed_ptr(p, &new_mask);
 }
+#endif
 
 /*
  * Architectures can set this to 1 if they have specified
-- 
cgit v1.2.3-71-gd317


From fe71a3c7dc8cfe0f239c04b4fc6501f4aa56aa0a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:40 -0600
Subject: cpumask: remove the deprecated smp_call_function_mask()

Everyone is now using smp_call_function_many().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/smp.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9e3d8af09207..39c64bae776d 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -73,15 +73,6 @@ int smp_call_function(void(*func)(void *info), void *info, int wait);
 void smp_call_function_many(const struct cpumask *mask,
 			    void (*func)(void *info), void *info, bool wait);
 
-/* Deprecated: Use smp_call_function_many which takes a pointer to the mask. */
-static inline int
-smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info,
-		       int wait)
-{
-	smp_call_function_many(&mask, func, info, wait);
-	return 0;
-}
-
 void __smp_call_function_single(int cpuid, struct call_single_data *data,
 				int wait);
 
@@ -144,8 +135,6 @@ static inline int up_smp_call_function(void (*func)(void *), void *info)
 static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
-#define smp_call_function_mask(mask, func, info, wait) \
-			(up_smp_call_function(func, info))
 #define smp_call_function_many(mask, func, info, wait) \
 			(up_smp_call_function(func, info))
 static inline void init_call_single_data(void)
-- 
cgit v1.2.3-71-gd317


From 6f401420e2822c24c36e6e1c657f6e7f7f777a93 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:40 -0600
Subject: cpumask: remove obsolete topology_core_siblings and
 topology_thread_siblings: core

There were replaced by topology_core_cpumask and topology_thread_cpumask.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/topology.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 809b26c07090..fc0bf3edeb67 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -211,12 +211,6 @@ int arch_update_cpu_topology(void);
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
-#ifndef topology_thread_siblings
-#define topology_thread_siblings(cpu)		cpumask_of_cpu(cpu)
-#endif
-#ifndef topology_core_siblings
-#define topology_core_siblings(cpu)		cpumask_of_cpu(cpu)
-#endif
 #ifndef topology_thread_cpumask
 #define topology_thread_cpumask(cpu)		cpumask_of(cpu)
 #endif
-- 
cgit v1.2.3-71-gd317


From 4b805b17382c11a8b1c9bb8053ce9d1dcde0701a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:52 -0600
Subject: cpumask: remove unused deprecated functions, avoid accusations of
 insanity

We're not forcing removal of the old cpu_ functions, but we might as
well delete the now-unused ones.

Especially CPUMASK_ALLOC and friends.  I actually got a phone call (!)
from a hacker who thought I had introduced them as the new cpumask
API.  He seemed bewildered that I had lost all taste.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: benh@kernel.crashing.org
---
 include/linux/cpumask.h | 112 +-----------------------------------------------
 1 file changed, 1 insertion(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index dbb8367ecf56..e162d13c65ab 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -15,10 +15,6 @@
  * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c.
  * For details of cpulist_scnprintf() and cpulist_parse(), see
  * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
- * For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c
- * For details of cpus_remap(), see bitmap_remap in lib/bitmap.c.
- * For details of cpus_onto(), see bitmap_onto in lib/bitmap.c.
- * For details of cpus_fold(), see bitmap_fold in lib/bitmap.c.
  *
  * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
  * Note: The alternate operations with the suffix "_nr" are used
@@ -47,22 +43,17 @@
  * void cpus_or(dst, src1, src2)	dst = src1 | src2  [union]
  * void cpus_xor(dst, src1, src2)	dst = src1 ^ src2
  * int cpus_andnot(dst, src1, src2)	dst = src1 & ~src2
- * void cpus_complement(dst, src)	dst = ~src
  *
  * int cpus_equal(mask1, mask2)		Does mask1 == mask2?
  * int cpus_intersects(mask1, mask2)	Do mask1 and mask2 intersect?
  * int cpus_subset(mask1, mask2)	Is mask1 a subset of mask2?
  * int cpus_empty(mask)			Is mask empty (no bits sets)?
- * int cpus_full(mask)			Is mask full (all bits sets)?
  * int cpus_weight(mask)		Hamming weigh - number of set bits
- * int cpus_weight_nr(mask)		Same using nr_cpu_ids instead of NR_CPUS
  *
- * void cpus_shift_right(dst, src, n)	Shift right
  * void cpus_shift_left(dst, src, n)	Shift left
  *
  * int first_cpu(mask)			Number lowest set bit, or NR_CPUS
  * int next_cpu(cpu, mask)		Next cpu past 'cpu', or NR_CPUS
- * int next_cpu_nr(cpu, mask)		Next cpu past 'cpu', or nr_cpu_ids
  *
  * cpumask_t cpumask_of_cpu(cpu)	Return cpumask with bit 'cpu' set
  *					(can be used as an lvalue)
@@ -70,45 +61,10 @@
  * CPU_MASK_NONE			Initializer - no bits set
  * unsigned long *cpus_addr(mask)	Array of unsigned long's in mask
  *
- * CPUMASK_ALLOC kmalloc's a structure that is a composite of many cpumask_t
- * variables, and CPUMASK_PTR provides pointers to each field.
- *
- * The structure should be defined something like this:
- * struct my_cpumasks {
- *	cpumask_t mask1;
- *	cpumask_t mask2;
- * };
- *
- * Usage is then:
- *	CPUMASK_ALLOC(my_cpumasks);
- *	CPUMASK_PTR(mask1, my_cpumasks);
- *	CPUMASK_PTR(mask2, my_cpumasks);
- *
- *	--- DO NOT reference cpumask_t pointers until this check ---
- *	if (my_cpumasks == NULL)
- *		"kmalloc failed"...
- *
- * References are now pointers to the cpumask_t variables (*mask1, ...)
- *
- *if NR_CPUS > BITS_PER_LONG
- *   CPUMASK_ALLOC(m)			Declares and allocates struct m *m =
- *						kmalloc(sizeof(*m), GFP_KERNEL)
- *   CPUMASK_FREE(m)			Macro for kfree(m)
- *else
- *   CPUMASK_ALLOC(m)			Declares struct m _m, *m = &_m
- *   CPUMASK_FREE(m)			Nop
- *endif
- *   CPUMASK_PTR(v, m)			Declares cpumask_t *v = &(m->v)
- * ------------------------------------------------------------------------
- *
  * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing
  * int cpumask_parse_user(ubuf, ulen, mask)	Parse ascii string as cpumask
  * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
  * int cpulist_parse(buf, map)		Parse ascii string as cpulist
- * int cpu_remap(oldbit, old, new)	newbit = map(old, new)(oldbit)
- * void cpus_remap(dst, src, old, new)	*dst = map(old, new)(src)
- * void cpus_onto(dst, orig, relmap)	*dst = orig relative to relmap
- * void cpus_fold(dst, orig, sz)	dst bits = orig bits mod sz
  *
  * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask using NR_CPUS
  * for_each_cpu_mask_nr(cpu, mask)	for-loop cpu over mask using nr_cpu_ids
@@ -142,7 +98,6 @@
 #include <linux/bitmap.h>
 
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
-extern cpumask_t _unused_cpumask_arg_;
 
 #ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
@@ -207,13 +162,6 @@ static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
 	return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
 
-#define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS)
-static inline void __cpus_complement(cpumask_t *dstp,
-					const cpumask_t *srcp, int nbits)
-{
-	bitmap_complement(dstp->bits, srcp->bits, nbits);
-}
-
 #define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
 static inline int __cpus_equal(const cpumask_t *src1p,
 					const cpumask_t *src2p, int nbits)
@@ -241,26 +189,12 @@ static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
 	return bitmap_empty(srcp->bits, nbits);
 }
 
-#define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS)
-static inline int __cpus_full(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_full(srcp->bits, nbits);
-}
-
 #define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
 static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
 {
 	return bitmap_weight(srcp->bits, nbits);
 }
 
-#define cpus_shift_right(dst, src, n) \
-			__cpus_shift_right(&(dst), &(src), (n), NR_CPUS)
-static inline void __cpus_shift_right(cpumask_t *dstp,
-					const cpumask_t *srcp, int n, int nbits)
-{
-	bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
-}
-
 #define cpus_shift_left(dst, src, n) \
 			__cpus_shift_left(&(dst), &(src), (n), NR_CPUS)
 static inline void __cpus_shift_left(cpumask_t *dstp,
@@ -346,46 +280,6 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 
 #define cpus_addr(src) ((src).bits)
 
-#if NR_CPUS > BITS_PER_LONG
-#define	CPUMASK_ALLOC(m)	struct m *m = kmalloc(sizeof(*m), GFP_KERNEL)
-#define	CPUMASK_FREE(m)		kfree(m)
-#else
-#define	CPUMASK_ALLOC(m)	struct m _m, *m = &_m
-#define	CPUMASK_FREE(m)
-#endif
-#define	CPUMASK_PTR(v, m) 	cpumask_t *v = &(m->v)
-
-#define cpu_remap(oldbit, old, new) \
-		__cpu_remap((oldbit), &(old), &(new), NR_CPUS)
-static inline int __cpu_remap(int oldbit,
-		const cpumask_t *oldp, const cpumask_t *newp, int nbits)
-{
-	return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits);
-}
-
-#define cpus_remap(dst, src, old, new) \
-		__cpus_remap(&(dst), &(src), &(old), &(new), NR_CPUS)
-static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
-		const cpumask_t *oldp, const cpumask_t *newp, int nbits)
-{
-	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
-}
-
-#define cpus_onto(dst, orig, relmap) \
-		__cpus_onto(&(dst), &(orig), &(relmap), NR_CPUS)
-static inline void __cpus_onto(cpumask_t *dstp, const cpumask_t *origp,
-		const cpumask_t *relmapp, int nbits)
-{
-	bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
-}
-
-#define cpus_fold(dst, orig, sz) \
-		__cpus_fold(&(dst), &(orig), sz, NR_CPUS)
-static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
-		int sz, int nbits)
-{
-	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
-}
 #endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 #if NR_CPUS == 1
@@ -419,18 +313,14 @@ int __any_online_cpu(const cpumask_t *mask);
 #ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #if NR_CPUS <= 64
 
-#define next_cpu_nr(n, src)		next_cpu(n, src)
-#define cpus_weight_nr(cpumask)		cpus_weight(cpumask)
 #define for_each_cpu_mask_nr(cpu, mask)	for_each_cpu_mask(cpu, mask)
 
 #else /* NR_CPUS > 64 */
 
 int __next_cpu_nr(int n, const cpumask_t *srcp);
-#define next_cpu_nr(n, src)	__next_cpu_nr((n), &(src))
-#define cpus_weight_nr(cpumask)	__cpus_weight(&(cpumask), nr_cpu_ids)
 #define for_each_cpu_mask_nr(cpu, mask)			\
 	for ((cpu) = -1;				\
-		(cpu) = next_cpu_nr((cpu), (mask)),	\
+		(cpu) = __next_cpu_nr((cpu), &(mask)),	\
 		(cpu) < nr_cpu_ids; )
 
 #endif /* NR_CPUS > 64 */
-- 
cgit v1.2.3-71-gd317


From 6ba2ef7baac23a5d9bb85e28b882d16b439a2293 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 24 Sep 2009 09:34:53 -0600
Subject: cpumask: Move deprecated functions to end of header.

The new ones have pretty kerneldoc.  Move the old ones to the end to
avoid confusing people.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: benh@kernel.crashing.org
---
 include/linux/cpumask.h | 593 ++++++++++++++++++++----------------------------
 1 file changed, 252 insertions(+), 341 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index e162d13c65ab..789cf5f920ce 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -3,328 +3,37 @@
 
 /*
  * Cpumasks provide a bitmap suitable for representing the
- * set of CPU's in a system, one bit position per CPU number.
- *
- * The new cpumask_ ops take a "struct cpumask *"; the old ones
- * use cpumask_t.
- *
- * See detailed comments in the file linux/bitmap.h describing the
- * data type on which these cpumasks are based.
- *
- * For details of cpumask_scnprintf() and cpumask_parse_user(),
- * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c.
- * For details of cpulist_scnprintf() and cpulist_parse(), see
- * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
- *
- * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
- * Note: The alternate operations with the suffix "_nr" are used
- *       to limit the range of the loop to nr_cpu_ids instead of
- *       NR_CPUS when NR_CPUS > 64 for performance reasons.
- *       If NR_CPUS is <= 64 then most assembler bitmask
- *       operators execute faster with a constant range, so
- *       the operator will continue to use NR_CPUS.
- *
- *       Another consideration is that nr_cpu_ids is initialized
- *       to NR_CPUS and isn't lowered until the possible cpus are
- *       discovered (including any disabled cpus).  So early uses
- *       will span the entire range of NR_CPUS.
- * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
- *
- * The obsolescent cpumask operations are:
- *
- * void cpu_set(cpu, mask)		turn on bit 'cpu' in mask
- * void cpu_clear(cpu, mask)		turn off bit 'cpu' in mask
- * void cpus_setall(mask)		set all bits
- * void cpus_clear(mask)		clear all bits
- * int cpu_isset(cpu, mask)		true iff bit 'cpu' set in mask
- * int cpu_test_and_set(cpu, mask)	test and set bit 'cpu' in mask
- *
- * int cpus_and(dst, src1, src2)	dst = src1 & src2  [intersection]
- * void cpus_or(dst, src1, src2)	dst = src1 | src2  [union]
- * void cpus_xor(dst, src1, src2)	dst = src1 ^ src2
- * int cpus_andnot(dst, src1, src2)	dst = src1 & ~src2
- *
- * int cpus_equal(mask1, mask2)		Does mask1 == mask2?
- * int cpus_intersects(mask1, mask2)	Do mask1 and mask2 intersect?
- * int cpus_subset(mask1, mask2)	Is mask1 a subset of mask2?
- * int cpus_empty(mask)			Is mask empty (no bits sets)?
- * int cpus_weight(mask)		Hamming weigh - number of set bits
- *
- * void cpus_shift_left(dst, src, n)	Shift left
- *
- * int first_cpu(mask)			Number lowest set bit, or NR_CPUS
- * int next_cpu(cpu, mask)		Next cpu past 'cpu', or NR_CPUS
- *
- * cpumask_t cpumask_of_cpu(cpu)	Return cpumask with bit 'cpu' set
- *					(can be used as an lvalue)
- * CPU_MASK_ALL				Initializer - all bits set
- * CPU_MASK_NONE			Initializer - no bits set
- * unsigned long *cpus_addr(mask)	Array of unsigned long's in mask
- *
- * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing
- * int cpumask_parse_user(ubuf, ulen, mask)	Parse ascii string as cpumask
- * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
- * int cpulist_parse(buf, map)		Parse ascii string as cpulist
- *
- * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask using NR_CPUS
- * for_each_cpu_mask_nr(cpu, mask)	for-loop cpu over mask using nr_cpu_ids
- *
- * int num_online_cpus()		Number of online CPUs
- * int num_possible_cpus()		Number of all possible CPUs
- * int num_present_cpus()		Number of present CPUs
- *
- * int cpu_online(cpu)			Is some cpu online?
- * int cpu_possible(cpu)		Is some cpu possible?
- * int cpu_present(cpu)			Is some cpu present (can schedule)?
- *
- * int any_online_cpu(mask)		First online cpu in mask
- *
- * for_each_possible_cpu(cpu)		for-loop cpu over cpu_possible_map
- * for_each_online_cpu(cpu)		for-loop cpu over cpu_online_map
- * for_each_present_cpu(cpu)		for-loop cpu over cpu_present_map
- *
- * Subtlety:
- * 1) The 'type-checked' form of cpu_isset() causes gcc (3.3.2, anyway)
- *    to generate slightly worse code.  Note for example the additional
- *    40 lines of assembly code compiling the "for each possible cpu"
- *    loops buried in the disk_stat_read() macros calls when compiling
- *    drivers/block/genhd.c (arch i386, CONFIG_SMP=y).  So use a simple
- *    one-line #define for cpu_isset(), instead of wrapping an inline
- *    inside a macro, the way we do the other calls.
+ * set of CPU's in a system, one bit position per CPU number.  In general,
+ * only nr_cpu_ids (<= NR_CPUS) bits are valid.
  */
-
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/bitmap.h>
 
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
 
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
-static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
-{
-	set_bit(cpu, dstp->bits);
-}
-
-#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst))
-static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
-{
-	clear_bit(cpu, dstp->bits);
-}
-
-#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
-static inline void __cpus_setall(cpumask_t *dstp, int nbits)
-{
-	bitmap_fill(dstp->bits, nbits);
-}
-
-#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
-static inline void __cpus_clear(cpumask_t *dstp, int nbits)
-{
-	bitmap_zero(dstp->bits, nbits);
-}
-
-/* No static inline type checking - see Subtlety (1) above. */
-#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits)
-
-#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask))
-static inline int __cpu_test_and_set(int cpu, cpumask_t *addr)
-{
-	return test_and_set_bit(cpu, addr->bits);
-}
-
-#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
-static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_andnot(dst, src1, src2) \
-				__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
-static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_equal(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_equal(src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_intersects(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_subset(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_subset(src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
-static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_empty(srcp->bits, nbits);
-}
-
-#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
-static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_weight(srcp->bits, nbits);
-}
-
-#define cpus_shift_left(dst, src, n) \
-			__cpus_shift_left(&(dst), &(src), (n), NR_CPUS)
-static inline void __cpus_shift_left(cpumask_t *dstp,
-					const cpumask_t *srcp, int n, int nbits)
-{
-	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
-}
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
-
 /**
- * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
- * @bitmap: the bitmap
+ * cpumask_bits - get the bits in a cpumask
+ * @maskp: the struct cpumask *
  *
- * There are a few places where cpumask_var_t isn't appropriate and
- * static cpumasks must be used (eg. very early boot), yet we don't
- * expose the definition of 'struct cpumask'.
- *
- * This does the conversion, and can be used as a constant initializer.
+ * You should only assume nr_cpu_ids bits of this mask are valid.  This is
+ * a macro so it's const-correct.
  */
-#define to_cpumask(bitmap)						\
-	((struct cpumask *)(1 ? (bitmap)				\
-			    : (void *)sizeof(__check_is_bitmap(bitmap))))
-
-static inline int __check_is_bitmap(const unsigned long *bitmap)
-{
-	return 1;
-}
-
-/*
- * Special-case data structure for "single bit set only" constant CPU masks.
- *
- * We pre-generate all the 64 (or 32) possible bit positions, with enough
- * padding to the left and the right, and return the constant pointer
- * appropriately offset.
- */
-extern const unsigned long
-	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
-
-static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
-{
-	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
-	p -= cpu / BITS_PER_LONG;
-	return to_cpumask(p);
-}
-
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-/*
- * In cases where we take the address of the cpumask immediately,
- * gcc optimizes it out (it's a constant) and there's no huge stack
- * variable created:
- */
-#define cpumask_of_cpu(cpu) (*get_cpu_mask(cpu))
-
-
-#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
-
-#if NR_CPUS <= BITS_PER_LONG
-
-#define CPU_MASK_ALL							\
-(cpumask_t) { {								\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
-} }
-
-#else
-
-#define CPU_MASK_ALL							\
-(cpumask_t) { {								\
-	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,			\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
-} }
-
-#endif
-
-#define CPU_MASK_NONE							\
-(cpumask_t) { {								\
-	[0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL				\
-} }
-
-#define CPU_MASK_CPU0							\
-(cpumask_t) { {								\
-	[0] =  1UL							\
-} }
-
-#define cpus_addr(src) ((src).bits)
-
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
+#define cpumask_bits(maskp) ((maskp)->bits)
 
 #if NR_CPUS == 1
-
 #define nr_cpu_ids		1
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-#define first_cpu(src)		({ (void)(src); 0; })
-#define next_cpu(n, src)	({ (void)(src); 1; })
-#define any_online_cpu(mask)	0
-#define for_each_cpu_mask(cpu, mask)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
-#else /* NR_CPUS > 1 */
-
+#else
 extern int nr_cpu_ids;
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-int __first_cpu(const cpumask_t *srcp);
-int __next_cpu(int n, const cpumask_t *srcp);
-int __any_online_cpu(const cpumask_t *mask);
-
-#define first_cpu(src)		__first_cpu(&(src))
-#define next_cpu(n, src)	__next_cpu((n), &(src))
-#define any_online_cpu(mask) __any_online_cpu(&(mask))
-#define for_each_cpu_mask(cpu, mask)			\
-	for ((cpu) = -1;				\
-		(cpu) = next_cpu((cpu), (mask)),	\
-		(cpu) < NR_CPUS; )
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 #endif
 
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-#if NR_CPUS <= 64
-
-#define for_each_cpu_mask_nr(cpu, mask)	for_each_cpu_mask(cpu, mask)
-
-#else /* NR_CPUS > 64 */
-
-int __next_cpu_nr(int n, const cpumask_t *srcp);
-#define for_each_cpu_mask_nr(cpu, mask)			\
-	for ((cpu) = -1;				\
-		(cpu) = __next_cpu_nr((cpu), &(mask)),	\
-		(cpu) < nr_cpu_ids; )
-
-#endif /* NR_CPUS > 64 */
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
+ * not all bits may be allocated. */
+#define nr_cpumask_bits	nr_cpu_ids
+#else
+#define nr_cpumask_bits	NR_CPUS
+#endif
 
 /*
  * The following particular system cpumasks and operations manage
@@ -371,12 +80,6 @@ extern const struct cpumask *const cpu_online_mask;
 extern const struct cpumask *const cpu_present_mask;
 extern const struct cpumask *const cpu_active_mask;
 
-/* These strip const, as traditionally they weren't const. */
-#define cpu_possible_map	(*(cpumask_t *)cpu_possible_mask)
-#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
-#define cpu_present_map		(*(cpumask_t *)cpu_present_mask)
-#define cpu_active_map		(*(cpumask_t *)cpu_active_mask)
-
 #if NR_CPUS > 1
 #define num_online_cpus()	cpumask_weight(cpu_online_mask)
 #define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
@@ -395,35 +98,6 @@ extern const struct cpumask *const cpu_active_mask;
 #define cpu_active(cpu)		((cpu) == 0)
 #endif
 
-#define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
-
-/* These are the new versions of the cpumask operators: passed by pointer.
- * The older versions will be implemented in terms of these, then deleted. */
-#define cpumask_bits(maskp) ((maskp)->bits)
-
-#if NR_CPUS <= BITS_PER_LONG
-#define CPU_BITS_ALL						\
-{								\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD	\
-}
-
-#else /* NR_CPUS > BITS_PER_LONG */
-
-#define CPU_BITS_ALL						\
-{								\
-	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,		\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD		\
-}
-#endif /* NR_CPUS > BITS_PER_LONG */
-
-#ifdef CONFIG_CPUMASK_OFFSTACK
-/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
- * not all bits may be allocated. */
-#define nr_cpumask_bits	nr_cpu_ids
-#else
-#define nr_cpumask_bits	NR_CPUS
-#endif
-
 /* verify cpu argument to cpumask_* operators */
 static inline unsigned int cpumask_check(unsigned int cpu)
 {
@@ -984,4 +658,241 @@ void set_cpu_active(unsigned int cpu, bool active);
 void init_cpu_present(const struct cpumask *src);
 void init_cpu_possible(const struct cpumask *src);
 void init_cpu_online(const struct cpumask *src);
+
+/**
+ * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
+ * @bitmap: the bitmap
+ *
+ * There are a few places where cpumask_var_t isn't appropriate and
+ * static cpumasks must be used (eg. very early boot), yet we don't
+ * expose the definition of 'struct cpumask'.
+ *
+ * This does the conversion, and can be used as a constant initializer.
+ */
+#define to_cpumask(bitmap)						\
+	((struct cpumask *)(1 ? (bitmap)				\
+			    : (void *)sizeof(__check_is_bitmap(bitmap))))
+
+static inline int __check_is_bitmap(const unsigned long *bitmap)
+{
+	return 1;
+}
+
+/*
+ * Special-case data structure for "single bit set only" constant CPU masks.
+ *
+ * We pre-generate all the 64 (or 32) possible bit positions, with enough
+ * padding to the left and the right, and return the constant pointer
+ * appropriately offset.
+ */
+extern const unsigned long
+	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
+
+static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
+{
+	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
+	p -= cpu / BITS_PER_LONG;
+	return to_cpumask(p);
+}
+
+#define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
+
+#if NR_CPUS <= BITS_PER_LONG
+#define CPU_BITS_ALL						\
+{								\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD	\
+}
+
+#else /* NR_CPUS > BITS_PER_LONG */
+
+#define CPU_BITS_ALL						\
+{								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,		\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD		\
+}
+#endif /* NR_CPUS > BITS_PER_LONG */
+
+/*
+ *
+ * From here down, all obsolete.  Use cpumask_ variants!
+ *
+ */
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
+/* These strip const, as traditionally they weren't const. */
+#define cpu_possible_map	(*(cpumask_t *)cpu_possible_mask)
+#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
+#define cpu_present_map		(*(cpumask_t *)cpu_present_mask)
+#define cpu_active_map		(*(cpumask_t *)cpu_active_mask)
+
+#define cpumask_of_cpu(cpu) (*get_cpu_mask(cpu))
+
+#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
+
+#if NR_CPUS <= BITS_PER_LONG
+
+#define CPU_MASK_ALL							\
+(cpumask_t) { {								\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
+} }
+
+#else
+
+#define CPU_MASK_ALL							\
+(cpumask_t) { {								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,			\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
+} }
+
+#endif
+
+#define CPU_MASK_NONE							\
+(cpumask_t) { {								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL				\
+} }
+
+#define CPU_MASK_CPU0							\
+(cpumask_t) { {								\
+	[0] =  1UL							\
+} }
+
+#if NR_CPUS == 1
+#define first_cpu(src)		({ (void)(src); 0; })
+#define next_cpu(n, src)	({ (void)(src); 1; })
+#define any_online_cpu(mask)	0
+#define for_each_cpu_mask(cpu, mask)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#else /* NR_CPUS > 1 */
+int __first_cpu(const cpumask_t *srcp);
+int __next_cpu(int n, const cpumask_t *srcp);
+int __any_online_cpu(const cpumask_t *mask);
+
+#define first_cpu(src)		__first_cpu(&(src))
+#define next_cpu(n, src)	__next_cpu((n), &(src))
+#define any_online_cpu(mask) __any_online_cpu(&(mask))
+#define for_each_cpu_mask(cpu, mask)			\
+	for ((cpu) = -1;				\
+		(cpu) = next_cpu((cpu), (mask)),	\
+		(cpu) < NR_CPUS; )
+#endif /* SMP */
+
+#if NR_CPUS <= 64
+
+#define for_each_cpu_mask_nr(cpu, mask)	for_each_cpu_mask(cpu, mask)
+
+#else /* NR_CPUS > 64 */
+
+int __next_cpu_nr(int n, const cpumask_t *srcp);
+#define for_each_cpu_mask_nr(cpu, mask)			\
+	for ((cpu) = -1;				\
+		(cpu) = __next_cpu_nr((cpu), &(mask)),	\
+		(cpu) < nr_cpu_ids; )
+
+#endif /* NR_CPUS > 64 */
+
+#define cpus_addr(src) ((src).bits)
+
+#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
+static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
+{
+	set_bit(cpu, dstp->bits);
+}
+
+#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst))
+static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
+{
+	clear_bit(cpu, dstp->bits);
+}
+
+#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
+static inline void __cpus_setall(cpumask_t *dstp, int nbits)
+{
+	bitmap_fill(dstp->bits, nbits);
+}
+
+#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
+static inline void __cpus_clear(cpumask_t *dstp, int nbits)
+{
+	bitmap_zero(dstp->bits, nbits);
+}
+
+/* No static inline type checking - see Subtlety (1) above. */
+#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits)
+
+#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask))
+static inline int __cpu_test_and_set(int cpu, cpumask_t *addr)
+{
+	return test_and_set_bit(cpu, addr->bits);
+}
+
+#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
+static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_andnot(dst, src1, src2) \
+				__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
+static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_equal(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_equal(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_intersects(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_subset(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_subset(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
+static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
+{
+	return bitmap_empty(srcp->bits, nbits);
+}
+
+#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
+static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
+{
+	return bitmap_weight(srcp->bits, nbits);
+}
+
+#define cpus_shift_left(dst, src, n) \
+			__cpus_shift_left(&(dst), &(src), (n), NR_CPUS)
+static inline void __cpus_shift_left(cpumask_t *dstp,
+					const cpumask_t *srcp, int n, int nbits)
+{
+	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
+}
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
+
 #endif /* __LINUX_CPUMASK_H */
-- 
cgit v1.2.3-71-gd317


From 2bcd57ab61e7cabed626226a3771617981c11ce1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 24 Sep 2009 04:22:25 +0400
Subject: headers: utsname.h redux

* remove asm/atomic.h inclusion from linux/utsname.h --
   not needed after kref conversion
 * remove linux/utsname.h inclusion from files which do not need it

NOTE: it looks like fs/binfmt_elf.c do not need utsname.h, however
due to some personality stuff it _is_ needed -- cowardly leave ELF-related
headers and files alone.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/process.c                     | 1 -
 arch/arm/kernel/sys_arm.c                       | 1 -
 arch/frv/kernel/sys_frv.c                       | 1 -
 arch/h8300/kernel/sys_h8300.c                   | 1 -
 arch/m68k/kernel/sys_m68k.c                     | 1 -
 arch/m68knommu/kernel/sys_m68k.c                | 1 -
 arch/microblaze/kernel/sys_microblaze.c         | 1 -
 arch/mn10300/kernel/sys_mn10300.c               | 1 -
 arch/parisc/kernel/sys_parisc32.c               | 1 -
 arch/powerpc/kernel/setup-common.c              | 1 -
 arch/powerpc/kernel/sys_ppc32.c                 | 1 -
 arch/s390/kernel/compat_linux.c                 | 1 -
 arch/s390/kernel/process.c                      | 1 -
 arch/sh/kernel/sys_sh32.c                       | 1 -
 arch/sh/kernel/sys_sh64.c                       | 1 -
 arch/sparc/kernel/sys_sparc32.c                 | 1 -
 arch/sparc/kernel/systbls.h                     | 3 ++-
 arch/x86/kernel/dumpstack_32.c                  | 1 -
 arch/x86/kernel/dumpstack_64.c                  | 1 -
 arch/x86/kernel/traps.c                         | 1 -
 drivers/media/video/usbvision/usbvision-core.c  | 1 -
 drivers/media/video/usbvision/usbvision-i2c.c   | 1 -
 drivers/media/video/usbvision/usbvision-video.c | 1 -
 drivers/s390/char/zcore.c                       | 1 -
 drivers/usb/gadget/f_loopback.c                 | 1 -
 drivers/usb/gadget/f_obex.c                     | 1 -
 drivers/usb/gadget/f_sourcesink.c               | 1 -
 drivers/usb/gadget/u_audio.c                    | 1 -
 drivers/usb/gadget/u_ether.c                    | 1 -
 fs/gfs2/ops_inode.c                             | 1 -
 fs/lockd/xdr.c                                  | 1 -
 fs/lockd/xdr4.c                                 | 1 -
 fs/nfs/nfs2xdr.c                                | 1 -
 fs/nfs/nfs3proc.c                               | 1 -
 fs/nfs/nfs3xdr.c                                | 1 -
 fs/nfs/nfs4proc.c                               | 1 -
 fs/nfs/nfs4xdr.c                                | 1 -
 fs/nfs/proc.c                                   | 1 -
 fs/nfsd/nfs4idmap.c                             | 1 -
 fs/ocfs2/dlm/dlmast.c                           | 1 -
 fs/ocfs2/dlm/dlmconvert.c                       | 1 -
 fs/ocfs2/dlm/dlmdebug.c                         | 1 -
 fs/ocfs2/dlm/dlmdomain.c                        | 1 -
 fs/ocfs2/dlm/dlmlock.c                          | 1 -
 fs/ocfs2/dlm/dlmmaster.c                        | 1 -
 fs/ocfs2/dlm/dlmrecovery.c                      | 1 -
 fs/ocfs2/dlm/dlmthread.c                        | 1 -
 fs/ocfs2/dlm/dlmunlock.c                        | 1 -
 fs/ocfs2/super.c                                | 1 -
 fs/ocfs2/symlink.c                              | 1 -
 include/linux/utsname.h                         | 1 -
 init/main.c                                     | 1 -
 kernel/power/swap.c                             | 1 -
 kernel/sysctl.c                                 | 1 -
 kernel/uid16.c                                  | 1 -
 net/sunrpc/auth_null.c                          | 1 -
 56 files changed, 2 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 3a2fb7a02db4..289039bb6bb2 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -19,7 +19,6 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/user.h>
-#include <linux/utsname.h>
 #include <linux/time.h>
 #include <linux/major.h>
 #include <linux/stat.h>
diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c
index b3ec641b5cf8..78ecaac65206 100644
--- a/arch/arm/kernel/sys_arm.c
+++ b/arch/arm/kernel/sys_arm.c
@@ -25,7 +25,6 @@
 #include <linux/mman.h>
 #include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/ipc.h>
 #include <linux/uaccess.h>
 
diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c
index baadc97f8627..2b6b5289cdcc 100644
--- a/arch/frv/kernel/sys_frv.c
+++ b/arch/frv/kernel/sys_frv.c
@@ -21,7 +21,6 @@
 #include <linux/stat.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/syscalls.h>
 #include <linux/ipc.h>
 
diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c
index 2745656dcc52..8cb5d73a0e35 100644
--- a/arch/h8300/kernel/sys_h8300.c
+++ b/arch/h8300/kernel/sys_h8300.c
@@ -17,7 +17,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/fs.h>
 #include <linux/ipc.h>
 
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
index 7f54efaf60bb..7deb402bfc75 100644
--- a/arch/m68k/kernel/sys_m68k.c
+++ b/arch/m68k/kernel/sys_m68k.c
@@ -20,7 +20,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/ipc.h>
 
 #include <asm/setup.h>
diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c
index 700281638629..efdd090778a3 100644
--- a/arch/m68knommu/kernel/sys_m68k.c
+++ b/arch/m68knommu/kernel/sys_m68k.c
@@ -17,7 +17,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/ipc.h>
 #include <linux/fs.h>
 
diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c
index b96f1682bb24..07cabed4b947 100644
--- a/arch/microblaze/kernel/sys_microblaze.c
+++ b/arch/microblaze/kernel/sys_microblaze.c
@@ -23,7 +23,6 @@
 #include <linux/mman.h>
 #include <linux/sys.h>
 #include <linux/ipc.h>
-#include <linux/utsname.h>
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/err.h>
diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c
index 3e52a1054327..8ca5af00334c 100644
--- a/arch/mn10300/kernel/sys_mn10300.c
+++ b/arch/mn10300/kernel/sys_mn10300.c
@@ -19,7 +19,6 @@
 #include <linux/stat.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/tty.h>
 
 #include <asm/uaccess.h>
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index 92a0acaa0d12..561388b17c91 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -18,7 +18,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/time.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 1d5570a1e456..74cd1a7d0d4b 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -24,7 +24,6 @@
 #include <linux/seq_file.h>
 #include <linux/ioport.h>
 #include <linux/console.h>
-#include <linux/utsname.h>
 #include <linux/screen_info.h>
 #include <linux/root_dev.h>
 #include <linux/notifier.h>
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index 1cc5e9e5da96..b97c2d67f4ac 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -22,7 +22,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 5519cb745106..0debcec23a39 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -24,7 +24,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 59fe6ecc6ed3..5417eb57271a 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -27,7 +27,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
-#include <linux/utsname.h>
 #include <linux/tick.h>
 #include <linux/elfcore.h>
 #include <linux/kernel_stat.h>
diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c
index 63ba12836eae..eb68bfdd86e6 100644
--- a/arch/sh/kernel/sys_sh32.c
+++ b/arch/sh/kernel/sys_sh32.c
@@ -9,7 +9,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/ipc.h>
diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c
index 91fb8445a5a0..287235768bc5 100644
--- a/arch/sh/kernel/sys_sh64.c
+++ b/arch/sh/kernel/sys_sh64.c
@@ -23,7 +23,6 @@
 #include <linux/stat.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/syscalls.h>
 #include <linux/ipc.h>
 #include <asm/uaccess.h>
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index f5000a460c05..04e28b2671c8 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -16,7 +16,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h
index 15c2d752b2bc..a63c5d2d9849 100644
--- a/arch/sparc/kernel/systbls.h
+++ b/arch/sparc/kernel/systbls.h
@@ -3,10 +3,11 @@
 
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/utsname.h>
 #include <asm/utrap.h>
 #include <asm/signal.h>
 
+struct new_utsname;
+
 extern asmlinkage unsigned long sys_getpagesize(void);
 extern asmlinkage unsigned long sparc_brk(unsigned long brk);
 extern asmlinkage long sparc_pipe(struct pt_regs *regs);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9e..f7dd2a7c3bf4 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -5,7 +5,6 @@
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
-#include <linux/utsname.h>
 #include <linux/hardirq.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a3276766..a071e6be177e 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -5,7 +5,6 @@
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
-#include <linux/utsname.h>
 #include <linux/hardirq.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9346e102338d..a665c71352b8 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -14,7 +14,6 @@
 #include <linux/spinlock.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
-#include <linux/utsname.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/drivers/media/video/usbvision/usbvision-core.c b/drivers/media/video/usbvision/usbvision-core.c
index 6ba16abeebdd..e0f91e4ab653 100644
--- a/drivers/media/video/usbvision/usbvision-core.c
+++ b/drivers/media/video/usbvision/usbvision-core.c
@@ -28,7 +28,6 @@
 #include <linux/timer.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
diff --git a/drivers/media/video/usbvision/usbvision-i2c.c b/drivers/media/video/usbvision/usbvision-i2c.c
index f97fd06d5948..c19f51dba2ee 100644
--- a/drivers/media/video/usbvision/usbvision-i2c.c
+++ b/drivers/media/video/usbvision/usbvision-i2c.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <asm/uaccess.h>
 #include <linux/ioport.h>
diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c
index 90d9b5c0e9a7..a2a50d608a3f 100644
--- a/drivers/media/video/usbvision/usbvision-video.c
+++ b/drivers/media/video/usbvision/usbvision-video.c
@@ -52,7 +52,6 @@
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index c431198bdbc4..82daa3c1dc9c 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -14,7 +14,6 @@
 
 #include <linux/init.h>
 #include <linux/miscdevice.h>
-#include <linux/utsname.h>
 #include <linux/debugfs.h>
 #include <asm/ipl.h>
 #include <asm/sclp.h>
diff --git a/drivers/usb/gadget/f_loopback.c b/drivers/usb/gadget/f_loopback.c
index eb6ddfc20857..6cb29d3df575 100644
--- a/drivers/usb/gadget/f_loopback.c
+++ b/drivers/usb/gadget/f_loopback.c
@@ -22,7 +22,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 
 #include "g_zero.h"
diff --git a/drivers/usb/gadget/f_obex.c b/drivers/usb/gadget/f_obex.c
index 46d6266f30ec..b4a3ba654ea5 100644
--- a/drivers/usb/gadget/f_obex.c
+++ b/drivers/usb/gadget/f_obex.c
@@ -24,7 +24,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 
 #include "u_serial.h"
diff --git a/drivers/usb/gadget/f_sourcesink.c b/drivers/usb/gadget/f_sourcesink.c
index bffe91d525f9..09cba273d2db 100644
--- a/drivers/usb/gadget/f_sourcesink.c
+++ b/drivers/usb/gadget/f_sourcesink.c
@@ -22,7 +22,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 
 #include "g_zero.h"
diff --git a/drivers/usb/gadget/u_audio.c b/drivers/usb/gadget/u_audio.c
index b5200d551458..8252595d619d 100644
--- a/drivers/usb/gadget/u_audio.c
+++ b/drivers/usb/gadget/u_audio.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c
index f8751ff863cd..2fc02bd95848 100644
--- a/drivers/usb/gadget/u_ether.c
+++ b/drivers/usb/gadget/u_ether.c
@@ -23,7 +23,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 #include <linux/ctype.h>
 #include <linux/etherdevice.h>
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index c3ac18054057..247436c10deb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -12,7 +12,6 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
-#include <linux/utsname.h>
 #include <linux/mm.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 0336f2beacde..b583ab0a4cbb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -8,7 +8,6 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/utsname.h>
 #include <linux/nfs.h>
 
 #include <linux/sunrpc/xdr.h>
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index e1d528653192..ad9dbbc9145d 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/utsname.h>
 #include <linux/nfs.h>
 
 #include <linux/sunrpc/xdr.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c862c9340f9a..5e078b222b4e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -13,7 +13,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ee6a13f05443..3f8881d1a050 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -7,7 +7,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 35869a4921f1..5fe5492fbd29 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -10,7 +10,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index be6544aef41f..ed7c269e2514 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -36,7 +36,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cfc30d362f94..83ad47cbdd8a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -39,7 +39,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be72d90d49d..ef583854d8d0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -32,7 +32,6 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index cdfa86fa1471..ba2c199592fd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -38,7 +38,6 @@
 #include <linux/init.h>
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 81eff8e58322..01cf8cc3d286 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf3..ca96bce50e18 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c5c88124096d..ca46002ec10e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,7 +27,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd8..0334000676d3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/delay.h>
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac8..437698e9465f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4dd..83bcaf266b35 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 43e6e3280569..d9fa3d22e17c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 98569e86c613..52ec020ea78b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 756f5b0998e0..00f53b2aea76 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 24feb449a1dc..4cc3c890a2cd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/random.h>
 #include <linux/statfs.h>
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110f..e3421030a69f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,7 +38,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/utsname.h>
 #include <linux/namei.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 3656b300de3a..69f39974c041 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -36,7 +36,6 @@ struct new_utsname {
 #include <linux/kref.h>
 #include <linux/nsproxy.h>
 #include <linux/err.h>
-#include <asm/atomic.h>
 
 struct uts_namespace {
 	struct kref kref;
diff --git a/init/main.c b/init/main.c
index 6107223124e4..76961db298f0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -18,7 +18,6 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
-#include <linux/utsname.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
 #include <linux/smp_lock.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d48..b101cdc4df3f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
 
 #include <linux/module.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
 #include <linux/genhd.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0dfaa47d7cb6..7f4f57bea4ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,7 +26,6 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
-#include <linux/utsname.h>
 #include <linux/kmemcheck.h>
 #include <linux/smp_lock.h>
 #include <linux/fs.h>
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b9..419209893d87 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/mman.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index c70dd7f5258e..1db618f56ecb 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -8,7 +8,6 @@
 
 #include <linux/types.h>
 #include <linux/module.h>
-#include <linux/utsname.h>
 #include <linux/sunrpc/clnt.h>
 
 #ifdef RPC_DEBUG
-- 
cgit v1.2.3-71-gd317


From 22fe404218156328a27e66349b1175cd0baa4990 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 18 Sep 2009 13:05:44 -0700
Subject: vfs: split generic_forget_inode() so that hugetlbfs does not have to
 copy it

Hugetlbfs needs to do special things instead of truncate_inode_pages().
 Currently, it copied generic_forget_inode() except for
truncate_inode_pages() call which is asking for trouble (the code there
isn't trivial).  So create a separate function generic_detach_inode()
which does all the list magic done in generic_forget_inode() and call
it from hugetlbfs_forget_inode().

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hugetlbfs/inode.c | 33 ++++-----------------------------
 fs/inode.c           | 21 +++++++++++++++++++--
 include/linux/fs.h   |  1 +
 3 files changed, 24 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index eba6d552d9c9..478a169a262d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -380,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 
 static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
 {
-	struct super_block *sb = inode->i_sb;
-
-	if (!hlist_unhashed(&inode->i_hash)) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			list_move(&inode->i_list, &inode_unused);
-		inodes_stat.nr_unused++;
-		if (!sb || (sb->s_flags & MS_ACTIVE)) {
-			spin_unlock(&inode_lock);
-			return;
-		}
-		inode->i_state |= I_WILL_FREE;
-		spin_unlock(&inode_lock);
-		/*
-		 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
-		 * in our backing_dev_info.
-		 */
-		write_inode_now(inode, 1);
-		spin_lock(&inode_lock);
-		inode->i_state &= ~I_WILL_FREE;
-		inodes_stat.nr_unused--;
-		hlist_del_init(&inode->i_hash);
+	if (generic_detach_inode(inode)) {
+		truncate_hugepages(inode, 0);
+		clear_inode(inode);
+		destroy_inode(inode);
 	}
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-	truncate_hugepages(inode, 0);
-	clear_inode(inode);
-	destroy_inode(inode);
 }
 
 static void hugetlbfs_drop_inode(struct inode *inode)
diff --git a/fs/inode.c b/fs/inode.c
index 07d775ea6161..fa506d539653 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1241,7 +1241,16 @@ void generic_delete_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(generic_delete_inode);
 
-static void generic_forget_inode(struct inode *inode)
+/**
+ *	generic_detach_inode - remove inode from inode lists
+ *	@inode: inode to remove
+ *
+ *	Remove inode from inode lists, write it if it's dirty. This is just an
+ *	internal VFS helper exported for hugetlbfs. Do not use!
+ *
+ *	Returns 1 if inode should be completely destroyed.
+ */
+int generic_detach_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
@@ -1251,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode)
 		inodes_stat.nr_unused++;
 		if (sb->s_flags & MS_ACTIVE) {
 			spin_unlock(&inode_lock);
-			return;
+			return 0;
 		}
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
@@ -1269,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(generic_detach_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+	if (!generic_detach_inode(inode))
+		return;
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 51803528b095..955e34615cb7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2156,6 +2156,7 @@ extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
 extern void generic_delete_inode(struct inode *inode);
 extern void generic_drop_inode(struct inode *inode);
+extern int generic_detach_inode(struct inode *inode);
 
 extern struct inode *ilookup5_nowait(struct super_block *sb,
 		unsigned long hashval, int (*test)(struct inode *, void *),
-- 
cgit v1.2.3-71-gd317


From 42cb56ae2ab67390da34906b27bedc3f2ff1393b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 18 Sep 2009 13:05:53 -0700
Subject: vfs: change sb->s_maxbytes to a loff_t

sb->s_maxbytes is supposed to indicate the maximum size of a file that can
exist on the filesystem.  It's declared as an unsigned long long.

Even if a filesystem has no inherent limit that prevents it from using
every bit in that unsigned long long, it's still problematic to set it to
anything larger than MAX_LFS_FILESIZE.  There are places in the kernel
that cast s_maxbytes to a signed value.  If it's set too large then this
cast makes it a negative number and generally breaks the comparison.

Change s_maxbytes to be loff_t instead.  That should help eliminate the
temptation to set it too large by making it a signed value.

Also, add a warning for couple of releases to help catch filesystems that
set s_maxbytes too large.  Eventually we can either convert this to a
BUG() or just remove it and in the hope that no one will get it wrong now
that it's a signed value.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Robert Love <rlove@google.com>
Cc: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 10 ++++++++++
 include/linux/fs.h |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 0e7207b9815c..4906e2d8f400 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -892,6 +892,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
  	if (error)
  		goto out_sb;
 
+	/*
+	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
+	 * but s_maxbytes was an unsigned long long for many releases. Throw
+	 * this warning for a little while to try and catch filesystems that
+	 * violate this rule. This warning should be either removed or
+	 * converted to a BUG() in 2.6.34.
+	 */
+	WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+		"negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
+
 	mnt->mnt_mountpoint = mnt->mnt_root;
 	mnt->mnt_parent = mnt;
 	up_write(&mnt->mnt_sb->s_umount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 955e34615cb7..cbb7724c11d3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1315,7 +1315,7 @@ struct super_block {
 	unsigned long		s_blocksize;
 	unsigned char		s_blocksize_bits;
 	unsigned char		s_dirt;
-	unsigned long long	s_maxbytes;	/* Max file size */
+	loff_t			s_maxbytes;	/* Max file size */
 	struct file_system_type	*s_type;
 	const struct super_operations	*s_op;
 	const struct dquot_operations	*dq_op;
-- 
cgit v1.2.3-71-gd317


From f84398068d9c2babe41500504ef247ae07081857 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 21 Sep 2009 14:48:36 +0200
Subject: vfs: seq_file: add helpers for data filling

Add two helpers that allow access to the seq_file's own buffer, but
hide the internal details of seq_files.

This allows easier implementation of special purpose filling
functions.  It also cleans up some existing functions which duplicated
the seq_file logic.

Make these inline functions in seq_file.h, as suggested by Al.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/seq_file.c            | 75 +++++++++++++++++++++++++-----------------------
 include/linux/seq_file.h | 38 ++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 66efd0aa8fb3..eae7d9dbf3ff 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path);
  */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
-	if (m->count < m->size) {
-		char *s = m->buf + m->count;
-		char *p = d_path(path, s, m->size - m->count);
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -1;
+
+	if (size) {
+		char *p = d_path(path, buf, size);
 		if (!IS_ERR(p)) {
-			s = mangle_path(s, p, esc);
-			if (s) {
-				p = m->buf + m->count;
-				m->count = s - m->buf;
-				return s - p;
-			}
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
 		}
 	}
-	m->count = m->size;
-	return -1;
+	seq_commit(m, res);
+
+	return res;
 }
 EXPORT_SYMBOL(seq_path);
 
@@ -454,27 +455,28 @@ EXPORT_SYMBOL(seq_path);
 int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
 		  char *esc)
 {
-	int err = -ENAMETOOLONG;
-	if (m->count < m->size) {
-		char *s = m->buf + m->count;
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -ENAMETOOLONG;
+
+	if (size) {
 		char *p;
 
 		spin_lock(&dcache_lock);
-		p = __d_path(path, root, s, m->size - m->count);
+		p = __d_path(path, root, buf, size);
 		spin_unlock(&dcache_lock);
-		err = PTR_ERR(p);
+		res = PTR_ERR(p);
 		if (!IS_ERR(p)) {
-			s = mangle_path(s, p, esc);
-			if (s) {
-				p = m->buf + m->count;
-				m->count = s - m->buf;
-				return 0;
-			}
-			err = -ENAMETOOLONG;
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
+			else
+				res = -ENAMETOOLONG;
 		}
 	}
-	m->count = m->size;
-	return err;
+	seq_commit(m, res);
+
+	return res < 0 ? res : 0;
 }
 
 /*
@@ -482,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
  */
 int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 {
-	if (m->count < m->size) {
-		char *s = m->buf + m->count;
-		char *p = dentry_path(dentry, s, m->size - m->count);
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -1;
+
+	if (size) {
+		char *p = dentry_path(dentry, buf, size);
 		if (!IS_ERR(p)) {
-			s = mangle_path(s, p, esc);
-			if (s) {
-				p = m->buf + m->count;
-				m->count = s - m->buf;
-				return s - p;
-			}
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
 		}
 	}
-	m->count = m->size;
-	return -1;
+	seq_commit(m, res);
+
+	return res;
 }
 
 int seq_bitmap(struct seq_file *m, const unsigned long *bits,
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 0c6a86b79596..8366d8f12e53 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -35,6 +35,44 @@ struct seq_operations {
 
 #define SEQ_SKIP 1
 
+/**
+ * seq_get_buf - get buffer to write arbitrary data to
+ * @m: the seq_file handle
+ * @bufp: the beginning of the buffer is stored here
+ *
+ * Return the number of bytes available in the buffer, or zero if
+ * there's no space.
+ */
+static inline size_t seq_get_buf(struct seq_file *m, char **bufp)
+{
+	BUG_ON(m->count > m->size);
+	if (m->count < m->size)
+		*bufp = m->buf + m->count;
+	else
+		*bufp = NULL;
+
+	return m->size - m->count;
+}
+
+/**
+ * seq_commit - commit data to the buffer
+ * @m: the seq_file handle
+ * @num: the number of bytes to commit
+ *
+ * Commit @num bytes of data written to a buffer previously acquired
+ * by seq_buf_get.  To signal an error condition, or that the data
+ * didn't fit in the available space, pass a negative @num value.
+ */
+static inline void seq_commit(struct seq_file *m, int num)
+{
+	if (num < 0) {
+		m->count = m->size;
+	} else {
+		BUG_ON(m->count + num > m->size);
+		m->count += num;
+	}
+}
+
 char *mangle_path(char *s, char *p, char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
-- 
cgit v1.2.3-71-gd317


From 4fadd7bb20a1e7c774ed88dc703d8fbcd00ff917 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Aug 2009 23:28:06 +0200
Subject: freeze_bdev: kill bd_mount_sem

Now that we have the freeze count there is not much reason for bd_mount_sem
anymore.  The actual freeze/thaw operations are serialized using the
bd_fsfreeze_mutex, and the only other place we take bd_mount_sem is
get_sb_bdev which tries to prevent mounting a filesystem while the block
device is frozen.  Instead of add a check for bd_fsfreeze_count and
return -EBUSY if a filesystem is frozen.  While that is a change in user
visible behaviour a failing mount is much better for this case rather
than having the mount process stuck uninterruptible for a long time.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     | 8 +-------
 fs/super.c         | 9 +++++++--
 include/linux/fs.h | 1 -
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5d1ed50bd46c..22506eb4a58e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev);
  * freeze_bdev  --  lock a filesystem and force it into a consistent state
  * @bdev:	blockdevice to lock
  *
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
  * The reference counter (bd_fsfreeze_count) guarantees that only the last
@@ -240,7 +238,6 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 	}
 	bdev->bd_fsfreeze_count++;
 
-	down(&bdev->bd_mount_sem);
 	sb = get_super(bdev);
 	if (sb && !(sb->s_flags & MS_RDONLY)) {
 		sb->s_frozen = SB_FREEZE_WRITE;
@@ -260,7 +257,6 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 					"VFS:Filesystem freeze failed\n");
 				sb->s_frozen = SB_UNFROZEN;
 				drop_super(sb);
-				up(&bdev->bd_mount_sem);
 				bdev->bd_fsfreeze_count--;
 				mutex_unlock(&bdev->bd_fsfreeze_mutex);
 				return ERR_PTR(error);
@@ -271,7 +267,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 
-	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
+	return sb;	/* thaw_bdev releases s->s_umount */
 }
 EXPORT_SYMBOL(freeze_bdev);
 
@@ -321,7 +317,6 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 		drop_super(sb);
 	}
 
-	up(&bdev->bd_mount_sem);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return 0;
 }
@@ -430,7 +425,6 @@ static void init_once(void *foo)
 
 	memset(bdev, 0, sizeof(*bdev));
 	mutex_init(&bdev->bd_mutex);
-	sema_init(&bdev->bd_mount_sem, 1);
 	INIT_LIST_HEAD(&bdev->bd_inodes);
 	INIT_LIST_HEAD(&bdev->bd_list);
 #ifdef CONFIG_SYSFS
diff --git a/fs/super.c b/fs/super.c
index 4906e2d8f400..1cb26a3e3df0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -743,9 +743,14 @@ int get_sb_bdev(struct file_system_type *fs_type,
 	 * will protect the lockfs code from trying to start a snapshot
 	 * while we are mounting
 	 */
-	down(&bdev->bd_mount_sem);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		error = -EBUSY;
+		goto error_bdev;
+	}
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
-	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	if (IS_ERR(s))
 		goto error_s;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cbb7724c11d3..72dfbd423974 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -640,7 +640,6 @@ struct block_device {
 	struct super_block *	bd_super;
 	int			bd_openers;
 	struct mutex		bd_mutex;	/* open/close mutex */
-	struct semaphore	bd_mount_sem;
 	struct list_head	bd_inodes;
 	void *			bd_holder;
 	int			bd_holders;
-- 
cgit v1.2.3-71-gd317


From 4504230a71566785a05d3e6b53fa1ee071b864eb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Aug 2009 23:28:35 +0200
Subject: freeze_bdev: grab active reference to frozen superblocks

Currently we held s_umount while a filesystem is frozen, despite that we
might return to userspace and unlock it from a different process.  Instead
grab an active reference to keep the file system busy and add an explicit
check for frozen filesystems in remount and reject the remount instead
of blocking on s_umount.

Add a new get_active_super helper to super.c for use by freeze_bdev that
grabs an active reference to a superblock from a given block device.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     | 132 +++++++++++++++++++++++++++++------------------------
 fs/super.c         |  48 ++++++++++++++++++-
 include/linux/fs.h |   1 +
 3 files changed, 120 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 22506eb4a58e..9cf4b926f8e4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -230,43 +230,54 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 	int error = 0;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		bdev->bd_fsfreeze_count++;
+	if (++bdev->bd_fsfreeze_count > 1) {
+		/*
+		 * We don't even need to grab a reference - the first call
+		 * to freeze_bdev grab an active reference and only the last
+		 * thaw_bdev drops it.
+		 */
 		sb = get_super(bdev);
+		drop_super(sb);
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		return sb;
 	}
-	bdev->bd_fsfreeze_count++;
-
-	sb = get_super(bdev);
-	if (sb && !(sb->s_flags & MS_RDONLY)) {
-		sb->s_frozen = SB_FREEZE_WRITE;
-		smp_wmb();
-
-		sync_filesystem(sb);
-
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
-
-		sync_blockdev(sb->s_bdev);
-
-		if (sb->s_op->freeze_fs) {
-			error = sb->s_op->freeze_fs(sb);
-			if (error) {
-				printk(KERN_ERR
-					"VFS:Filesystem freeze failed\n");
-				sb->s_frozen = SB_UNFROZEN;
-				drop_super(sb);
-				bdev->bd_fsfreeze_count--;
-				mutex_unlock(&bdev->bd_fsfreeze_mutex);
-				return ERR_PTR(error);
-			}
+
+	sb = get_active_super(bdev);
+	if (!sb)
+		goto out;
+	if (sb->s_flags & MS_RDONLY) {
+		deactivate_locked_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+
+	sb->s_frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	sync_filesystem(sb);
+
+	sb->s_frozen = SB_FREEZE_TRANS;
+	smp_wmb();
+
+	sync_blockdev(sb->s_bdev);
+
+	if (sb->s_op->freeze_fs) {
+		error = sb->s_op->freeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem freeze failed\n");
+			sb->s_frozen = SB_UNFROZEN;
+			deactivate_locked_super(sb);
+			bdev->bd_fsfreeze_count--;
+			mutex_unlock(&bdev->bd_fsfreeze_mutex);
+			return ERR_PTR(error);
 		}
 	}
+	up_write(&sb->s_umount);
 
+ out:
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
 	return sb;	/* thaw_bdev releases s->s_umount */
 }
 EXPORT_SYMBOL(freeze_bdev);
@@ -280,43 +291,44 @@ EXPORT_SYMBOL(freeze_bdev);
  */
 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
-	int error = 0;
+	int error = -EINVAL;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (!bdev->bd_fsfreeze_count) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return -EINVAL;
-	}
-
-	bdev->bd_fsfreeze_count--;
-	if (bdev->bd_fsfreeze_count > 0) {
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return 0;
-	}
-
-	if (sb) {
-		BUG_ON(sb->s_bdev != bdev);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			if (sb->s_op->unfreeze_fs) {
-				error = sb->s_op->unfreeze_fs(sb);
-				if (error) {
-					printk(KERN_ERR
-						"VFS:Filesystem thaw failed\n");
-					sb->s_frozen = SB_FREEZE_TRANS;
-					bdev->bd_fsfreeze_count++;
-					mutex_unlock(&bdev->bd_fsfreeze_mutex);
-					return error;
-				}
-			}
-			sb->s_frozen = SB_UNFROZEN;
-			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
+	if (!bdev->bd_fsfreeze_count)
+		goto out_unlock;
+
+	error = 0;
+	if (--bdev->bd_fsfreeze_count > 0)
+		goto out_unlock;
+
+	if (!sb)
+		goto out_unlock;
+
+	BUG_ON(sb->s_bdev != bdev);
+	down_write(&sb->s_umount);
+	if (sb->s_flags & MS_RDONLY)
+		goto out_deactivate;
+
+	if (sb->s_op->unfreeze_fs) {
+		error = sb->s_op->unfreeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem thaw failed\n");
+			sb->s_frozen = SB_FREEZE_TRANS;
+			bdev->bd_fsfreeze_count++;
+			mutex_unlock(&bdev->bd_fsfreeze_mutex);
+			return error;
 		}
-		drop_super(sb);
 	}
 
+	sb->s_frozen = SB_UNFROZEN;
+	smp_wmb();
+	wake_up(&sb->s_wait_unfrozen);
+
+out_deactivate:
+	if (sb)
+		deactivate_locked_super(sb);
+out_unlock:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return 0;
 }
diff --git a/fs/super.c b/fs/super.c
index 1cb26a3e3df0..19eb70b374bc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -465,6 +465,48 @@ rescan:
 }
 
 EXPORT_SYMBOL(get_super);
+
+/**
+ * get_active_super - get an active reference to the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given.  Returns the superblock with an active
+ * reference and s_umount held exclusively or %NULL if none was found.
+ */
+struct super_block *get_active_super(struct block_device *bdev)
+{
+	struct super_block *sb;
+
+	if (!bdev)
+		return NULL;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_bdev != bdev)
+			continue;
+
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_write(&sb->s_umount);
+		if (sb->s_root) {
+			spin_lock(&sb_lock);
+			if (sb->s_count > S_BIAS) {
+				atomic_inc(&sb->s_active);
+				sb->s_count--;
+				spin_unlock(&sb_lock);
+				return sb;
+			}
+			spin_unlock(&sb_lock);
+		}
+		up_write(&sb->s_umount);
+		put_super(sb);
+		yield();
+		spin_lock(&sb_lock);
+	}
+	spin_unlock(&sb_lock);
+	return NULL;
+}
  
 struct super_block * user_get_super(dev_t dev)
 {
@@ -527,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
 	int remount_rw;
-	
+
+	if (sb->s_frozen != SB_UNFROZEN)
+		return -EBUSY;
+
 #ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
 		return -EACCES;
 #endif
+
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 72dfbd423974..502d96ef345d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2334,6 +2334,7 @@ extern void get_filesystem(struct file_system_type *fs);
 extern void put_filesystem(struct file_system_type *fs);
 extern struct file_system_type *get_fs_type(const char *name);
 extern struct super_block *get_super(struct block_device *);
+extern struct super_block *get_active_super(struct block_device *bdev);
 extern struct super_block *user_get_super(dev_t);
 extern void drop_super(struct super_block *sb);
 
-- 
cgit v1.2.3-71-gd317


From 25d9e2d15286281ec834b829a4aaf8969011f1cd Mon Sep 17 00:00:00 2001
From: "npiggin@suse.de" <npiggin@suse.de>
Date: Fri, 21 Aug 2009 02:35:05 +1000
Subject: truncate: new helpers

Introduce new truncate helpers truncate_pagecache and inode_newsize_ok.
vmtruncate is also consolidated from mm/memory.c and mm/nommu.c and
into mm/truncate.c.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/vm/locking |  2 +-
 fs/attr.c                | 46 ++++++++++++++++++++++++++++++++--
 include/linux/fs.h       |  3 ++-
 include/linux/mm.h       |  5 ++--
 mm/filemap.c             |  2 +-
 mm/memory.c              | 62 +++-------------------------------------------
 mm/mremap.c              |  4 +--
 mm/nommu.c               | 40 ------------------------------
 mm/truncate.c            | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 120 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/locking b/Documentation/vm/locking
index f366fa956179..25fadb448760 100644
--- a/Documentation/vm/locking
+++ b/Documentation/vm/locking
@@ -80,7 +80,7 @@ Note: PTL can also be used to guarantee that no new clones using the
 mm start up ... this is a loose form of stability on mm_users. For
 example, it is used in copy_mm to protect against a racing tlb_gather_mmu
 single address space optimization, so that the zap_page_range (from
-vmtruncate) does not lose sending ipi's to cloned threads that might 
+truncate) does not lose sending ipi's to cloned threads that might
 be spawned underneath it and go to user mode to drag in pte's into tlbs.
 
 swap_lock
diff --git a/fs/attr.c b/fs/attr.c
index 9fe1b1bd30a8..96d394bdaddf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,7 +18,7 @@
 /* Taken over from the old code... */
 
 /* POSIX UID/GID verification for setting inode attributes. */
-int inode_change_ok(struct inode *inode, struct iattr *attr)
+int inode_change_ok(const struct inode *inode, struct iattr *attr)
 {
 	int retval = -EPERM;
 	unsigned int ia_valid = attr->ia_valid;
@@ -60,9 +60,51 @@ fine:
 error:
 	return retval;
 }
-
 EXPORT_SYMBOL(inode_change_ok);
 
+/**
+ * inode_newsize_ok - may this inode be truncated to a given size
+ * @inode:	the inode to be truncated
+ * @offset:	the new size to assign to the inode
+ * @Returns:	0 on success, -ve errno on failure
+ *
+ * inode_newsize_ok will check filesystem limits and ulimits to check that the
+ * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
+ * when necessary. Caller must not proceed with inode size change if failure is
+ * returned. @inode must be a file (not directory), with appropriate
+ * permissions to allow truncate (inode_newsize_ok does NOT check these
+ * conditions).
+ *
+ * inode_newsize_ok must be called with i_mutex held.
+ */
+int inode_newsize_ok(const struct inode *inode, loff_t offset)
+{
+	if (inode->i_size < offset) {
+		unsigned long limit;
+
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (limit != RLIM_INFINITY && offset > limit)
+			goto out_sig;
+		if (offset > inode->i_sb->s_maxbytes)
+			goto out_big;
+	} else {
+		/*
+		 * truncation of in-use swapfiles is disallowed - it would
+		 * cause subsequent swapout to scribble on the now-freed
+		 * blocks.
+		 */
+		if (IS_SWAPFILE(inode))
+			return -ETXTBSY;
+	}
+
+	return 0;
+out_sig:
+	send_sig(SIGXFSZ, current, 0);
+out_big:
+	return -EFBIG;
+}
+EXPORT_SYMBOL(inode_newsize_ok);
+
 int inode_setattr(struct inode * inode, struct iattr * attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 502d96ef345d..2b08b5ce09b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2382,7 +2382,8 @@ extern int buffer_migrate_page(struct address_space *,
 #define buffer_migrate_page NULL
 #endif
 
-extern int inode_change_ok(struct inode *, struct iattr *);
+extern int inode_change_ok(const struct inode *, struct iattr *);
+extern int inode_newsize_ok(const struct inode *, loff_t offset);
 extern int __must_check inode_setattr(struct inode *, struct iattr *);
 
 extern void file_update_time(struct file *file);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6eae5e3144b..8347e938fb2f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -791,8 +791,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 	unmap_mapping_range(mapping, holebegin, holelen, 0);
 }
 
-extern int vmtruncate(struct inode * inode, loff_t offset);
-extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
+extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
+extern int vmtruncate(struct inode *inode, loff_t offset);
+extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
 
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/filemap.c b/mm/filemap.c
index bcc7372aebbc..33349adb227a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -58,7 +58,7 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock		(vmtruncate)
+ *  ->i_mmap_lock		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
diff --git a/mm/memory.c b/mm/memory.c
index b1443ac07c00..ebcd3decac89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long addr = vma->vm_start;
 
 		/*
-		 * Hide vma from rmap and vmtruncate before freeing pgtables
+		 * Hide vma from rmap and truncate_pagecache before freeing
+		 * pgtables
 		 */
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
@@ -2407,7 +2408,7 @@ restart:
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file.  This will be rounded down to a PAGE_SIZE
- * boundary.  Note that this is different from vmtruncate(), which
+ * boundary.  Note that this is different from truncate_pagecache(), which
  * must keep the partial page.  In contrast, we must get rid of
  * partial pages.
  * @holelen: size of prospective hole in bytes.  This will be rounded
@@ -2458,63 +2459,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-/**
- * vmtruncate - unmap mappings "freed" by truncate() syscall
- * @inode: inode of the file used
- * @offset: file offset to start truncating
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page.  Ugly, but necessary.
- */
-int vmtruncate(struct inode * inode, loff_t offset)
-{
-	if (inode->i_size < offset) {
-		unsigned long limit;
-
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-		if (limit != RLIM_INFINITY && offset > limit)
-			goto out_sig;
-		if (offset > inode->i_sb->s_maxbytes)
-			goto out_big;
-		i_size_write(inode, offset);
-	} else {
-		struct address_space *mapping = inode->i_mapping;
-
-		/*
-		 * truncation of in-use swapfiles is disallowed - it would
-		 * cause subsequent swapout to scribble on the now-freed
-		 * blocks.
-		 */
-		if (IS_SWAPFILE(inode))
-			return -ETXTBSY;
-		i_size_write(inode, offset);
-
-		/*
-		 * unmap_mapping_range is called twice, first simply for
-		 * efficiency so that truncate_inode_pages does fewer
-		 * single-page unmaps.  However after this first call, and
-		 * before truncate_inode_pages finishes, it is possible for
-		 * private pages to be COWed, which remain after
-		 * truncate_inode_pages finishes, hence the second
-		 * unmap_mapping_range call must be made for correctness.
-		 */
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-		truncate_inode_pages(mapping, offset);
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	}
-
-	if (inode->i_op->truncate)
-		inode->i_op->truncate(inode);
-	return 0;
-
-out_sig:
-	send_sig(SIGXFSZ, current, 0);
-out_big:
-	return -EFBIG;
-}
-EXPORT_SYMBOL(vmtruncate);
-
 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 {
 	struct address_space *mapping = inode->i_mapping;
diff --git a/mm/mremap.c b/mm/mremap.c
index 20a07dba6be0..97bff2547719 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock vmtruncate out,
-		 * since it might clean the dst vma before the src vma,
+		 * moving file-based ptes, we must lock truncate_pagecache
+		 * out, since it might clean the dst vma before the src vma,
 		 * and we propagate stale pages into the dst afterward.
 		 */
 		mapping = vma->vm_file->f_mapping;
diff --git a/mm/nommu.c b/mm/nommu.c
index 8d484241d034..56a446f05971 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -82,46 +82,6 @@ DECLARE_RWSEM(nommu_region_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page.  Ugly, but necessary.
- */
-int vmtruncate(struct inode *inode, loff_t offset)
-{
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long limit;
-
-	if (inode->i_size < offset)
-		goto do_expand;
-	i_size_write(inode, offset);
-
-	truncate_inode_pages(mapping, offset);
-	goto out_truncate;
-
-do_expand:
-	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	if (limit != RLIM_INFINITY && offset > limit)
-		goto out_sig;
-	if (offset > inode->i_sb->s_maxbytes)
-		goto out;
-	i_size_write(inode, offset);
-
-out_truncate:
-	if (inode->i_op->truncate)
-		inode->i_op->truncate(inode);
-	return 0;
-out_sig:
-	send_sig(SIGXFSZ, current, 0);
-out:
-	return -EFBIG;
-}
-
-EXPORT_SYMBOL(vmtruncate);
-
 /*
  * Return the total memory allocated for this pointer, not
  * just what the caller asked for.
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..5900afca0fa9 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -465,3 +465,67 @@ int invalidate_inode_pages2(struct address_space *mapping)
 	return invalidate_inode_pages2_range(mapping, 0, -1);
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
+
+/**
+ * truncate_pagecache - unmap and remove pagecache that has been truncated
+ * @inode: inode
+ * @old: old file offset
+ * @new: new file offset
+ *
+ * inode's new i_size must already be written before truncate_pagecache
+ * is called.
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
+{
+	if (new < old) {
+		struct address_space *mapping = inode->i_mapping;
+
+		/*
+		 * unmap_mapping_range is called twice, first simply for
+		 * efficiency so that truncate_inode_pages does fewer
+		 * single-page unmaps.  However after this first call, and
+		 * before truncate_inode_pages finishes, it is possible for
+		 * private pages to be COWed, which remain after
+		 * truncate_inode_pages finishes, hence the second
+		 * unmap_mapping_range call must be made for correctness.
+		 */
+		unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+		truncate_inode_pages(mapping, new);
+		unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+	}
+}
+EXPORT_SYMBOL(truncate_pagecache);
+
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page.  Ugly, but necessary.
+ */
+int vmtruncate(struct inode *inode, loff_t offset)
+{
+	loff_t oldsize;
+	int error;
+
+	error = inode_newsize_ok(inode, offset);
+	if (error)
+		return error;
+	oldsize = inode->i_size;
+	i_size_write(inode, offset);
+	truncate_pagecache(inode, oldsize, offset);
+	if (inode->i_op->truncate)
+		inode->i_op->truncate(inode);
+
+	return error;
+}
+EXPORT_SYMBOL(vmtruncate);
-- 
cgit v1.2.3-71-gd317


From 57f1f0874f426a9bdfc5cd3f886113dd5cd17834 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 23 Sep 2009 15:56:10 -0700
Subject: time: add function to convert between calendar time and broken-down
 time for universal use

There are many similar code in kernel for one object: convert time between
calendar time and broken-down time.

Here is some source I found:
  fs/ncpfs/dir.c
  fs/smbfs/proc.c
  fs/fat/misc.c
  fs/udf/udftime.c
  fs/cifs/netmisc.c
  net/netfilter/xt_time.c
  drivers/scsi/ips.c
  drivers/input/misc/hp_sdc_rtc.c
  drivers/rtc/rtc-lib.c
  arch/ia64/hp/sim/boot/fw-emu.c
  arch/m68k/mac/misc.c
  arch/powerpc/kernel/time.c
  arch/parisc/include/asm/rtc.h
  ...

We can make a common function for this type of conversion, At least we
can get following benefit:

1: Make kernel simple and unify
2: Easy to fix bug in converting code
3: Reduce clone of code in future
   For example, I'm trying to make ftrace display walltime,
   this patch will make me easy.

This code is based on code from glibc-2.6

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time.h   |  28 +++++++++++
 kernel/time/Makefile   |   2 +-
 kernel/time/timeconv.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 kernel/time/timeconv.c

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 56787c093345..fe04e5ef6a59 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -155,6 +155,34 @@ extern void timekeeping_leap_insert(int leapsecond);
 struct tms;
 extern void do_sys_times(struct tms *);
 
+/*
+ * Similar to the struct tm in userspace <time.h>, but it needs to be here so
+ * that the kernel source is self contained.
+ */
+struct tm {
+	/*
+	 * the number of seconds after the minute, normally in the range
+	 * 0 to 59, but can be up to 60 to allow for leap seconds
+	 */
+	int tm_sec;
+	/* the number of minutes after the hour, in the range 0 to 59*/
+	int tm_min;
+	/* the number of hours past midnight, in the range 0 to 23 */
+	int tm_hour;
+	/* the day of the month, in the range 1 to 31 */
+	int tm_mday;
+	/* the number of months since January, in the range 0 to 11 */
+	int tm_mon;
+	/* the number of years since 1900 */
+	long tm_year;
+	/* the number of days since Sunday, in the range 0 to 6 */
+	int tm_wday;
+	/* the number of days since January 1, in the range 0 to 365 */
+	int tm_yday;
+};
+
+void time_to_tm(time_t totalsecs, int offset, struct tm *result);
+
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
  * @ts:		pointer to the timespec variable to be converted
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+ * This file is part of the GNU C Library.
+ * Contributed by Paul Eggert (eggert@twinsun.com).
+ *
+ * The GNU C Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The GNU C Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the GNU C Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Converts the calendar time to broken-down time representation
+ * Based on code from glibc-2.6
+ *
+ * 2009-7-14:
+ *   Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
+ */
+
+#include <linux/time.h>
+#include <linux/module.h>
+
+/*
+ * Nonzero if YEAR is a leap year (every 4 years,
+ * except every 100th isn't, and every 400th is).
+ */
+static int __isleap(long year)
+{
+	return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
+}
+
+/* do a mathdiv for long type */
+static long math_div(long a, long b)
+{
+	return a / b - (a % b < 0);
+}
+
+/* How many leap years between y1 and y2, y1 must less or equal to y2 */
+static long leaps_between(long y1, long y2)
+{
+	long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
+		+ math_div(y1 - 1, 400);
+	long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
+		+ math_div(y2 - 1, 400);
+	return leaps2 - leaps1;
+}
+
+/* How many days come before each month (0-12). */
+static const unsigned short __mon_yday[2][13] = {
+	/* Normal years. */
+	{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
+	/* Leap years. */
+	{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
+};
+
+#define SECS_PER_HOUR	(60 * 60)
+#define SECS_PER_DAY	(SECS_PER_HOUR * 24)
+
+/**
+ * time_to_tm - converts the calendar time to local broken-down time
+ *
+ * @totalsecs	the number of seconds elapsed since 00:00:00 on January 1, 1970,
+ *		Coordinated Universal Time (UTC).
+ * @offset	offset seconds adding to totalsecs.
+ * @result	pointer to struct tm variable to receive broken-down time
+ */
+void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+{
+	long days, rem, y;
+	const unsigned short *ip;
+
+	days = totalsecs / SECS_PER_DAY;
+	rem = totalsecs % SECS_PER_DAY;
+	rem += offset;
+	while (rem < 0) {
+		rem += SECS_PER_DAY;
+		--days;
+	}
+	while (rem >= SECS_PER_DAY) {
+		rem -= SECS_PER_DAY;
+		++days;
+	}
+
+	result->tm_hour = rem / SECS_PER_HOUR;
+	rem %= SECS_PER_HOUR;
+	result->tm_min = rem / 60;
+	result->tm_sec = rem % 60;
+
+	/* January 1, 1970 was a Thursday. */
+	result->tm_wday = (4 + days) % 7;
+	if (result->tm_wday < 0)
+		result->tm_wday += 7;
+
+	y = 1970;
+
+	while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
+		/* Guess a corrected year, assuming 365 days per year. */
+		long yg = y + math_div(days, 365);
+
+		/* Adjust DAYS and Y to match the guessed year. */
+		days -= (yg - y) * 365 + leaps_between(y, yg);
+		y = yg;
+	}
+
+	result->tm_year = y - 1900;
+
+	result->tm_yday = days;
+
+	ip = __mon_yday[__isleap(y)];
+	for (y = 11; days < ip[y]; y--)
+		continue;
+	days -= ip[y];
+
+	result->tm_mon = y;
+	result->tm_mday = days + 1;
+}
+EXPORT_SYMBOL(time_to_tm);
-- 
cgit v1.2.3-71-gd317


From 55dff4954ebdeba2be59e19398a607d799c5fa9f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 23 Sep 2009 15:56:17 -0700
Subject: docs: fix various Documentation/ paths in header files

Fix various Documentation/ paths in include/linux/.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Reviewed-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/configfs.h   | 4 ++--
 include/linux/debugfs.h    | 2 +-
 include/linux/relay.h      | 2 +-
 include/linux/tracepoint.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 7f627775c947..ddb7a97c78c2 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -27,8 +27,8 @@
  *
  * configfs Copyright (C) 2005 Oracle.  All rights reserved.
  *
- * Please read Documentation/filesystems/configfs.txt before using the
- * configfs interface, ESPECIALLY the parts about reference counts and
+ * Please read Documentation/filesystems/configfs/configfs.txt before using
+ * the configfs interface, ESPECIALLY the parts about reference counts and
  * item destructors.
  */
 
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index eb5c2ba2f81a..fc1b930f246c 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -9,7 +9,7 @@
  *	2 as published by the Free Software Foundation.
  *
  *  debugfs is for people to use instead of /proc or /sys.
- *  See Documentation/DocBook/kernel-api for more details.
+ *  See Documentation/DocBook/filesystems for more details.
  */
 
 #ifndef _DEBUGFS_H_
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 953fc055e875..14a86bc7102b 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -140,7 +140,7 @@ struct rchan_callbacks
 	 * cause relay_open() to create a single global buffer rather
 	 * than the default set of per-cpu buffers.
 	 *
-	 * See Documentation/filesystems/relayfs.txt for more info.
+	 * See Documentation/filesystems/relay.txt for more info.
 	 */
 	struct dentry *(*create_buf_file)(const char *filename,
 					  struct dentry *parent,
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 63a3f7a80580..660a9de96f81 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -4,7 +4,7 @@
 /*
  * Kernel Tracepoint API.
  *
- * See Documentation/tracepoint.txt.
+ * See Documentation/trace/tracepoints.txt.
  *
  * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
  *
-- 
cgit v1.2.3-71-gd317


From 8f3ff20862cfcb85500a2bb55ee64622bd59fd0c Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Wed, 23 Sep 2009 15:56:25 -0700
Subject: cgroups: revert "cgroups: fix pid namespace bug"

The following series adds a "cgroup.procs" file to each cgroup that
reports unique tgids rather than pids, and allows all threads in a
threadgroup to be atomically moved to a new cgroup.

The subsystem "attach" interface is modified to support attaching whole
threadgroups at a time, which could introduce potential problems if any
subsystem were to need to access the old cgroup of every thread being
moved.  The attach interface may need to be revised if this becomes the
case.

Also added is functionality for read/write locking all CLONE_THREAD
fork()ing within a threadgroup, by means of an rwsem that lives in the
sighand_struct, for per-threadgroup-ness and also for sharing a cacheline
with the sighand's atomic count.  This scheme should introduce no extra
overhead in the fork path when there's no contention.

The final patch reveals potential for a race when forking before a
subsystem's attach function is called - one potential solution in case any
subsystem has this problem is to hang on to the group's fork mutex through
the attach() calls, though no subsystem yet demonstrates need for an
extended critical section.

This patch:

Revert

commit 096b7fe012d66ed55e98bc8022405ede0cc80e96
Author:     Li Zefan <lizf@cn.fujitsu.com>
AuthorDate: Wed Jul 29 15:04:04 2009 -0700
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Wed Jul 29 19:10:35 2009 -0700

    cgroups: fix pid namespace bug

This is in preparation for some clashing cgroups changes that subsume the
original commit's functionaliy.

The original commit fixed a pid namespace bug which Ben Blum fixed
independently (in the same way, but with different code) as part of a
series of patches.  I played around with trying to reconcile Ben's patch
series with Li's patch, but concluded that it was simpler to just revert
Li's, given that Ben's patch series contained essentially the same fix.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 11 +++---
 kernel/cgroup.c        | 95 +++++++++++++-------------------------------------
 2 files changed, 31 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 90bba9e62286..c833d6f23672 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -179,11 +179,14 @@ struct cgroup {
 	 */
 	struct list_head release_list;
 
-	/* pids_mutex protects pids_list and cached pid arrays. */
+	/* pids_mutex protects the fields below */
 	struct rw_semaphore pids_mutex;
-
-	/* Linked list of struct cgroup_pids */
-	struct list_head pids_list;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using the current tasks_pids array */
+	int pids_use_count;
+	/* Length of the current tasks_pids array */
+	int pids_length;
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 14efffed72c8..22db0a7cf1fa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1121,7 +1121,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
-	INIT_LIST_HEAD(&cgrp->pids_list);
 	init_rwsem(&cgrp->pids_mutex);
 }
 
@@ -2431,30 +2430,12 @@ err:
 	return ret;
 }
 
-/*
- * Cache pids for all threads in the same pid namespace that are
- * opening the same "tasks" file.
- */
-struct cgroup_pids {
-	/* The node in cgrp->pids_list */
-	struct list_head list;
-	/* The cgroup those pids belong to */
-	struct cgroup *cgrp;
-	/* The namepsace those pids belong to */
-	struct pid_namespace *ns;
-	/* Array of process ids in the cgroup */
-	pid_t *tasks_pids;
-	/* How many files are using the this tasks_pids array */
-	int use_count;
-	/* Length of the current tasks_pids array */
-	int length;
-};
-
 static int cmppid(const void *a, const void *b)
 {
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+
 /*
  * seq_file methods for the "tasks" file. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2469,47 +2450,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	 * after a seek to the start). Use a binary-search to find the
 	 * next pid to display, if any
 	 */
-	struct cgroup_pids *cp = s->private;
-	struct cgroup *cgrp = cp->cgrp;
+	struct cgroup *cgrp = s->private;
 	int index = 0, pid = *pos;
 	int *iter;
 
 	down_read(&cgrp->pids_mutex);
 	if (pid) {
-		int end = cp->length;
+		int end = cgrp->pids_length;
 
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (cp->tasks_pids[mid] == pid) {
+			if (cgrp->tasks_pids[mid] == pid) {
 				index = mid;
 				break;
-			} else if (cp->tasks_pids[mid] <= pid)
+			} else if (cgrp->tasks_pids[mid] <= pid)
 				index = mid + 1;
 			else
 				end = mid;
 		}
 	}
 	/* If we're off the end of the array, we're done */
-	if (index >= cp->length)
+	if (index >= cgrp->pids_length)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
-	iter = cp->tasks_pids + index;
+	iter = cgrp->tasks_pids + index;
 	*pos = *iter;
 	return iter;
 }
 
 static void cgroup_tasks_stop(struct seq_file *s, void *v)
 {
-	struct cgroup_pids *cp = s->private;
-	struct cgroup *cgrp = cp->cgrp;
+	struct cgroup *cgrp = s->private;
 	up_read(&cgrp->pids_mutex);
 }
 
 static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup_pids *cp = s->private;
+	struct cgroup *cgrp = s->private;
 	int *p = v;
-	int *end = cp->tasks_pids + cp->length;
+	int *end = cgrp->tasks_pids + cgrp->pids_length;
 
 	/*
 	 * Advance to the next pid in the array. If this goes off the
@@ -2536,33 +2515,26 @@ static const struct seq_operations cgroup_tasks_seq_operations = {
 	.show = cgroup_tasks_show,
 };
 
-static void release_cgroup_pid_array(struct cgroup_pids *cp)
+static void release_cgroup_pid_array(struct cgroup *cgrp)
 {
-	struct cgroup *cgrp = cp->cgrp;
-
 	down_write(&cgrp->pids_mutex);
-	BUG_ON(!cp->use_count);
-	if (!--cp->use_count) {
-		list_del(&cp->list);
-		put_pid_ns(cp->ns);
-		kfree(cp->tasks_pids);
-		kfree(cp);
+	BUG_ON(!cgrp->pids_use_count);
+	if (!--cgrp->pids_use_count) {
+		kfree(cgrp->tasks_pids);
+		cgrp->tasks_pids = NULL;
+		cgrp->pids_length = 0;
 	}
 	up_write(&cgrp->pids_mutex);
 }
 
 static int cgroup_tasks_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *seq;
-	struct cgroup_pids *cp;
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	seq = file->private_data;
-	cp = seq->private;
-
-	release_cgroup_pid_array(cp);
+	release_cgroup_pid_array(cgrp);
 	return seq_release(inode, file);
 }
 
@@ -2581,8 +2553,6 @@ static struct file_operations cgroup_tasks_operations = {
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct pid_namespace *ns = current->nsproxy->pid_ns;
-	struct cgroup_pids *cp;
 	pid_t *pidarray;
 	int npids;
 	int retval;
@@ -2609,37 +2579,20 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * array if necessary
 	 */
 	down_write(&cgrp->pids_mutex);
-
-	list_for_each_entry(cp, &cgrp->pids_list, list) {
-		if (ns == cp->ns)
-			goto found;
-	}
-
-	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
-	if (!cp) {
-		up_write(&cgrp->pids_mutex);
-		kfree(pidarray);
-		return -ENOMEM;
-	}
-	cp->cgrp = cgrp;
-	cp->ns = ns;
-	get_pid_ns(ns);
-	list_add(&cp->list, &cgrp->pids_list);
-found:
-	kfree(cp->tasks_pids);
-	cp->tasks_pids = pidarray;
-	cp->length = npids;
-	cp->use_count++;
+	kfree(cgrp->tasks_pids);
+	cgrp->tasks_pids = pidarray;
+	cgrp->pids_length = npids;
+	cgrp->pids_use_count++;
 	up_write(&cgrp->pids_mutex);
 
 	file->f_op = &cgroup_tasks_operations;
 
 	retval = seq_open(file, &cgroup_tasks_seq_operations);
 	if (retval) {
-		release_cgroup_pid_array(cp);
+		release_cgroup_pid_array(cgrp);
 		return retval;
 	}
-	((struct seq_file *)file->private_data)->private = cp;
+	((struct seq_file *)file->private_data)->private = cgrp;
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From 102a775e3647628727ae83a9a6abf0564c3ca7cb Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@google.com>
Date: Wed, 23 Sep 2009 15:56:26 -0700
Subject: cgroups: add a read-only "procs" file similar to "tasks" that shows
 only unique tgids

struct cgroup used to have a bunch of fields for keeping track of the
pidlist for the tasks file.  Those are now separated into a new struct
cgroup_pidlist, of which two are had, one for procs and one for tasks.
The way the seq_file operations are set up is changed so that just the
pidlist struct gets passed around as the private data.

Interface example: Suppose a multithreaded process has pid 1000 and other
threads with ids 1001, 1002, 1003:
$ cat tasks
1000
1001
1002
1003
$ cat cgroup.procs
1000
$

Signed-off-by: Ben Blum <bblum@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  22 ++--
 kernel/cgroup.c        | 278 ++++++++++++++++++++++++++++++-------------------
 2 files changed, 186 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c833d6f23672..2357733a0a80 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,6 +141,17 @@ enum {
 	CGRP_WAIT_ON_RMDIR,
 };
 
+struct cgroup_pidlist {
+	/* protects the other fields */
+	struct rw_semaphore mutex;
+	/* array of xids */
+	pid_t *list;
+	/* how many elements the above list has */
+	int length;
+	/* how many files are using the current array */
+	int use_count;
+};
+
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
@@ -179,14 +190,9 @@ struct cgroup {
 	 */
 	struct list_head release_list;
 
-	/* pids_mutex protects the fields below */
-	struct rw_semaphore pids_mutex;
-	/* Array of process ids in the cgroup */
-	pid_t *tasks_pids;
-	/* How many files are using the current tasks_pids array */
-	int pids_use_count;
-	/* Length of the current tasks_pids array */
-	int pids_length;
+	/* we will have two separate pidlists, one for pids (the tasks file)
+	 * and one for tgids (the procs file). */
+	struct cgroup_pidlist tasks, procs;
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 22db0a7cf1fa..a9433f50e53d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1121,7 +1121,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
-	init_rwsem(&cgrp->pids_mutex);
+	init_rwsem(&(cgrp->tasks.mutex));
+	init_rwsem(&(cgrp->procs.mutex));
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1637,15 +1638,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 	return ret;
 }
 
-/* The various types of files and directories in a cgroup file system */
-enum cgroup_filetype {
-	FILE_ROOT,
-	FILE_DIR,
-	FILE_TASKLIST,
-	FILE_NOTIFY_ON_RELEASE,
-	FILE_RELEASE_AGENT,
-};
-
 /**
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
@@ -2343,7 +2335,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 }
 
 /*
- * Stuff for reading the 'tasks' file.
+ * Stuff for reading the 'tasks'/'procs' files.
  *
  * Reading this file can return large amounts of data if a cgroup has
  * *lots* of attached tasks. So it may need several calls to read(),
@@ -2353,27 +2345,106 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  */
 
 /*
- * Load into 'pidarray' up to 'npids' of the tasks using cgroup
- * 'cgrp'.  Return actual number of pids loaded.  No need to
- * task_lock(p) when reading out p->cgroup, since we're in an RCU
- * read section, so the css_set can't go away, and is
- * immutable after creation.
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * If the new stripped list is sufficiently smaller and there's enough memory
+ * to allocate a new buffer, will let go of the unneeded memory. Returns the
+ * number of unique elements.
  */
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+/* is the size difference enough that we should re-allocate the array? */
+#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
+static int pidlist_uniq(pid_t **p, int length)
 {
-	int n = 0, pid;
+	int src, dest = 1;
+	pid_t *list = *p;
+	pid_t *newlist;
+
+	/*
+	 * we presume the 0th element is unique, so i starts at 1. trivial
+	 * edge cases first; no work needs to be done for either
+	 */
+	if (length == 0 || length == 1)
+		return length;
+	/* src and dest walk down the list; dest counts unique elements */
+	for (src = 1; src < length; src++) {
+		/* find next unique element */
+		while (list[src] == list[src-1]) {
+			src++;
+			if (src == length)
+				goto after;
+		}
+		/* dest always points to where the next unique element goes */
+		list[dest] = list[src];
+		dest++;
+	}
+after:
+	/*
+	 * if the length difference is large enough, we want to allocate a
+	 * smaller buffer to save memory. if this fails due to out of memory,
+	 * we'll just stay with what we've got.
+	 */
+	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
+		newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
+		if (newlist)
+			*p = newlist;
+	}
+	return dest;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+	return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+{
+	pid_t *array;
+	int length;
+	int pid, n = 0; /* used for populating the array */
 	struct cgroup_iter it;
 	struct task_struct *tsk;
+	struct cgroup_pidlist *l;
+
+	/*
+	 * If cgroup gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cgroup users didn't
+	 * show up until sometime later on.
+	 */
+	length = cgroup_task_count(cgrp);
+	array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+	if (!array)
+		return -ENOMEM;
+	/* now, populate the array */
 	cgroup_iter_start(cgrp, &it);
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
-		if (unlikely(n == npids))
+		if (unlikely(n == length))
 			break;
-		pid = task_pid_vnr(tsk);
-		if (pid > 0)
-			pidarray[n++] = pid;
+		/* get tgid or pid for procs or tasks file respectively */
+		pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+		if (pid > 0) /* make sure to only use valid results */
+			array[n++] = pid;
 	}
 	cgroup_iter_end(cgrp, &it);
-	return n;
+	length = n;
+	/* now sort & (if procs) strip out duplicates */
+	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (procs) {
+		length = pidlist_uniq(&array, length);
+		l = &(cgrp->procs);
+	} else {
+		l = &(cgrp->tasks);
+	}
+	/* store array in cgroup, freeing old if necessary */
+	down_write(&l->mutex);
+	kfree(l->list);
+	l->list = array;
+	l->length = length;
+	l->use_count++;
+	up_write(&l->mutex);
+	return 0;
 }
 
 /**
@@ -2430,19 +2501,14 @@ err:
 	return ret;
 }
 
-static int cmppid(const void *a, const void *b)
-{
-	return *(pid_t *)a - *(pid_t *)b;
-}
-
 
 /*
- * seq_file methods for the "tasks" file. The seq_file position is the
+ * seq_file methods for the tasks/procs files. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->tasks_pids array.
+ * in the cgroup->l->list array.
  */
 
-static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 {
 	/*
 	 * Initially we receive a position value that corresponds to
@@ -2450,46 +2516,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	 * after a seek to the start). Use a binary-search to find the
 	 * next pid to display, if any
 	 */
-	struct cgroup *cgrp = s->private;
+	struct cgroup_pidlist *l = s->private;
 	int index = 0, pid = *pos;
 	int *iter;
 
-	down_read(&cgrp->pids_mutex);
+	down_read(&l->mutex);
 	if (pid) {
-		int end = cgrp->pids_length;
+		int end = l->length;
 
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (cgrp->tasks_pids[mid] == pid) {
+			if (l->list[mid] == pid) {
 				index = mid;
 				break;
-			} else if (cgrp->tasks_pids[mid] <= pid)
+			} else if (l->list[mid] <= pid)
 				index = mid + 1;
 			else
 				end = mid;
 		}
 	}
 	/* If we're off the end of the array, we're done */
-	if (index >= cgrp->pids_length)
+	if (index >= l->length)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
-	iter = cgrp->tasks_pids + index;
+	iter = l->list + index;
 	*pos = *iter;
 	return iter;
 }
 
-static void cgroup_tasks_stop(struct seq_file *s, void *v)
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
-	struct cgroup *cgrp = s->private;
-	up_read(&cgrp->pids_mutex);
+	struct cgroup_pidlist *l = s->private;
+	up_read(&l->mutex);
 }
 
-static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup *cgrp = s->private;
-	int *p = v;
-	int *end = cgrp->tasks_pids + cgrp->pids_length;
-
+	struct cgroup_pidlist *l = s->private;
+	pid_t *p = v;
+	pid_t *end = l->list + l->length;
 	/*
 	 * Advance to the next pid in the array. If this goes off the
 	 * end, we're done
@@ -2503,98 +2568,94 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
 	}
 }
 
-static int cgroup_tasks_show(struct seq_file *s, void *v)
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
 {
 	return seq_printf(s, "%d\n", *(int *)v);
 }
 
-static const struct seq_operations cgroup_tasks_seq_operations = {
-	.start = cgroup_tasks_start,
-	.stop = cgroup_tasks_stop,
-	.next = cgroup_tasks_next,
-	.show = cgroup_tasks_show,
+/*
+ * seq_operations functions for iterating on pidlists through seq_file -
+ * independent of whether it's tasks or procs
+ */
+static const struct seq_operations cgroup_pidlist_seq_operations = {
+	.start = cgroup_pidlist_start,
+	.stop = cgroup_pidlist_stop,
+	.next = cgroup_pidlist_next,
+	.show = cgroup_pidlist_show,
 };
 
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
-	down_write(&cgrp->pids_mutex);
-	BUG_ON(!cgrp->pids_use_count);
-	if (!--cgrp->pids_use_count) {
-		kfree(cgrp->tasks_pids);
-		cgrp->tasks_pids = NULL;
-		cgrp->pids_length = 0;
+	down_write(&l->mutex);
+	BUG_ON(!l->use_count);
+	if (!--l->use_count) {
+		kfree(l->list);
+		l->list = NULL;
+		l->length = 0;
 	}
-	up_write(&cgrp->pids_mutex);
+	up_write(&l->mutex);
 }
 
-static int cgroup_tasks_release(struct inode *inode, struct file *file)
+static int cgroup_pidlist_release(struct inode *inode, struct file *file)
 {
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
+	struct cgroup_pidlist *l;
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
-
-	release_cgroup_pid_array(cgrp);
+	/*
+	 * the seq_file will only be initialized if the file was opened for
+	 * reading; hence we check if it's not null only in that case.
+	 */
+	l = ((struct seq_file *)file->private_data)->private;
+	cgroup_release_pid_array(l);
 	return seq_release(inode, file);
 }
 
-static struct file_operations cgroup_tasks_operations = {
+static const struct file_operations cgroup_pidlist_operations = {
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.write = cgroup_file_write,
-	.release = cgroup_tasks_release,
+	.release = cgroup_pidlist_release,
 };
 
 /*
- * Handle an open on 'tasks' file.  Prepare an array containing the
- * process id's of tasks currently attached to the cgroup being opened.
+ * The following functions handle opens on a file that displays a pidlist
+ * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
+ * in the cgroup.
  */
-
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
+/* helper function for the two below it */
+static int cgroup_pidlist_open(struct file *file, bool procs)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	pid_t *pidarray;
-	int npids;
+	struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
 	int retval;
 
 	/* Nothing to do for write-only files */
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	/*
-	 * If cgroup gets more users after we read count, we won't have
-	 * enough space - tough.  This race is indistinguishable to the
-	 * caller from the case that the additional cgroup users didn't
-	 * show up until sometime later on.
-	 */
-	npids = cgroup_task_count(cgrp);
-	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-	if (!pidarray)
-		return -ENOMEM;
-	npids = pid_array_load(pidarray, npids, cgrp);
-	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-	/*
-	 * Store the array in the cgroup, freeing the old
-	 * array if necessary
-	 */
-	down_write(&cgrp->pids_mutex);
-	kfree(cgrp->tasks_pids);
-	cgrp->tasks_pids = pidarray;
-	cgrp->pids_length = npids;
-	cgrp->pids_use_count++;
-	up_write(&cgrp->pids_mutex);
-
-	file->f_op = &cgroup_tasks_operations;
+	/* have the array populated */
+	retval = pidlist_array_load(cgrp, procs);
+	if (retval)
+		return retval;
+	/* configure file information */
+	file->f_op = &cgroup_pidlist_operations;
 
-	retval = seq_open(file, &cgroup_tasks_seq_operations);
+	retval = seq_open(file, &cgroup_pidlist_seq_operations);
 	if (retval) {
-		release_cgroup_pid_array(cgrp);
+		cgroup_release_pid_array(l);
 		return retval;
 	}
-	((struct seq_file *)file->private_data)->private = cgrp;
+	((struct seq_file *)file->private_data)->private = l;
 	return 0;
 }
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+	return cgroup_pidlist_open(file, false);
+}
+static int cgroup_procs_open(struct inode *unused, struct file *file)
+{
+	return cgroup_pidlist_open(file, true);
+}
 
 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
 					    struct cftype *cft)
@@ -2617,21 +2678,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 /*
  * for the common functions, 'private' gives the type of file
  */
+/* for hysterical raisins, we can't put this on the older files */
+#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
 static struct cftype files[] = {
 	{
 		.name = "tasks",
 		.open = cgroup_tasks_open,
 		.write_u64 = cgroup_tasks_write,
-		.release = cgroup_tasks_release,
-		.private = FILE_TASKLIST,
+		.release = cgroup_pidlist_release,
 		.mode = S_IRUGO | S_IWUSR,
 	},
-
+	{
+		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
+		.open = cgroup_procs_open,
+		/* .write_u64 = cgroup_procs_write, TODO */
+		.release = cgroup_pidlist_release,
+		.mode = S_IRUGO,
+	},
 	{
 		.name = "notify_on_release",
 		.read_u64 = cgroup_read_notify_on_release,
 		.write_u64 = cgroup_write_notify_on_release,
-		.private = FILE_NOTIFY_ON_RELEASE,
 	},
 };
 
@@ -2640,7 +2707,6 @@ static struct cftype cft_release_agent = {
 	.read_seq_string = cgroup_release_agent_show,
 	.write_string = cgroup_release_agent_write,
 	.max_write_len = PATH_MAX,
-	.private = FILE_RELEASE_AGENT,
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
-- 
cgit v1.2.3-71-gd317


From 72a8cb30d10d4041c455a7054607a7d519167c87 Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@google.com>
Date: Wed, 23 Sep 2009 15:56:27 -0700
Subject: cgroups: ensure correct concurrent opening/reading of pidlists across
 pid namespaces

Previously there was the problem in which two processes from different pid
namespaces reading the tasks or procs file could result in one process
seeing results from the other's namespace.  Rather than one pidlist for
each file in a cgroup, we now keep a list of pidlists keyed by namespace
and file type (tasks versus procs) in which entries are placed on demand.
Each pidlist has its own lock, and that the pidlists themselves are passed
around in the seq_file's private pointer means we don't have to touch the
cgroup or its master list except when creating and destroying entries.

Signed-off-by: Ben Blum <bblum@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  34 +++++++++++++---
 kernel/cgroup.c        | 107 +++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 119 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2357733a0a80..88e863460726 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,15 +141,36 @@ enum {
 	CGRP_WAIT_ON_RMDIR,
 };
 
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+	CGROUP_FILE_PROCS,
+	CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
 struct cgroup_pidlist {
-	/* protects the other fields */
-	struct rw_semaphore mutex;
+	/*
+	 * used to find which pidlist is wanted. doesn't change as long as
+	 * this particular list stays in the list.
+	 */
+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
 	/* array of xids */
 	pid_t *list;
 	/* how many elements the above list has */
 	int length;
 	/* how many files are using the current array */
 	int use_count;
+	/* each of these stored in a list by its cgroup */
+	struct list_head links;
+	/* pointer to the cgroup we belong to, for list removal purposes */
+	struct cgroup *owner;
+	/* protects the other fields */
+	struct rw_semaphore mutex;
 };
 
 struct cgroup {
@@ -190,9 +211,12 @@ struct cgroup {
 	 */
 	struct list_head release_list;
 
-	/* we will have two separate pidlists, one for pids (the tasks file)
-	 * and one for tgids (the procs file). */
-	struct cgroup_pidlist tasks, procs;
+	/*
+	 * list of pidlists, up to two for each namespace (one for procs, one
+	 * for tasks); created on demand.
+	 */
+	struct list_head pidlists;
+	struct mutex pidlist_mutex;
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a9433f50e53d..97194ba12014 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -776,6 +776,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		deactivate_super(cgrp->root->sb);
 
+		/*
+		 * if we're getting rid of the cgroup, refcount should ensure
+		 * that there are no pidlists left.
+		 */
+		BUG_ON(!list_empty(&cgrp->pidlists));
+
 		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
 	}
 	iput(inode);
@@ -1121,8 +1127,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
-	init_rwsem(&(cgrp->tasks.mutex));
-	init_rwsem(&(cgrp->procs.mutex));
+	INIT_LIST_HEAD(&cgrp->pidlists);
+	mutex_init(&cgrp->pidlist_mutex);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2395,10 +2401,60 @@ static int cmppid(const void *a, const void *b)
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+	/*
+	 * We can't drop the pidlist_mutex before taking the l->mutex in case
+	 * the last ref-holder is trying to remove l from the list at the same
+	 * time. Holding the pidlist_mutex precludes somebody taking whichever
+	 * list we find out from under us - compare release_pid_array().
+	 */
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry(l, &cgrp->pidlists, links) {
+		if (l->key.type == type && l->key.ns == ns) {
+			/* found a matching list - drop the extra refcount */
+			put_pid_ns(ns);
+			/* make sure l doesn't vanish out from under us */
+			down_write(&l->mutex);
+			mutex_unlock(&cgrp->pidlist_mutex);
+			l->use_count++;
+			return l;
+		}
+	}
+	/* entry not found; create a new one */
+	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
+		put_pid_ns(ns);
+		return l;
+	}
+	init_rwsem(&l->mutex);
+	down_write(&l->mutex);
+	l->key.type = type;
+	l->key.ns = ns;
+	l->use_count = 0; /* don't increment here */
+	l->list = NULL;
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	mutex_unlock(&cgrp->pidlist_mutex);
+	return l;
+}
+
 /*
  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
  */
-static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
 {
 	pid_t *array;
 	int length;
@@ -2423,7 +2479,10 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
 		if (unlikely(n == length))
 			break;
 		/* get tgid or pid for procs or tasks file respectively */
-		pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
 		if (pid > 0) /* make sure to only use valid results */
 			array[n++] = pid;
 	}
@@ -2431,19 +2490,20 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
 	length = n;
 	/* now sort & (if procs) strip out duplicates */
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (procs) {
+	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(&array, length);
-		l = &(cgrp->procs);
-	} else {
-		l = &(cgrp->tasks);
+	l = cgroup_pidlist_find(cgrp, type);
+	if (!l) {
+		kfree(array);
+		return -ENOMEM;
 	}
-	/* store array in cgroup, freeing old if necessary */
-	down_write(&l->mutex);
+	/* store array, freeing old if necessary - lock already held */
 	kfree(l->list);
 	l->list = array;
 	l->length = length;
 	l->use_count++;
 	up_write(&l->mutex);
+	*lp = l;
 	return 0;
 }
 
@@ -2586,13 +2646,26 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 
 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
+	/*
+	 * the case where we're the last user of this particular pidlist will
+	 * have us remove it from the cgroup's list, which entails taking the
+	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+	 * pidlist_mutex, we have to take pidlist_mutex first.
+	 */
+	mutex_lock(&l->owner->pidlist_mutex);
 	down_write(&l->mutex);
 	BUG_ON(!l->use_count);
 	if (!--l->use_count) {
+		/* we're the last user if refcount is 0; remove and free */
+		list_del(&l->links);
+		mutex_unlock(&l->owner->pidlist_mutex);
 		kfree(l->list);
-		l->list = NULL;
-		l->length = 0;
+		put_pid_ns(l->key.ns);
+		up_write(&l->mutex);
+		kfree(l);
+		return;
 	}
+	mutex_unlock(&l->owner->pidlist_mutex);
 	up_write(&l->mutex);
 }
 
@@ -2623,10 +2696,10 @@ static const struct file_operations cgroup_pidlist_operations = {
  * in the cgroup.
  */
 /* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, bool procs)
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
+	struct cgroup_pidlist *l;
 	int retval;
 
 	/* Nothing to do for write-only files */
@@ -2634,7 +2707,7 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
 		return 0;
 
 	/* have the array populated */
-	retval = pidlist_array_load(cgrp, procs);
+	retval = pidlist_array_load(cgrp, type, &l);
 	if (retval)
 		return retval;
 	/* configure file information */
@@ -2650,11 +2723,11 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
 }
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
-	return cgroup_pidlist_open(file, false);
+	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
 }
 static int cgroup_procs_open(struct inode *unused, struct file *file)
 {
-	return cgroup_pidlist_open(file, true);
+	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
 }
 
 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
-- 
cgit v1.2.3-71-gd317


From c378369d8b4fa516ff2b1e79c3eded4e0e955ebb Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@google.com>
Date: Wed, 23 Sep 2009 15:56:29 -0700
Subject: cgroups: change css_set freeing mechanism to be under RCU

Changes css_set freeing mechanism to be under RCU

This is a prepatch for making the procs file writable. In order to free the
old css_sets for each task to be moved as they're being moved, the freeing
mechanism must be RCU-protected, or else we would have to have a call to
synchronize_rcu() for each task before freeing its old css_set.

Signed-off-by: Ben Blum <bblum@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 3 +++
 kernel/cgroup.c        | 8 +++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 88e863460726..3ac78a2f4b5a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -260,6 +260,9 @@ struct css_set {
 	 * during subsystem registration (at boot time).
 	 */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/* For RCU-protected deletion */
+	struct rcu_head rcu_head;
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3e356b05b2d5..bf8dd1a9f2d1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -267,6 +267,12 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 	return &css_set_table[index];
 }
 
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+	kfree(cg);
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -310,7 +316,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 
 	write_unlock(&css_set_lock);
-	kfree(cg);
+	call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From be367d09927023d081f9199665c8500f69f14d22 Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@google.com>
Date: Wed, 23 Sep 2009 15:56:31 -0700
Subject: cgroups: let ss->can_attach and ss->attach do whole threadgroups at a
 time

Alter the ss->can_attach and ss->attach functions to be able to deal with
a whole threadgroup at a time, for use in cgroup_attach_proc.  (This is a
pre-patch to cgroup-procs-writable.patch.)

Currently, new mode of the attach function can only tell the subsystem
about the old cgroup of the threadgroup leader.  No subsystem currently
needs that information for each thread that's being moved, but if one were
to be added (for example, one that counts tasks within a group) this bit
would need to be reworked a bit to tell the subsystem the right
information.

[hidave.darkstar@gmail.com: fix build]
Signed-off-by: Ben Blum <bblum@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cgroups/cgroups.txt | 12 +++++--
 include/linux/cgroup.h            |  7 +++--
 kernel/cgroup.c                   |  4 +--
 kernel/cgroup_freezer.c           | 15 ++++++++-
 kernel/cpuset.c                   | 66 ++++++++++++++++++++++++++++++---------
 kernel/ns_cgroup.c                | 16 ++++++++--
 kernel/sched.c                    | 35 +++++++++++++++++++--
 mm/memcontrol.c                   |  3 +-
 security/device_cgroup.c          |  3 +-
 9 files changed, 131 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 4bccfc19196b..455d4e6d346d 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -521,7 +521,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.
 
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	       struct task_struct *task)
+	       struct task_struct *task, bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called prior to moving a task into a cgroup; if the subsystem
@@ -529,14 +529,20 @@ returns an error, this will abort the attach operation.  If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex.
+remain valid while the caller holds cgroup_mutex. If threadgroup is
+true, then a successful result indicates that all threads in the given
+thread's threadgroup can be moved together.
 
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	    struct cgroup *old_cgrp, struct task_struct *task)
+	    struct cgroup *old_cgrp, struct task_struct *task,
+	    bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
+If threadgroup is true, the subsystem should take care of all threads
+in the specified thread's threadgroup. Currently does not support any
+subsystem that might need the old_cgrp for every thread in the group.
 
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
 
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3ac78a2f4b5a..b62bb9294d0c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -425,10 +425,11 @@ struct cgroup_subsys {
 						  struct cgroup *cgrp);
 	int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
-	int (*can_attach)(struct cgroup_subsys *ss,
-			  struct cgroup *cgrp, struct task_struct *tsk);
+	int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			  struct task_struct *tsk, bool threadgroup);
 	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			struct cgroup *old_cgrp, struct task_struct *tsk);
+			struct cgroup *old_cgrp, struct task_struct *tsk,
+			bool threadgroup);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
 	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bf8dd1a9f2d1..7ccba4bc5e3b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1552,7 +1552,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk);
+			retval = ss->can_attach(ss, cgrp, tsk, false);
 			if (retval)
 				return retval;
 		}
@@ -1590,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk);
+			ss->attach(ss, cgrp, oldcgrp, tsk, false);
 	}
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
 			      struct cgroup *new_cgroup,
-			      struct task_struct *task)
+			      struct task_struct *task, bool threadgroup)
 {
 	struct freezer *freezer;
 
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 	if (freezer->state == CGROUP_FROZEN)
 		return -EBUSY;
 
+	if (threadgroup) {
+		struct task_struct *c;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			if (is_task_frozen_enough(c)) {
+				rcu_read_unlock();
+				return -EBUSY;
+			}
+		}
+		rcu_read_unlock();
+	}
+
 	return 0;
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..b5cb469d2545 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 static cpumask_var_t cpus_attach;
 
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss,
-			     struct cgroup *cont, struct task_struct *tsk)
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+			     struct task_struct *tsk, bool threadgroup)
 {
+	int ret;
 	struct cpuset *cs = cgroup_cs(cont);
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 	if (tsk->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
-	return security_task_setscheduler(tsk, 0, NULL);
+	ret = security_task_setscheduler(tsk, 0, NULL);
+	if (ret)
+		return ret;
+	if (threadgroup) {
+		struct task_struct *c;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			ret = security_task_setscheduler(c, 0, NULL);
+			if (ret) {
+				rcu_read_unlock();
+				return ret;
+			}
+		}
+		rcu_read_unlock();
+	}
+	return 0;
+}
+
+static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
+			       struct cpuset *cs)
+{
+	int err;
+	/*
+	 * can_attach beforehand should guarantee that this doesn't fail.
+	 * TODO: have a better way to handle failure here
+	 */
+	err = set_cpus_allowed_ptr(tsk, cpus_attach);
+	WARN_ON_ONCE(err);
+
+	task_lock(tsk);
+	cpuset_change_task_nodemask(tsk, to);
+	task_unlock(tsk);
+	cpuset_update_task_spread_flag(cs, tsk);
+
 }
 
-static void cpuset_attach(struct cgroup_subsys *ss,
-			  struct cgroup *cont, struct cgroup *oldcont,
-			  struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+			  struct cgroup *oldcont, struct task_struct *tsk,
+			  bool threadgroup)
 {
 	nodemask_t from, to;
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	int err;
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 		guarantee_online_cpus(cs, cpus_attach);
 		guarantee_online_mems(cs, &to);
 	}
-	err = set_cpus_allowed_ptr(tsk, cpus_attach);
-	if (err)
-		return;
 
-	task_lock(tsk);
-	cpuset_change_task_nodemask(tsk, &to);
-	task_unlock(tsk);
-	cpuset_update_task_spread_flag(cs, tsk);
+	/* do per-task migration stuff possibly for each in the threadgroup */
+	cpuset_attach_task(tsk, &to, cs);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			cpuset_attach_task(c, &to, cs);
+		}
+		rcu_read_unlock();
+	}
 
+	/* change mm; only needs to be done once even if threadgroup */
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
  *       (hence either you are in the same cgroup as task, or in an
  *        ancestor cgroup thereof)
  */
-static int ns_can_attach(struct cgroup_subsys *ss,
-		struct cgroup *new_cgroup, struct task_struct *task)
+static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
+			 struct task_struct *task, bool threadgroup)
 {
 	if (current != task) {
 		if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
 	if (!cgroup_is_descendant(new_cgroup, task))
 		return -EPERM;
 
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			if (!cgroup_is_descendant(new_cgroup, c)) {
+				rcu_read_unlock();
+				return -EPERM;
+			}
+		}
+		rcu_read_unlock();
+	}
+
 	return 0;
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 2f76e06bea58..0d0361b9dbb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10377,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 }
 
 static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-		      struct task_struct *tsk)
+cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10388,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	if (tsk->sched_class != &fair_sched_class)
 		return -EINVAL;
 #endif
+	return 0;
+}
 
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		      struct task_struct *tsk, bool threadgroup)
+{
+	int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
+	if (retval)
+		return retval;
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			retval = cpu_cgroup_can_attach_task(cgrp, c);
+			if (retval) {
+				rcu_read_unlock();
+				return retval;
+			}
+		}
+		rcu_read_unlock();
+	}
 	return 0;
 }
 
 static void
 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			struct cgroup *old_cont, struct task_struct *tsk)
+		  struct cgroup *old_cont, struct task_struct *tsk,
+		  bool threadgroup)
 {
 	sched_move_task(tsk);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			sched_move_task(c);
+		}
+		rcu_read_unlock();
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b10d8753784..cf2e717f5c12 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2612,7 +2612,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *cont,
 				struct cgroup *old_cont,
-				struct task_struct *p)
+				struct task_struct *p,
+				bool threadgroup)
 {
 	mutex_lock(&memcg_tasklist);
 	/*
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index b8186bac8b7e..6cf8fd2b79e8 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,7 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
 struct cgroup_subsys devices_subsys;
 
 static int devcgroup_can_attach(struct cgroup_subsys *ss,
-		struct cgroup *new_cgroup, struct task_struct *task)
+		struct cgroup *new_cgroup, struct task_struct *task,
+		bool threadgroup)
 {
 	if (current != task && !capable(CAP_SYS_ADMIN))
 			return -EPERM;
-- 
cgit v1.2.3-71-gd317


From 4b3bde4c983de36c59e6c1a24701f6fe816f9f55 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 15:56:32 -0700
Subject: memcg: remove the overhead associated with the root cgroup

Change the memory cgroup to remove the overhead associated with accounting
all pages in the root cgroup.  As a side-effect, we can no longer set a
memory hard limit in the root cgroup.

A new flag to track whether the page has been accounted or not has been
added as well.  Flags are now set atomically for page_cgroup,
pcg_default_flags is now obsolete and removed.

[akpm@linux-foundation.org: fix a few documentation glitches]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cgroups/memory.txt |  4 +++
 include/linux/page_cgroup.h      | 13 ++++++++++
 mm/memcontrol.c                  | 54 +++++++++++++++++++++++++++++-----------
 3 files changed, 57 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 23d1262c0775..ab0a02172cf4 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
 pages that are selected for reclaiming come from the per cgroup LRU
 list.
 
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
+
 2. Locking
 
 The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
 NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
 
 # cat /cgroups/0/memory.limit_in_bytes
 4194304
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index ada779f24178..4b938d4f3ac2 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -38,6 +38,7 @@ enum {
 	PCG_LOCK,  /* page cgroup is locked */
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
+	PCG_ACCT_LRU, /* page has been accounted for */
 };
 
 #define TESTPCGFLAG(uname, lname)			\
@@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
 static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ clear_bit(PCG_##lname, &pc->flags);  }
 
+#define TESTCLEARPCGFLAG(uname, lname)			\
+static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)
+CLEARPCGFLAG(Cache, CACHE)
+SETPCGFLAG(Cache, CACHE)
 
 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
+SETPCGFLAG(Used, USED)
+
+SETPCGFLAG(AcctLRU, ACCT_LRU)
+CLEARPCGFLAG(AcctLRU, ACCT_LRU)
+TESTPCGFLAG(AcctLRU, ACCT_LRU)
+TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf2e717f5c12..b0757660663f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -43,6 +43,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
+struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -200,13 +201,8 @@ enum charge_type {
 #define PCGF_CACHE	(1UL << PCG_CACHE)
 #define PCGF_USED	(1UL << PCG_USED)
 #define PCGF_LOCK	(1UL << PCG_LOCK)
-static const unsigned long
-pcg_default_flags[NR_CHARGE_TYPE] = {
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
-	PCGF_USED | PCGF_LOCK, /* Anon */
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
-	0, /* FORCE */
-};
+/* Not used, but added here for completeness */
+#define PCGF_ACCT	(1UL << PCG_ACCT)
 
 /* for encoding cft->private value on file */
 #define _MEM			(0)
@@ -354,6 +350,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 	return ret;
 }
 
+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+	return (mem == root_mem_cgroup);
+}
+
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +372,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
 	/* can happen while we handle swapcache. */
-	if (list_empty(&pc->lru) || !pc->mem_cgroup)
+	if (!TestClearPageCgroupAcctLRU(pc))
 		return;
+	VM_BUG_ON(!pc->mem_cgroup);
 	/*
 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	mem = pc->mem_cgroup;
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
+	VM_BUG_ON(list_empty(&pc->lru));
 	list_del_init(&pc->lru);
 	return;
 }
@@ -410,8 +413,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
 	 */
 	smp_rmb();
-	/* unused page is not rotated. */
-	if (!PageCgroupUsed(pc))
+	/* unused or root page is not rotated. */
+	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +428,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
+	VM_BUG_ON(PageCgroupAcctLRU(pc));
 	/*
 	 * Used bit is set without atomic ops but after smp_wmb().
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +439,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 
 	mz = page_cgroup_zoneinfo(pc);
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	SetPageCgroupAcctLRU(pc);
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
 	list_add(&pc->lru, &mz->lists[lru]);
 }
 
@@ -469,7 +476,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
 
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	/* link when the page is linked to LRU but page_cgroup isn't */
-	if (PageLRU(page) && list_empty(&pc->lru))
+	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
 		mem_cgroup_add_lru_list(page, page_lru(page));
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
@@ -1125,9 +1132,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		css_put(&mem->css);
 		return;
 	}
+
 	pc->mem_cgroup = mem;
 	smp_wmb();
-	pc->flags = pcg_default_flags[ctype];
+	switch (ctype) {
+	case MEM_CGROUP_CHARGE_TYPE_CACHE:
+	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
+		SetPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+		ClearPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	default:
+		break;
+	}
 
 	mem_cgroup_charge_statistics(mem, pc, true);
 
@@ -2083,6 +2103,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 	name = MEMFILE_ATTR(cft->private);
 	switch (name) {
 	case RES_LIMIT:
+		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
@@ -2549,6 +2573,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
+		root_mem_cgroup = mem;
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +2602,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
+	root_mem_cgroup = NULL;
 	return ERR_PTR(error);
 }
 
-- 
cgit v1.2.3-71-gd317


From 296c81d89f4f14269f7346f81442910158c0a83a Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 15:56:36 -0700
Subject: memory controller: soft limit interface

Add an interface to allow get/set of soft limits.  Soft limits for memory
plus swap controller (memsw) is currently not supported.  Resource
counters have been enhanced to support soft limits and new type
RES_SOFT_LIMIT has been added.  Unlike hard limits, soft limits can be
directly set and do not need any reclaim or checks before setting them to
a newer value.

Kamezawa-San raised a question as to whether soft limit should belong to
res_counter.  Since all resources understand the basic concepts of hard
and soft limits, it is justified to add soft limits here.  Soft limits are
a generic resource usage feature, even file system quotas support soft
limits.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/res_counter.h | 58 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/res_counter.c        |  3 +++
 mm/memcontrol.c             | 20 ++++++++++++++++
 3 files changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 511f42fc6816..fcb9884df618 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -34,6 +34,10 @@ struct res_counter {
 	 * the limit that usage cannot exceed
 	 */
 	unsigned long long limit;
+	/*
+	 * the limit that usage can be exceed
+	 */
+	unsigned long long soft_limit;
 	/*
 	 * the number of unsuccessful attempts to consume the resource
 	 */
@@ -87,6 +91,7 @@ enum {
 	RES_MAX_USAGE,
 	RES_LIMIT,
 	RES_FAILCNT,
+	RES_SOFT_LIMIT,
 };
 
 /*
@@ -132,6 +137,36 @@ static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 	return false;
 }
 
+static inline bool res_counter_soft_limit_check_locked(struct res_counter *cnt)
+{
+	if (cnt->usage < cnt->soft_limit)
+		return true;
+
+	return false;
+}
+
+/**
+ * Get the difference between the usage and the soft limit
+ * @cnt: The counter
+ *
+ * Returns 0 if usage is less than or equal to soft limit
+ * The difference between usage and soft limit, otherwise.
+ */
+static inline unsigned long long
+res_counter_soft_limit_excess(struct res_counter *cnt)
+{
+	unsigned long long excess;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	if (cnt->usage <= cnt->soft_limit)
+		excess = 0;
+	else
+		excess = cnt->usage - cnt->soft_limit;
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return excess;
+}
+
 /*
  * Helper function to detect if the cgroup is within it's limit or
  * not. It's currently called from cgroup_rss_prepare()
@@ -147,6 +182,17 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt)
 	return ret;
 }
 
+static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt)
+{
+	bool ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	ret = res_counter_soft_limit_check_locked(cnt);
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return ret;
+}
+
 static inline void res_counter_reset_max(struct res_counter *cnt)
 {
 	unsigned long flags;
@@ -180,4 +226,16 @@ static inline int res_counter_set_limit(struct res_counter *cnt,
 	return ret;
 }
 
+static inline int
+res_counter_set_soft_limit(struct res_counter *cnt,
+				unsigned long long soft_limit)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	cnt->soft_limit = soft_limit;
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return 0;
+}
+
 #endif
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
 	spin_lock_init(&counter->lock);
 	counter->limit = RESOURCE_MAX;
+	counter->soft_limit = RESOURCE_MAX;
 	counter->parent = parent;
 }
 
@@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member)
 		return &counter->limit;
 	case RES_FAILCNT:
 		return &counter->failcnt;
+	case RES_SOFT_LIMIT:
+		return &counter->soft_limit;
 	};
 
 	BUG();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eb9571815f0c..4ad3e6be045d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2123,6 +2123,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 		else
 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 		break;
+	case RES_SOFT_LIMIT:
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		/*
+		 * For memsw, soft limits are hard to implement in terms
+		 * of semantics, for now, we support soft limits for
+		 * control without swap
+		 */
+		if (type == _MEM)
+			ret = res_counter_set_soft_limit(&memcg->res, val);
+		else
+			ret = -EINVAL;
+		break;
 	default:
 		ret = -EINVAL; /* should be BUG() ? */
 		break;
@@ -2375,6 +2389,12 @@ static struct cftype mem_cgroup_files[] = {
 		.write_string = mem_cgroup_write,
 		.read_u64 = mem_cgroup_read,
 	},
+	{
+		.name = "soft_limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	},
 	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
-- 
cgit v1.2.3-71-gd317


From f64c3f54940d6929a2b6dcffaab942bd62be2e66 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 15:56:37 -0700
Subject: memory controller: soft limit organize cgroups

Organize cgroups over soft limit in a RB-Tree

Introduce an RB-Tree for storing memory cgroups that are over their soft
limit.  The overall goal is to

1. Add a memory cgroup to the RB-Tree when the soft limit is exceeded.
   We are careful about updates, updates take place only after a particular
   time interval has passed
2. We remove the node from the RB-Tree when the usage goes below the soft
   limit

The next set of patches will exploit the RB-Tree to get the group that is
over its soft limit by the largest amount and reclaim from it, when we
face memory contention.

[hugh.dickins@tiscali.co.uk: CONFIG_CGROUP_MEM_RES_CTLR=y CONFIG_PREEMPT=y fails to boot]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/res_counter.h |   6 +-
 kernel/res_counter.c        |  18 ++-
 mm/memcontrol.c             | 300 +++++++++++++++++++++++++++++++++++++-------
 3 files changed, 277 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index fcb9884df618..731af71cddc9 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -114,7 +114,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
 		unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-		unsigned long val, struct res_counter **limit_fail_at);
+		unsigned long val, struct res_counter **limit_fail_at,
+		struct res_counter **soft_limit_at);
 
 /*
  * uncharge - tell that some portion of the resource is released
@@ -127,7 +128,8 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */
 
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+				bool *was_soft_limit_excess);
 
 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..88faec23e833 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -37,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }
 
 int res_counter_charge(struct res_counter *counter, unsigned long val,
-			struct res_counter **limit_fail_at)
+			struct res_counter **limit_fail_at,
+			struct res_counter **soft_limit_fail_at)
 {
 	int ret;
 	unsigned long flags;
 	struct res_counter *c, *u;
 
 	*limit_fail_at = NULL;
+	if (soft_limit_fail_at)
+		*soft_limit_fail_at = NULL;
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
 		ret = res_counter_charge_locked(c, val);
+		/*
+		 * With soft limits, we return the highest ancestor
+		 * that exceeds its soft limit
+		 */
+		if (soft_limit_fail_at &&
+			!res_counter_soft_limit_check_locked(c))
+			*soft_limit_fail_at = c;
 		spin_unlock(&c->lock);
 		if (ret < 0) {
 			*limit_fail_at = c;
@@ -75,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 	counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+				bool *was_soft_limit_excess)
 {
 	unsigned long flags;
 	struct res_counter *c;
@@ -83,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
+		if (was_soft_limit_excess)
+			*was_soft_limit_excess =
+				!res_counter_soft_limit_check_locked(c);
 		res_counter_uncharge_locked(c, val);
 		spin_unlock(&c->lock);
 	}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4ad3e6be045d..0ed325943cd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -54,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif
 
 static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
  * Statistics for memory cgroup.
@@ -67,6 +69,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
+	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -79,6 +82,20 @@ struct mem_cgroup_stat {
 	struct mem_cgroup_stat_cpu cpustat[0];
 };
 
+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	return stat->count[idx];
+}
+
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
@@ -118,6 +135,10 @@ struct mem_cgroup_per_zone {
 	unsigned long		count[NR_LRU_LISTS];
 
 	struct zone_reclaim_stat reclaim_stat;
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -130,6 +151,26 @@ struct mem_cgroup_lru_info {
 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -215,6 +256,150 @@ static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	int nid = page_cgroup_nid(pc);
+	int zid = page_cgroup_zid(pc);
+
+	if (!mem)
+		return NULL;
+
+	return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+	spin_lock(&mctz->lock);
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+	spin_unlock(&mctz->lock);
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+	spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+	bool ret = false;
+	int cpu;
+	s64 val;
+	struct mem_cgroup_stat_cpu *cpustat;
+
+	cpu = get_cpu();
+	cpustat = &mem->stat.cpustat[cpu];
+	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+	if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+		__mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+		ret = true;
+	}
+	put_cpu();
+	return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+	unsigned long long prev_usage_in_excess, new_usage_in_excess;
+	bool updated_tree = false;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * We do updates in lazy mode, mem's are removed
+	 * lazily from the per-zone, per-node rb tree
+	 */
+	prev_usage_in_excess = mz->usage_in_excess;
+
+	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+	if (prev_usage_in_excess) {
+		mem_cgroup_remove_exceeded(mem, mz, mctz);
+		updated_tree = true;
+	}
+	if (!new_usage_in_excess)
+		goto done;
+	mem_cgroup_insert_exceeded(mem, mz, mctz);
+
+done:
+	if (updated_tree) {
+		spin_lock(&mctz->lock);
+		mz->usage_in_excess = new_usage_in_excess;
+		spin_unlock(&mctz->lock);
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(mem, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(mem, mz, mctz);
+		}
+	}
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -236,28 +421,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	else
 		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
 	put_cpu();
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
-	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
-	struct mem_cgroup *mem = pc->mem_cgroup;
-	int nid = page_cgroup_nid(pc);
-	int zid = page_cgroup_zid(pc);
-
-	if (!mem)
-		return NULL;
-
-	return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
@@ -972,11 +1139,11 @@ done:
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			gfp_t gfp_mask, struct mem_cgroup **memcg,
-			bool oom)
+			bool oom, struct page *page)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
+	struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	struct res_counter *fail_res, *soft_fail_res = NULL;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1006,16 +1173,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		int ret;
 		bool noswap = false;
 
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
+						&soft_fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
 			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res);
+							&fail_res, NULL);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
 			noswap = true;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1053,13 +1221,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			goto nomem;
 		}
 	}
+	/*
+	 * Insert just the ancestor, we should trickle down to the correct
+	 * cgroup for reclaim, since the other nodes will be below their
+	 * soft limit
+	 */
+	if (soft_fail_res) {
+		mem_over_soft_limit =
+			mem_cgroup_from_res_counter(soft_fail_res, res);
+		if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
+			mem_cgroup_update_tree(mem_over_soft_limit, page);
+	}
 	return 0;
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
 }
 
-
 /*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1126,9 +1304,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
 		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
 		css_put(&mem->css);
 		return;
 	}
@@ -1205,7 +1383,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	if (pc->mem_cgroup != from)
 		goto out;
 
-	res_counter_uncharge(&from->res, PAGE_SIZE);
+	res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
@@ -1225,7 +1403,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	}
 
 	if (do_swap_account)
-		res_counter_uncharge(&from->memsw, PAGE_SIZE);
+		res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
 	css_put(&from->css);
 
 	css_get(&to->css);
@@ -1265,7 +1443,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	parent = mem_cgroup_from_cont(pcg);
 
 
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
 		return ret;
 
@@ -1295,9 +1473,9 @@ uncharge:
 	/* drop extra refcnt by try_charge() */
 	css_put(&parent->css);
 	/* uncharge if move fails */
-	res_counter_uncharge(&parent->res, PAGE_SIZE);
+	res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
 	if (do_swap_account)
-		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+		res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
 	return ret;
 }
 
@@ -1322,7 +1500,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	prefetchw(pc);
 
 	mem = memcg;
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
 	if (ret || !mem)
 		return ret;
 
@@ -1441,14 +1619,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 	if (!mem)
 		goto charge_cur_mm;
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
 	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
 	if (unlikely(!mm))
 		mm = &init_mm;
-	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+	return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
 }
 
 static void
@@ -1486,7 +1664,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 			 * This recorded memcg can be obsolete one. So, avoid
 			 * calling css_tryget
 			 */
-			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
 			mem_cgroup_put(memcg);
 		}
 		rcu_read_unlock();
@@ -1511,9 +1689,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
 	if (do_swap_account)
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
 	css_put(&mem->css);
 }
 
@@ -1527,6 +1705,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
+	bool soft_limit_excess = false;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1565,9 +1744,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
 	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -1581,6 +1760,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
 
+	if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		css_put(&mem->css);
@@ -1656,7 +1837,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 		 * We uncharge this because swap is freed.
 		 * This memcg can be obsolete one. We avoid calling css_tryget
 		 */
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
 		mem_cgroup_put(memcg);
 	}
 	rcu_read_unlock();
@@ -1685,7 +1866,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 	unlock_page_cgroup(pc);
 
 	if (mem) {
-		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+						page);
 		css_put(&mem->css);
 	}
 	*ptr = mem;
@@ -2194,6 +2376,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 			res_counter_reset_failcnt(&mem->memsw);
 		break;
 	}
+
 	return 0;
 }
 
@@ -2489,6 +2672,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
+		mz->usage_in_excess = 0;
 	}
 	return 0;
 }
@@ -2534,6 +2718,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
 	int node;
 
+	mem_cgroup_remove_from_trees(mem);
 	free_css_id(&mem_cgroup_subsys, &mem->css);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -2582,6 +2767,31 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+static int mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		if (!rtpn)
+			return 1;
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+	return 0;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -2596,11 +2806,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
 			goto free_out;
+
 	/* root ? */
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
 		root_mem_cgroup = mem;
+		if (mem_cgroup_soft_limit_tree_init())
+			goto free_out;
+
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
-- 
cgit v1.2.3-71-gd317


From 4e41695356fb4e0b153be1440ad027e46e0a7ea2 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 15:56:39 -0700
Subject: memory controller: soft limit reclaim on contention

Implement reclaim from groups over their soft limit

Permit reclaim from memory cgroups on contention (via the direct reclaim
path).

memory cgroup soft limit reclaim finds the group that exceeds its soft
limit by the largest number of pages and reclaims pages from it and then
reinserts the cgroup into its correct place in the rbtree.

Add additional checks to mem_cgroup_hierarchical_reclaim() to detect long
loops in case all swap is turned off.  The code has been refactored and
the loop check (loop < 2) has been enhanced for soft limits.  For soft
limits, we try to do more targetted reclaim.  Instead of bailing out after
two loops, the routine now reclaims memory proportional to the size by
which the soft limit is exceeded.  The proportion has been empirically
determined.

[akpm@linux-foundation.org: build fix]
[kamezawa.hiroyu@jp.fujitsu.com: fix softlimit css refcnt handling]
[nishimura@mxp.nes.nec.co.jp: refcount of the "victim" should be decremented before exiting the loop]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  10 ++
 include/linux/swap.h       |   5 +
 mm/memcontrol.c            | 226 ++++++++++++++++++++++++++++++++++++++++++---
 mm/vmscan.c                |  45 ++++++++-
 4 files changed, 271 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e46a0734ab6e..bf9213b2db8f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -118,6 +118,9 @@ static inline bool mem_cgroup_disabled(void)
 
 extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid);
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -276,6 +279,13 @@ static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
 {
 }
 
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask, int nid, int zid)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6c990e658f4e..4c78fea989b9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -217,6 +217,11 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  gfp_t gfp_mask, bool noswap,
 						  unsigned int swappiness);
+extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						unsigned int swappiness,
+						struct zone *zone,
+						int nid);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90f0b13e1c3c..011aba6cad70 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -139,6 +139,8 @@ struct mem_cgroup_per_zone {
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
 	bool			on_tree;
+	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
+						/* use container_of	   */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -228,6 +230,13 @@ struct mem_cgroup {
 	struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -259,6 +268,8 @@ enum charge_type {
 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
+#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
@@ -299,7 +310,7 @@ soft_limit_tree_from_page(struct page *page)
 }
 
 static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
@@ -311,7 +322,6 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 		return;
 
 	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-	spin_lock(&mctz->lock);
 	while (*p) {
 		parent = *p;
 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -328,6 +338,26 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 	rb_link_node(&mz->tree_node, parent, p);
 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_insert_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -337,8 +367,7 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	spin_lock(&mctz->lock);
-	rb_erase(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = false;
+	__mem_cgroup_remove_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -408,6 +437,47 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
 	}
 }
 
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz = NULL;
+
+retry:
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+		!css_tryget(&mz->mem->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -1037,6 +1107,7 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, for avoiding to free too much, this returns immedieately.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+						struct zone *zone,
 						gfp_t gfp_mask,
 						unsigned long reclaim_options)
 {
@@ -1045,23 +1116,53 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	int loop = 0;
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+	unsigned long excess = mem_cgroup_get_excess(root_mem);
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
 	if (root_mem->memsw_is_minimum)
 		noswap = true;
 
-	while (loop < 2) {
+	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
-		if (victim == root_mem)
+		if (victim == root_mem) {
 			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!check_soft || !total) {
+					css_put(&victim->css);
+					break;
+				}
+				/*
+				 * We want to do more targetted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+					css_put(&victim->css);
+					break;
+				}
+			}
+		}
 		if (!mem_cgroup_local_usage(&victim->stat)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
 		}
 		/* we use swappiness of local cgroup */
-		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-						   get_swappiness(victim));
+		if (check_soft)
+			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+				noswap, get_swappiness(victim), zone,
+				zone->zone_pgdat->node_id);
+		else
+			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+						noswap, get_swappiness(victim));
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -1071,7 +1172,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		if (shrink)
 			return ret;
 		total += ret;
-		if (mem_cgroup_check_under_limit(root_mem))
+		if (check_soft) {
+			if (res_counter_check_under_soft_limit(&root_mem->res))
+				return total;
+		} else if (mem_cgroup_check_under_limit(root_mem))
 			return 1 + total;
 	}
 	return total;
@@ -1206,8 +1310,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-							flags);
+		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+						gfp_mask, flags);
 		if (ret)
 			continue;
 
@@ -2018,8 +2122,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-						   MEM_CGROUP_RECLAIM_SHRINK);
+		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+						GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
   		if (curusage >= oldusage)
@@ -2071,7 +2176,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
 						MEM_CGROUP_RECLAIM_NOSWAP |
 						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -2084,6 +2189,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(nid, zid);
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+						gfp_mask,
+						MEM_CGROUP_RECLAIM_SOFT);
+		nr_reclaimed += reclaimed;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz) {
+					css_put(&next_mz->mem->css);
+					next_mz = NULL;
+				} else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		mz->usage_in_excess =
+			res_counter_soft_limit_excess(&mz->mem->res);
+		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		if (mz->usage_in_excess)
+			__mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->mem->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->mem->css);
+	return nr_reclaimed;
+}
+
 /*
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2686,6 +2882,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
 		mz->usage_in_excess = 0;
+		mz->on_tree = false;
+		mz->mem = mem;
 	}
 	return 0;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 613e89f471d9..2423782214ab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1836,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						unsigned int swappiness,
+						struct zone *zone, int nid)
+{
+	struct scan_control sc = {
+		.may_writepage = !laptop_mode,
+		.may_unmap = 1,
+		.may_swap = !noswap,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = swappiness,
+		.order = 0,
+		.mem_cgroup = mem,
+		.isolate_pages = mem_cgroup_isolate_pages,
+	};
+	nodemask_t nm  = nodemask_of_node(nid);
+
+	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+	sc.nodemask = &nm;
+	sc.nr_reclaimed = 0;
+	sc.nr_scanned = 0;
+	/*
+	 * NOTE: Although we can get the priority field, using it
+	 * here is not a good idea, since it limits the pages we can scan.
+	 * if we don't reclaim here, the shrink_zone from balance_pgdat
+	 * will pick up pages from other mem cgroup's as well. We hack
+	 * the priority and make it zero.
+	 */
+	shrink_zone(0, zone, &sc);
+	return sc.nr_reclaimed;
+}
+
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
 					   bool noswap,
 					   unsigned int swappiness)
 {
+	struct zonelist *zonelist;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -1852,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
 		.nodemask = NULL, /* we don't care the placement */
 	};
-	struct zonelist *zonelist;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1974,6 +2007,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
+			int nid, zid;
 
 			if (!populated_zone(zone))
 				continue;
@@ -1988,6 +2022,15 @@ loop_again:
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
 			note_zone_scanning_priority(zone, priority);
+
+			nid = pgdat->node_id;
+			zid = zone_idx(zone);
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 * For now we ignore the return value
+			 */
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
+							nid, zid);
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
-- 
cgit v1.2.3-71-gd317


From a7f0765edfd53aed09cb7b0e15863688b39447de Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 23 Sep 2009 15:56:44 -0700
Subject: ptrace: __ptrace_detach: do __wake_up_parent() if we reap the tracee

The bug is old, it wasn't cause by recent changes.

Test case:

	static void *tfunc(void *arg)
	{
		int pid = (long)arg;

		assert(ptrace(PTRACE_ATTACH, pid, NULL, NULL) == 0);
		kill(pid, SIGKILL);

		sleep(1);
		return NULL;
	}

	int main(void)
	{
		pthread_t th;
		long pid = fork();

		if (!pid)
			pause();

		signal(SIGCHLD, SIG_IGN);
		assert(pthread_create(&th, NULL, tfunc, (void*)pid) == 0);

		int r = waitpid(-1, NULL, __WNOTHREAD);
		printf("waitpid: %d %m\n", r);

		return 0;
	}

Before the patch this program hangs, after this patch waitpid() correctly
fails with errno == -ECHILD.

The problem is, __ptrace_detach() reaps the EXIT_ZOMBIE tracee if its
->real_parent is our sub-thread and we ignore SIGCHLD.  But in this case
we should wake up other threads which can sleep in do_wait().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Vitaly Mayatskikh <vmayatsk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  1 +
 kernel/exit.c         |  5 +++++
 kernel/ptrace.c       | 11 +++++++----
 kernel/signal.c       |  9 ---------
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 848d1f20086e..9e5a88afe6be 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2059,6 +2059,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/exit.c b/kernel/exit.c
index 60d6fdcc9265..782b2e1f7ca2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1575,6 +1575,11 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	return 0;
 }
 
+void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
+{
+	wake_up_interruptible_sync(&parent->signal->wait_chldexit);
+}
+
 static long do_wait(struct wait_opts *wo)
 {
 	DECLARE_WAITQUEUE(wait, current);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59e..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
  * or self-reaping.  Do notification now if it would have happened earlier.
  * If it should reap itself, return true.
  *
- * If it's our own child, there is no notification to do.
- * But if our normal children self-reap, then this child
- * was prevented by ptrace and we must reap it now.
+ * If it's our own child, there is no notification to do. But if our normal
+ * children self-reap, then this child was prevented by ptrace and we must
+ * reap it now, in that case we must also wake up sub-threads sleeping in
+ * do_wait().
  */
 static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 {
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 		if (!task_detached(p) && thread_group_empty(p)) {
 			if (!same_thread_group(p->real_parent, tracer))
 				do_notify_parent(p, p->exit_signal);
-			else if (ignoring_children(tracer->sighand))
+			else if (ignoring_children(tracer->sighand)) {
+				__wake_up_parent(p, tracer);
 				p->exit_signal = -1;
+			}
 		}
 		if (task_detached(p)) {
 			/* Mark it as in the process of being reaped. */
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..534ea81cde47 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1382,15 +1382,6 @@ ret:
 	return ret;
 }
 
-/*
- * Wake up any threads in the parent blocked in wait* syscalls.
- */
-static inline void __wake_up_parent(struct task_struct *p,
-				    struct task_struct *parent)
-{
-	wake_up_interruptible_sync(&parent->signal->wait_chldexit);
-}
-
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
-- 
cgit v1.2.3-71-gd317


From ae6d2ed7bb3877ff35b9569402025f40ea2e1803 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 23 Sep 2009 15:56:53 -0700
Subject: signals: tracehook_notify_jctl change

This changes tracehook_notify_jctl() so it's called with the siglock held,
and changes its argument and return value definition.  These clean-ups
make it a better fit for what new tracing hooks need to check.

Tracing needs the siglock here, held from the time TASK_STOPPED was set,
to avoid potential SIGCONT races if it wants to allow any blocking in its
tracing hooks.

This also folds the finish_stop() function into its caller
do_signal_stop().  The function is short, called only once and only
unconditionally.  It aids readability to fold it in.

[oleg@redhat.com: do not call tracehook_notify_jctl() in TASK_STOPPED state]
[oleg@redhat.com: introduce tracehook_finish_jctl() helper]
Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 34 ++++++++++++-----
 kernel/signal.c           | 97 +++++++++++++++++++++++------------------------
 2 files changed, 72 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 17ba82efa483..1eb44a924e56 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -1,7 +1,7 @@
 /*
  * Tracing hooks
  *
- * Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -463,22 +463,38 @@ static inline int tracehook_get_signal(struct task_struct *task,
 
 /**
  * tracehook_notify_jctl - report about job control stop/continue
- * @notify:		nonzero if this is the last thread in the group to stop
+ * @notify:		zero, %CLD_STOPPED or %CLD_CONTINUED
  * @why:		%CLD_STOPPED or %CLD_CONTINUED
  *
  * This is called when we might call do_notify_parent_cldstop().
- * It's called when about to stop for job control; we are already in
- * %TASK_STOPPED state, about to call schedule().  It's also called when
- * a delayed %CLD_STOPPED or %CLD_CONTINUED report is ready to be made.
  *
- * Return nonzero to generate a %SIGCHLD with @why, which is
- * normal if @notify is nonzero.
+ * @notify is zero if we would not ordinarily send a %SIGCHLD,
+ * or is the %CLD_STOPPED or %CLD_CONTINUED .si_code for %SIGCHLD.
  *
- * Called with no locks held.
+ * @why is %CLD_STOPPED when about to stop for job control;
+ * we are already in %TASK_STOPPED state, about to call schedule().
+ * It might also be that we have just exited (check %PF_EXITING),
+ * but need to report that a group-wide stop is complete.
+ *
+ * @why is %CLD_CONTINUED when waking up after job control stop and
+ * ready to make a delayed @notify report.
+ *
+ * Return the %CLD_* value for %SIGCHLD, or zero to generate no signal.
+ *
+ * Called with the siglock held.
  */
 static inline int tracehook_notify_jctl(int notify, int why)
 {
-	return notify || (current->ptrace & PT_PTRACED);
+	return notify ?: (current->ptrace & PT_PTRACED) ? why : 0;
+}
+
+/**
+ * tracehook_finish_jctl - report about return from job control stop
+ *
+ * This is called by do_signal_stop() after wakeup.
+ */
+static inline void tracehook_finish_jctl(void)
+{
 }
 
 #define DEATH_REAP			-1
diff --git a/kernel/signal.c b/kernel/signal.c
index 534ea81cde47..5d3b3f8f219b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 
 		if (why) {
 			/*
-			 * The first thread which returns from finish_stop()
+			 * The first thread which returns from do_signal_stop()
 			 * will take ->siglock, notice SIGNAL_CLD_MASK, and
 			 * notify its parent. See get_signal_to_deliver().
 			 */
@@ -1664,29 +1664,6 @@ void ptrace_notify(int exit_code)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-static void
-finish_stop(int stop_count)
-{
-	/*
-	 * If there are no other threads in the group, or if there is
-	 * a group stop in progress and we are the last to stop,
-	 * report to the parent.  When ptraced, every thread reports itself.
-	 */
-	if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current, CLD_STOPPED);
-		read_unlock(&tasklist_lock);
-	}
-
-	do {
-		schedule();
-	} while (try_to_freeze());
-	/*
-	 * Now we don't run again until continued.
-	 */
-	current->exit_code = 0;
-}
-
 /*
  * This performs the stopping for SIGSTOP and other stop signals.
  * We have to stop all threads in the thread group.
@@ -1696,15 +1673,9 @@ finish_stop(int stop_count)
 static int do_signal_stop(int signr)
 {
 	struct signal_struct *sig = current->signal;
-	int stop_count;
+	int notify;
 
-	if (sig->group_stop_count > 0) {
-		/*
-		 * There is a group stop in progress.  We don't need to
-		 * start another one.
-		 */
-		stop_count = --sig->group_stop_count;
-	} else {
+	if (!sig->group_stop_count) {
 		struct task_struct *t;
 
 		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1716,7 +1687,7 @@ static int do_signal_stop(int signr)
 		 */
 		sig->group_exit_code = signr;
 
-		stop_count = 0;
+		sig->group_stop_count = 1;
 		for (t = next_thread(current); t != current; t = next_thread(t))
 			/*
 			 * Setting state to TASK_STOPPED for a group
@@ -1725,19 +1696,44 @@ static int do_signal_stop(int signr)
 			 */
 			if (!(t->flags & PF_EXITING) &&
 			    !task_is_stopped_or_traced(t)) {
-				stop_count++;
+				sig->group_stop_count++;
 				signal_wake_up(t, 0);
 			}
-		sig->group_stop_count = stop_count;
 	}
+	/*
+	 * If there are no other threads in the group, or if there is
+	 * a group stop in progress and we are the last to stop, report
+	 * to the parent.  When ptraced, every thread reports itself.
+	 */
+	notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
+	notify = tracehook_notify_jctl(notify, CLD_STOPPED);
+	/*
+	 * tracehook_notify_jctl() can drop and reacquire siglock, so
+	 * we keep ->group_stop_count != 0 before the call. If SIGCONT
+	 * or SIGKILL comes in between ->group_stop_count == 0.
+	 */
+	if (sig->group_stop_count) {
+		if (!--sig->group_stop_count)
+			sig->flags = SIGNAL_STOP_STOPPED;
+		current->exit_code = sig->group_exit_code;
+		__set_current_state(TASK_STOPPED);
+	}
+	spin_unlock_irq(&current->sighand->siglock);
 
-	if (stop_count == 0)
-		sig->flags = SIGNAL_STOP_STOPPED;
-	current->exit_code = sig->group_exit_code;
-	__set_current_state(TASK_STOPPED);
+	if (notify) {
+		read_lock(&tasklist_lock);
+		do_notify_parent_cldstop(current, notify);
+		read_unlock(&tasklist_lock);
+	}
+
+	/* Now we don't run again until woken by SIGCONT or SIGKILL */
+	do {
+		schedule();
+	} while (try_to_freeze());
+
+	tracehook_finish_jctl();
+	current->exit_code = 0;
 
-	spin_unlock_irq(&current->sighand->siglock);
-	finish_stop(stop_count);
 	return 1;
 }
 
@@ -1806,14 +1802,15 @@ relock:
 		int why = (signal->flags & SIGNAL_STOP_CONTINUED)
 				? CLD_CONTINUED : CLD_STOPPED;
 		signal->flags &= ~SIGNAL_CLD_MASK;
-		spin_unlock_irq(&sighand->siglock);
 
-		if (unlikely(!tracehook_notify_jctl(1, why)))
-			goto relock;
+		why = tracehook_notify_jctl(why, CLD_CONTINUED);
+		spin_unlock_irq(&sighand->siglock);
 
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current->group_leader, why);
-		read_unlock(&tasklist_lock);
+		if (why) {
+			read_lock(&tasklist_lock);
+			do_notify_parent_cldstop(current->group_leader, why);
+			read_unlock(&tasklist_lock);
+		}
 		goto relock;
 	}
 
@@ -1978,14 +1975,14 @@ void exit_signals(struct task_struct *tsk)
 	if (unlikely(tsk->signal->group_stop_count) &&
 			!--tsk->signal->group_stop_count) {
 		tsk->signal->flags = SIGNAL_STOP_STOPPED;
-		group_stop = 1;
+		group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
 	}
 out:
 	spin_unlock_irq(&tsk->sighand->siglock);
 
-	if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
+	if (unlikely(group_stop)) {
 		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(tsk, CLD_STOPPED);
+		do_notify_parent_cldstop(tsk, group_stop);
 		read_unlock(&tasklist_lock);
 	}
 }
-- 
cgit v1.2.3-71-gd317


From 964ee7df90d799e38fb1556c57cd5c45fc736436 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 23 Sep 2009 15:56:59 -0700
Subject: exec: fix set_binfmt() vs sys_delete_module() race

sys_delete_module() can set MODULE_STATE_GOING after
search_binary_handler() does try_module_get().  In this case
set_binfmt()->try_module_get() fails but since none of the callers
check the returned error, the task will run with the wrong old
->binfmt.

The proper fix should change all ->load_binary() methods, but we can
rely on fact that the caller must hold a reference to binfmt->module
and use __module_get() which never fails.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c               | 14 +++++---------
 include/linux/binfmts.h |  2 +-
 2 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 8efbdc606a1e..6dc92c39dd94 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1395,18 +1395,14 @@ out_ret:
 	return retval;
 }
 
-int set_binfmt(struct linux_binfmt *new)
+void set_binfmt(struct linux_binfmt *new)
 {
-	struct linux_binfmt *old = current->binfmt;
+	if (current->binfmt)
+		module_put(current->binfmt->module);
 
-	if (new) {
-		if (!try_module_get(new->module))
-			return -1;
-	}
 	current->binfmt = new;
-	if (old)
-		module_put(old->module);
-	return 0;
+	if (new)
+		__module_get(new->module);
 }
 
 EXPORT_SYMBOL(set_binfmt);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 2046b5b8af48..aece486ac734 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -120,7 +120,7 @@ extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
 extern int prepare_bprm_creds(struct linux_binprm *bprm);
 extern void install_exec_creds(struct linux_binprm *bprm);
 extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
-extern int set_binfmt(struct linux_binfmt *new);
+extern void set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3-71-gd317


From 4a30debfb778240a4b1767d4b0c5a5b25ab97160 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 23 Sep 2009 15:57:00 -0700
Subject: signals: introduce do_send_sig_info() helper

Introduce do_send_sig_info() and convert group_send_sig_info(),
send_sig_info(), do_send_specific() to use this helper.

Hopefully it will have more users soon, it allows to specify
specific/group behaviour via "bool group" argument.

Shaves 80 bytes from .text.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/signal.h |  2 ++
 kernel/signal.c        | 56 ++++++++++++++++++++++++--------------------------
 2 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index c7552836bd95..ab9272cc270c 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -233,6 +233,8 @@ static inline int valid_signal(unsigned long sig)
 }
 
 extern int next_signal(struct sigpending *pending, sigset_t *mask);
+extern int do_send_sig_info(int sig, struct siginfo *info,
+				struct task_struct *p, bool group);
 extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
 extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
diff --git a/kernel/signal.c b/kernel/signal.c
index 5d3b3f8f219b..c6d7a24a86a1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	return send_signal(sig, info, t, 0);
 }
 
+int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+			bool group)
+{
+	unsigned long flags;
+	int ret = -ESRCH;
+
+	if (lock_task_sighand(p, &flags)) {
+		ret = send_signal(sig, info, p, group);
+		unlock_task_sighand(p, &flags);
+	}
+
+	return ret;
+}
+
 /*
  * Force a signal that the process can't ignore: if necessary
  * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1068,18 +1082,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
  */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-	unsigned long flags;
-	int ret;
-
-	ret = check_kill_permission(sig, info, p);
+	int ret = check_kill_permission(sig, info, p);
 
-	if (!ret && sig) {
-		ret = -ESRCH;
-		if (lock_task_sighand(p, &flags)) {
-			ret = __group_send_sig_info(sig, info, p);
-			unlock_task_sighand(p, &flags);
-		}
-	}
+	if (!ret && sig)
+		ret = do_send_sig_info(sig, info, p, true);
 
 	return ret;
 }
@@ -1224,15 +1230,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
  * These are for backward compatibility with the rest of the kernel source.
  */
 
-/*
- * The caller must ensure the task can't exit.
- */
 int
 send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-	int ret;
-	unsigned long flags;
-
 	/*
 	 * Make sure legacy kernel users don't send in bad values
 	 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1240,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	if (!valid_signal(sig))
 		return -EINVAL;
 
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-	ret = specific_send_sig_info(sig, info, p);
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	return ret;
+	return do_send_sig_info(sig, info, p, false);
 }
 
 #define __si_special(priv) \
@@ -2278,7 +2275,6 @@ static int
 do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 {
 	struct task_struct *p;
-	unsigned long flags;
 	int error = -ESRCH;
 
 	rcu_read_lock();
@@ -2288,14 +2284,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 		/*
 		 * The null signal is a permissions and process existence
 		 * probe.  No signal is actually delivered.
-		 *
-		 * If lock_task_sighand() fails we pretend the task dies
-		 * after receiving the signal. The window is tiny, and the
-		 * signal is private anyway.
 		 */
-		if (!error && sig && lock_task_sighand(p, &flags)) {
-			error = specific_send_sig_info(sig, info, p);
-			unlock_task_sighand(p, &flags);
+		if (!error && sig) {
+			error = do_send_sig_info(sig, info, p, false);
+			/*
+			 * If lock_task_sighand() failed we pretend the task
+			 * dies after receiving the signal. The window is tiny,
+			 * and the signal is private anyway.
+			 */
+			if (unlikely(error == -ESRCH))
+				error = 0;
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3-71-gd317


From d9588725e52650e82989707f8fd2feb67ad2dc8e Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 23 Sep 2009 15:57:04 -0700
Subject: signals: inline __fatal_signal_pending

__fatal_signal_pending inlines to one instruction on x86, probably two
instructions on other machines.  It takes two longer x86 instructions just
to call it and test its return value, not to mention the function itself.

On my random x86_64 config, this saved 70 bytes of text (59 of those being
__fatal_signal_pending itself).

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 5 ++++-
 kernel/signal.c       | 6 ------
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e5a88afe6be..e951bd2eb9fc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2337,7 +2337,10 @@ static inline int signal_pending(struct task_struct *p)
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
 }
 
-extern int __fatal_signal_pending(struct task_struct *p);
+static inline int __fatal_signal_pending(struct task_struct *p)
+{
+	return unlikely(sigismember(&p->pending.signal, SIGKILL));
+}
 
 static inline int fatal_signal_pending(struct task_struct *p)
 {
diff --git a/kernel/signal.c b/kernel/signal.c
index c6d7a24a86a1..6705320784fd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1050,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
 	}
 }
 
-int __fatal_signal_pending(struct task_struct *tsk)
-{
-	return sigismember(&tsk->pending.signal, SIGKILL);
-}
-EXPORT_SYMBOL(__fatal_signal_pending);
-
 struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
 {
 	struct sighand_struct *sighand;
-- 
cgit v1.2.3-71-gd317


From 8d65af789f3e2cf4cfbdbf71a0f7a61ebcd41d38 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 23 Sep 2009 15:57:19 -0700
Subject: sysctl: remove "struct file *" argument of ->proc_handler

It's unused.

It isn't needed -- read or write flag is already passed and sysctl
shouldn't care about the rest.

It _was_ used in two places at arch/frv for some reason.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/kernel/pm.c               | 14 +++----
 arch/mips/lasat/sysctl.c           | 18 ++++-----
 arch/s390/appldata/appldata_base.c |  9 ++---
 arch/s390/kernel/debug.c           |  4 +-
 arch/s390/mm/cmm.c                 |  4 +-
 arch/x86/include/asm/nmi.h         |  3 +-
 arch/x86/kernel/apic/nmi.c         |  4 +-
 arch/x86/kernel/vsyscall_64.c      | 10 +----
 drivers/cdrom/cdrom.c              |  8 ++--
 drivers/char/random.c              |  4 +-
 drivers/net/wireless/arlan-proc.c  | 28 +++++++-------
 drivers/parport/procfs.c           | 12 +++---
 fs/coda/coda_int.h                 |  1 +
 fs/drop_caches.c                   |  4 +-
 fs/file_table.c                    |  6 +--
 fs/proc/proc_sysctl.c              |  2 +-
 fs/xfs/linux-2.6/xfs_sysctl.c      |  3 +-
 include/linux/fs.h                 |  2 +-
 include/linux/ftrace.h             |  4 +-
 include/linux/hugetlb.h            |  6 +--
 include/linux/mm.h                 |  2 +-
 include/linux/mmzone.h             | 13 +++----
 include/linux/sched.h              |  8 ++--
 include/linux/security.h           |  2 +-
 include/linux/swap.h               |  2 +-
 include/linux/sysctl.h             | 19 +++++-----
 include/linux/writeback.h          | 11 +++---
 include/net/ip.h                   |  2 +-
 include/net/ndisc.h                |  2 -
 ipc/ipc_sysctl.c                   | 16 ++++----
 ipc/mq_sysctl.c                    |  8 ++--
 kernel/hung_task.c                 |  4 +-
 kernel/sched.c                     |  4 +-
 kernel/sched_fair.c                |  4 +-
 kernel/slow-work.c                 | 12 +++---
 kernel/softlockup.c                |  4 +-
 kernel/sysctl.c                    | 78 ++++++++++++++++----------------------
 kernel/trace/ftrace.c              |  4 +-
 kernel/trace/trace_stack.c         |  4 +-
 kernel/utsname_sysctl.c            |  4 +-
 mm/hugetlb.c                       | 12 +++---
 mm/page-writeback.c                | 20 +++++-----
 mm/page_alloc.c                    | 24 ++++++------
 mm/vmscan.c                        |  4 +-
 net/bridge/br_netfilter.c          |  4 +-
 net/decnet/dn_dev.c                |  5 +--
 net/decnet/sysctl_net_decnet.c     |  2 -
 net/ipv4/devinet.c                 | 12 +++---
 net/ipv4/route.c                   |  7 ++--
 net/ipv4/sysctl_net_ipv4.c         | 16 ++++----
 net/ipv6/addrconf.c                |  8 ++--
 net/ipv6/ndisc.c                   |  8 ++--
 net/ipv6/route.c                   |  4 +-
 net/irda/irsysctl.c                |  8 ++--
 net/netfilter/ipvs/ip_vs_ctl.c     |  8 ++--
 net/netfilter/nf_log.c             |  4 +-
 net/phonet/sysctl.c                |  4 +-
 net/sunrpc/sysctl.c                |  4 +-
 net/sunrpc/xprtrdma/svc_rdma.c     |  2 +-
 security/min_addr.c                |  4 +-
 60 files changed, 239 insertions(+), 270 deletions(-)

(limited to 'include/linux')

diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c
index be722fc1acff..0d4d3e3a4cfc 100644
--- a/arch/frv/kernel/pm.c
+++ b/arch/frv/kernel/pm.c
@@ -150,7 +150,7 @@ static int user_atoi(char __user *ubuf, size_t len)
 /*
  * Send us to sleep.
  */
-static int sysctl_pm_do_suspend(ctl_table *ctl, int write, struct file *filp,
+static int sysctl_pm_do_suspend(ctl_table *ctl, int write,
 				void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int retval, mode;
@@ -198,13 +198,13 @@ static int try_set_cmode(int new_cmode)
 }
 
 
-static int cmode_procctl(ctl_table *ctl, int write, struct file *filp,
+static int cmode_procctl(ctl_table *ctl, int write,
 			 void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_cmode;
 
 	if (!write)
-		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
 	new_cmode = user_atoi(buffer, *lenp);
 
@@ -301,13 +301,13 @@ static int try_set_cm(int new_cm)
 	return 0;
 }
 
-static int p0_procctl(ctl_table *ctl, int write, struct file *filp,
+static int p0_procctl(ctl_table *ctl, int write,
 		      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_p0;
 
 	if (!write)
-		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
 	new_p0 = user_atoi(buffer, *lenp);
 
@@ -345,13 +345,13 @@ static int p0_sysctl(ctl_table *table,
 	return 1;
 }
 
-static int cm_procctl(ctl_table *ctl, int write, struct file *filp,
+static int cm_procctl(ctl_table *ctl, int write,
 		      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_cm;
 
 	if (!write)
-		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
 	new_cm = user_atoi(buffer, *lenp);
 
diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c
index 3f04d4c406b7..b3deed8db619 100644
--- a/arch/mips/lasat/sysctl.c
+++ b/arch/mips/lasat/sysctl.c
@@ -56,12 +56,12 @@ int sysctl_lasatstring(ctl_table *table,
 
 
 /* And the same for proc */
-int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
+int proc_dolasatstring(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
 
-	r = proc_dostring(table, write, filp, buffer, lenp, ppos);
+	r = proc_dostring(table, write, buffer, lenp, ppos);
 	if ((!write) || r)
 		return r;
 
@@ -71,12 +71,12 @@ int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
 }
 
 /* proc function to write EEPROM after changing int entry */
-int proc_dolasatint(ctl_table *table, int write, struct file *filp,
+int proc_dolasatint(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
 
-	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, buffer, lenp, ppos);
 	if ((!write) || r)
 		return r;
 
@@ -89,7 +89,7 @@ int proc_dolasatint(ctl_table *table, int write, struct file *filp,
 static int rtctmp;
 
 /* proc function to read/write RealTime Clock */
-int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
+int proc_dolasatrtc(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct timespec ts;
@@ -102,7 +102,7 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
 		if (rtctmp < 0)
 			rtctmp = 0;
 	}
-	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (r)
 		return r;
 
@@ -154,7 +154,7 @@ int sysctl_lasat_rtc(ctl_table *table,
 #endif
 
 #ifdef CONFIG_INET
-int proc_lasat_ip(ctl_table *table, int write, struct file *filp,
+int proc_lasat_ip(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	unsigned int ip;
@@ -231,12 +231,12 @@ static int sysctl_lasat_prid(ctl_table *table,
 	return 0;
 }
 
-int proc_lasat_prid(ctl_table *table, int write, struct file *filp,
+int proc_lasat_prid(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
 
-	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (r < 0)
 		return r;
 	if (write) {
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 264528e4f58d..b55fd7ed1c31 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -50,10 +50,9 @@ static struct platform_device *appldata_pdev;
  * /proc entries (sysctl)
  */
 static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
-static int appldata_timer_handler(ctl_table *ctl, int write, struct file *filp,
+static int appldata_timer_handler(ctl_table *ctl, int write,
 				  void __user *buffer, size_t *lenp, loff_t *ppos);
 static int appldata_interval_handler(ctl_table *ctl, int write,
-					 struct file *filp,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
 
@@ -247,7 +246,7 @@ __appldata_vtimer_setup(int cmd)
  * Start/Stop timer, show status of timer (0 = not active, 1 = active)
  */
 static int
-appldata_timer_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_timer_handler(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int len;
@@ -289,7 +288,7 @@ out:
  * current timer interval.
  */
 static int
-appldata_interval_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_interval_handler(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int len, interval;
@@ -335,7 +334,7 @@ out:
  * monitoring (0 = not in process, 1 = in process)
  */
 static int
-appldata_generic_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_generic_handler(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct appldata_ops *ops = NULL, *tmp_ops;
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 4c512561687d..20f282c911c2 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -881,11 +881,11 @@ static int debug_active=1;
  * if debug_active is already off
  */
 static int
-s390dbf_procactive(ctl_table *table, int write, struct file *filp,
+s390dbf_procactive(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	if (!write || debug_stoppable || !debug_active)
-		return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+		return proc_dointvec(table, write, buffer, lenp, ppos);
 	else
 		return 0;
 }
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 413c240cbca7..b201135cc18c 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -262,7 +262,7 @@ cmm_skip_blanks(char *cp, char **endp)
 static struct ctl_table cmm_table[];
 
 static int
-cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
+cmm_pages_handler(ctl_table *ctl, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char buf[16], *p;
@@ -303,7 +303,7 @@ cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
 }
 
 static int
-cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp,
+cmm_timeout_handler(ctl_table *ctl, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char buf[64], *p;
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index e63cf7d441e1..139d4c1a33a7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog;
 #define NMI_INVALID	3
 
 struct ctl_table;
-struct file;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+extern int proc_nmi_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
 
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index cb66a22d98ad..7ff61d6a188a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 /*
  * proc handler for /proc/sys/kernel/nmi
  */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos)
 {
 	int old_state;
 
 	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
 	old_state = nmi_watchdog_enabled;
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	if (!!old_state == !!nmi_watchdog_enabled)
 		return 0;
 
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index cf53a78e2dcf..8cb4974ff599 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -228,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
 }
 
 #ifdef CONFIG_SYSCTL
-
-static int
-vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
-		       void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-}
-
 static ctl_table kernel_table2[] = {
 	{ .procname = "vsyscall64",
 	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
 	  .mode = 0644,
-	  .proc_handler = vsyscall_sysctl_change },
+	  .proc_handler = proc_dointvec },
 	{}
 };
 
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 71d1b9bab70b..614da5b8613a 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3412,7 +3412,7 @@ static int cdrom_print_info(const char *header, int val, char *info,
 	return 0;
 }
 
-static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp,
+static int cdrom_sysctl_info(ctl_table *ctl, int write,
                            void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int pos;
@@ -3489,7 +3489,7 @@ static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp,
 		goto done;
 doit:
 	mutex_unlock(&cdrom_mutex);
-	return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	return proc_dostring(ctl, write, buffer, lenp, ppos);
 done:
 	printk(KERN_INFO "cdrom: info buffer too small\n");
 	goto doit;
@@ -3525,12 +3525,12 @@ static void cdrom_update_settings(void)
 	mutex_unlock(&cdrom_mutex);
 }
 
-static int cdrom_sysctl_handler(ctl_table *ctl, int write, struct file * filp,
+static int cdrom_sysctl_handler(ctl_table *ctl, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write) {
 	
diff --git a/drivers/char/random.c b/drivers/char/random.c
index d8a9255e1a3f..04b505e5a5e2 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1231,7 +1231,7 @@ static char sysctl_bootid[16];
  * as an ASCII string in the standard UUID format.  If accesses via the
  * sysctl system call, it is returned as 16 bytes of binary data.
  */
-static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
+static int proc_do_uuid(ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	ctl_table fake_table;
@@ -1254,7 +1254,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
 	fake_table.data = buf;
 	fake_table.maxlen = sizeof(buf);
 
-	return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos);
+	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
 }
 
 static int uuid_strategy(ctl_table *table,
diff --git a/drivers/net/wireless/arlan-proc.c b/drivers/net/wireless/arlan-proc.c
index 2ab1d59870f4..a8b689635a3b 100644
--- a/drivers/net/wireless/arlan-proc.c
+++ b/drivers/net/wireless/arlan-proc.c
@@ -402,7 +402,7 @@ static int arlan_setup_card_by_book(struct net_device *dev)
 
 static char arlan_drive_info[ARLAN_STR_SIZE] = "A655\n\0";
 
-static int arlan_sysctl_info(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info(ctl_table * ctl, int write,
 		      void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -629,7 +629,7 @@ final:
 	*lenp = pos;
 
 	if (!write)
-		retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+		retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	else
 	{
 		*lenp = 0;
@@ -639,7 +639,7 @@ final:
 }
 
 
-static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info161719(ctl_table * ctl, int write,
 			    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -669,11 +669,11 @@ static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp
 
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
-static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_infotxRing(ctl_table * ctl, int write,
 			    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -698,11 +698,11 @@ static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp
 	SARLBNpln(u_char, txBuffer, 0x800);
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
-static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_inforxRing(ctl_table * ctl, int write,
 			    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -726,11 +726,11 @@ static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp
 	SARLBNpln(u_char, rxBuffer, 0x800);
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
-static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info18(ctl_table * ctl, int write,
 			void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -756,7 +756,7 @@ static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp,
 
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
@@ -766,7 +766,7 @@ final:
 
 static char conf_reset_result[200];
 
-static int arlan_configure(ctl_table * ctl, int write, struct file *filp,
+static int arlan_configure(ctl_table * ctl, int write,
 		    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int pos = 0;
@@ -788,10 +788,10 @@ static int arlan_configure(ctl_table * ctl, int write, struct file *filp,
 		return -1;
 
 	*lenp = pos;
-	return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	return proc_dostring(ctl, write, buffer, lenp, ppos);
 }
 
-static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_reset(ctl_table * ctl, int write,
 		       void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int pos = 0;
@@ -811,7 +811,7 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp,
 	} else
 		return -1;
 	*lenp = pos + 3;
-	return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	return proc_dostring(ctl, write, buffer, lenp, ppos);
 }
 
 
diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index 554e11f9e1ce..8eefe56f1cbe 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -31,7 +31,7 @@
 #define PARPORT_MIN_SPINTIME_VALUE 1
 #define PARPORT_MAX_SPINTIME_VALUE 1000
 
-static int do_active_device(ctl_table *table, int write, struct file *filp,
+static int do_active_device(ctl_table *table, int write,
 		      void __user *result, size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -68,7 +68,7 @@ static int do_active_device(ctl_table *table, int write, struct file *filp,
 }
 
 #ifdef CONFIG_PARPORT_1284
-static int do_autoprobe(ctl_table *table, int write, struct file *filp,
+static int do_autoprobe(ctl_table *table, int write,
 			void __user *result, size_t *lenp, loff_t *ppos)
 {
 	struct parport_device_info *info = table->extra2;
@@ -111,7 +111,7 @@ static int do_autoprobe(ctl_table *table, int write, struct file *filp,
 #endif /* IEEE1284.3 support. */
 
 static int do_hardware_base_addr (ctl_table *table, int write,
-				  struct file *filp, void __user *result,
+				  void __user *result,
 				  size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -139,7 +139,7 @@ static int do_hardware_base_addr (ctl_table *table, int write,
 }
 
 static int do_hardware_irq (ctl_table *table, int write,
-			    struct file *filp, void __user *result,
+			    void __user *result,
 			    size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -167,7 +167,7 @@ static int do_hardware_irq (ctl_table *table, int write,
 }
 
 static int do_hardware_dma (ctl_table *table, int write,
-			    struct file *filp, void __user *result,
+			    void __user *result,
 			    size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -195,7 +195,7 @@ static int do_hardware_dma (ctl_table *table, int write,
 }
 
 static int do_hardware_modes (ctl_table *table, int write,
-			      struct file *filp, void __user *result,
+			      void __user *result,
 			      size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9c..d99860a33890 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
 #define _CODA_INT_
 
 struct dentry;
+struct file;
 
 extern struct file_system_type coda_fs_type;
 extern unsigned long coda_timeout;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb7913447..31f4b0e6d72c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
 }
 
 int drop_caches_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (write) {
 		if (sysctl_drop_caches & 1)
 			drop_pagecache();
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f8..8eb44042e009 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
  * Handle nr_files sysctl
  */
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	files_stat.nr_files = get_nr_files();
-	return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #else
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16bf..f667e8aeabdf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 
 	/* careful: calling conventions are nasty here */
 	res = count;
-	error = table->proc_handler(table, write, filp, buf, &res, ppos);
+	error = table->proc_handler(table, write, buf, &res, ppos);
 	if (!error)
 		error = res;
 out:
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb6083..c5bc67c4e3bb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
 xfs_stats_clear_proc_handler(
 	ctl_table	*ctl,
 	int		write,
-	struct file	*filp,
 	void		__user *buffer,
 	size_t		*lenp,
 	loff_t		*ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
 	int		c, ret, *valp = ctl->data;
 	__uint32_t	vn_active;
 
-	ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
 
 	if (!ret && write && *valp) {
 		printk("XFS Clearing xfsstats\n");
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 51803528b095..33ed6644abd0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2467,7 +2467,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
 			  size_t len, loff_t *ppos);
 
 struct ctl_table;
-int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
+int proc_nr_files(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
 int __init get_filesystem_list(char *buf);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 3c0924a18daf..cd3d2abaf30a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -19,7 +19,7 @@
 extern int ftrace_enabled;
 extern int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-		     struct file *filp, void __user *buffer, size_t *lenp,
+		     void __user *buffer, size_t *lenp,
 		     loff_t *ppos);
 
 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
@@ -94,7 +94,7 @@ static inline void ftrace_start(void) { }
 extern int stack_tracer_enabled;
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
-		   struct file *file, void __user *buffer, size_t *lenp,
+		   void __user *buffer, size_t *lenp,
 		   loff_t *ppos);
 #endif
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 176e7ee73eff..11ab19ac6b3d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,9 +20,9 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 }
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
-int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
-int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
-int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
+int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			struct page **, struct vm_area_struct **,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6eae5e3144b..87218ae84e36 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1279,7 +1279,7 @@ int in_gate_area_no_task(unsigned long addr);
 #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
-int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
+int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			unsigned long lru_pages);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 652ef01be582..6f7561730d88 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -755,21 +755,20 @@ static inline int is_dma(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-struct file;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
+int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
-			struct file *, void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
-			struct file *, void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 
 extern int numa_zonelist_order_handler(struct ctl_table *, int,
-			struct file *, void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN 16	/* string buffer size */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e951bd2eb9fc..811cd96524d7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -309,7 +309,7 @@ extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-				    struct file *filp, void __user *buffer,
+				    void __user *buffer,
 				    size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 extern int softlockup_thresh;
@@ -331,7 +331,7 @@ extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-					 struct file *filp, void __user *buffer,
+					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
 #endif
 
@@ -1906,7 +1906,7 @@ extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
-		struct file *file, void __user *buffer, size_t *length,
+		void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
 #ifdef CONFIG_SCHED_DEBUG
@@ -1924,7 +1924,7 @@ extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
 int sched_rt_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
 extern unsigned int sysctl_sched_compat_yield;
diff --git a/include/linux/security.h b/include/linux/security.h
index d050b66ab9ef..239e40d0450b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -133,7 +133,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 		return PAGE_ALIGN(mmap_min_addr);
 	return hint;
 }
-extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp,
+extern int mmap_min_addr_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 #ifdef CONFIG_SECURITY
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4c78fea989b9..82232dbea3f7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -245,7 +245,7 @@ extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
 
 extern unsigned long scan_unevictable_pages;
-extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
+extern int scan_unevictable_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int scan_unevictable_register_node(struct node *node);
 extern void scan_unevictable_unregister_node(struct node *node);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index e76d3b22a466..1e4743ee6831 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -29,7 +29,6 @@
 #include <linux/types.h>
 #include <linux/compiler.h>
 
-struct file;
 struct completion;
 
 #define CTL_MAXNAME 10		/* how many path components do we allow in a
@@ -977,25 +976,25 @@ typedef int ctl_handler (struct ctl_table *table,
 			 void __user *oldval, size_t __user *oldlenp,
 			 void __user *newval, size_t newlen);
 
-typedef int proc_handler (struct ctl_table *ctl, int write, struct file * filp,
+typedef int proc_handler (struct ctl_table *ctl, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos);
 
-extern int proc_dostring(struct ctl_table *, int, struct file *,
+extern int proc_dostring(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
-extern int proc_dointvec(struct ctl_table *, int, struct file *,
+extern int proc_dointvec(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
-extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_minmax(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
-extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_jiffies(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
-extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
-extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
 				    void __user *, size_t *, loff_t *);
-extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *,
+extern int proc_doulongvec_minmax(struct ctl_table *, int,
 				  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
-				      struct file *, void __user *, size_t *, loff_t *);
+				      void __user *, size_t *, loff_t *);
 
 extern int do_sysctl (int __user *name, int nlen,
 		      void __user *oldval, size_t __user *oldlenp,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 75cf58666ff9..66ebddcff664 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -110,21 +110,20 @@ extern int laptop_mode;
 extern unsigned long determine_dirtyable_memory(void);
 
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 extern int dirty_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
 struct ctl_table;
-struct file;
-int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
+int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
 
 void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
diff --git a/include/net/ip.h b/include/net/ip.h
index 72c36926c26d..5b26a0bd178e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -399,7 +399,7 @@ extern void	ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
  * fed into the routing cache should use these handlers.
  */
 int ipv4_doint_and_flush(ctl_table *ctl, int write,
-			 struct file* filp, void __user *buffer,
+			 void __user *buffer,
 			 size_t *lenp, loff_t *ppos);
 int ipv4_doint_and_flush_strategy(ctl_table *table,
 				  void __user *oldval, size_t __user *oldlenp,
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 1459ed3e2697..f76f22d05721 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -55,7 +55,6 @@ enum {
 #include <net/neighbour.h>
 
 struct ctl_table;
-struct file;
 struct inet6_dev;
 struct net_device;
 struct net_proto_family;
@@ -139,7 +138,6 @@ extern int			igmp6_event_report(struct sk_buff *skb);
 #ifdef CONFIG_SYSCTL
 extern int 			ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
 							   int write,
-							   struct file * filp,
 							   void __user *buffer,
 							   size_t *lenp,
 							   loff_t *ppos);
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 40eab7314aeb..7d3704750efc 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -27,18 +27,18 @@ static void *get_ipc(ctl_table *table)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
-static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+static int proc_ipc_dointvec(ctl_table *table, int write,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	memcpy(&ipc_table, table, sizeof(ipc_table));
 	ipc_table.data = get_ipc(table);
 
-	return proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
 }
 
 static int proc_ipc_callback_dointvec(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	size_t lenp_bef = *lenp;
@@ -47,7 +47,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
 	memcpy(&ipc_table, table, sizeof(ipc_table));
 	ipc_table.data = get_ipc(table);
 
-	rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
 
 	if (write && !rc && lenp_bef == *lenp)
 		/*
@@ -61,13 +61,13 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
 }
 
 static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	memcpy(&ipc_table, table, sizeof(ipc_table));
 	ipc_table.data = get_ipc(table);
 
-	return proc_doulongvec_minmax(&ipc_table, write, filp, buffer,
+	return proc_doulongvec_minmax(&ipc_table, write, buffer,
 					lenp, ppos);
 }
 
@@ -95,7 +95,7 @@ static void ipc_auto_callback(int val)
 }
 
 static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	size_t lenp_bef = *lenp;
@@ -106,7 +106,7 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
 	ipc_table.data = get_ipc(table);
 	oldval = *((int *)(ipc_table.data));
 
-	rc = proc_dointvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 
 	if (write && !rc && lenp_bef == *lenp) {
 		int newval = *((int *)(ipc_table.data));
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 24ae46dfe45d..8a058711fc10 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -31,24 +31,24 @@ static void *get_mq(ctl_table *table)
 	return which;
 }
 
-static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp,
+static int proc_mq_dointvec(ctl_table *table, int write,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table mq_table;
 	memcpy(&mq_table, table, sizeof(mq_table));
 	mq_table.data = get_mq(table);
 
-	return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
 }
 
 static int proc_mq_dointvec_minmax(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table mq_table;
 	memcpy(&mq_table, table, sizeof(mq_table));
 	mq_table.data = get_mq(table);
 
-	return proc_dointvec_minmax(&mq_table, write, filp, buffer,
+	return proc_dointvec_minmax(&mq_table, write, buffer,
 					lenp, ppos);
 }
 #else
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..d4e841747400 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
  * Process updating of timeout sysctl
  */
 int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-				  struct file *filp, void __user *buffer,
+				  void __user *buffer,
 				  size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		goto out;
diff --git a/kernel/sched.c b/kernel/sched.c
index 0d0361b9dbb3..ee61f454a98b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10312,7 +10312,7 @@ static int sched_rt_global_constraints(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 int sched_rt_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret;
@@ -10323,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
 	old_period = sysctl_sched_rt_period;
 	old_runtime = sysctl_sched_rt_runtime;
 
-	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret && write) {
 		ret = sched_rt_global_constraints();
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ecc637a0d591..4e777b47eeda 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 
 #ifdef CONFIG_SCHED_DEBUG
 int sched_nr_latency_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		return ret;
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..0d31135efbf4 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
 #ifdef CONFIG_SYSCTL
-static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+static int slow_work_min_threads_sysctl(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 
-static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
 					void __user *, size_t *, loff_t *);
 #endif
 
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
  * Handle adjustment of the minimum number of threads
  */
 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
-					struct file *filp, void __user *buffer,
+					void __user *buffer,
 					size_t *lenp, loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	int n;
 
 	if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
  * Handle adjustment of the maximum number of threads
  */
 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
-					struct file *filp, void __user *buffer,
+					void __user *buffer,
 					size_t *lenp, loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	int n;
 
 	if (ret == 0) {
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..81324d12eb35 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
 EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
 
 int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-			     struct file *filp, void __user *buffer,
+			     void __user *buffer,
 			     size_t *lenp, loff_t *ppos)
 {
 	touch_all_softlockup_watchdogs();
-	return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 }
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 37abb8c3995b..a02697b7cb97 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -163,9 +163,9 @@ extern int max_lock_depth;
 #endif
 
 #ifdef CONFIG_PROC_SYSCTL
-static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write,
 			       void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
@@ -2226,7 +2226,7 @@ void sysctl_head_put(struct ctl_table_header *head)
 #ifdef CONFIG_PROC_SYSCTL
 
 static int _proc_do_string(void* data, int maxlen, int write,
-			   struct file *filp, void __user *buffer,
+			   void __user *buffer,
 			   size_t *lenp, loff_t *ppos)
 {
 	size_t len;
@@ -2287,7 +2287,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
  * proc_dostring - read a string sysctl
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2301,10 +2300,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
  *
  * Returns 0 on success.
  */
-int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+int proc_dostring(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	return _proc_do_string(table->data, table->maxlen, write, filp,
+	return _proc_do_string(table->data, table->maxlen, write,
 			       buffer, lenp, ppos);
 }
 
@@ -2329,7 +2328,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
 }
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
-		  int write, struct file *filp, void __user *buffer,
+		  int write, void __user *buffer,
 		  size_t *lenp, loff_t *ppos,
 		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
@@ -2436,13 +2435,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 #undef TMPBUFLEN
 }
 
-static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+static int do_proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos,
 		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
-	return __do_proc_dointvec(table->data, table, write, filp,
+	return __do_proc_dointvec(table->data, table, write,
 			buffer, lenp, ppos, conv, data);
 }
 
@@ -2450,7 +2449,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
  * proc_dointvec - read a vector of integers
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2460,10 +2458,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
  *
  * Returns 0 on success.
  */
-int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
 		    	    NULL,NULL);
 }
 
@@ -2471,7 +2469,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
  * Taint values can only be increased
  * This means we can safely use a temporary.
  */
-static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write,
 			       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table t;
@@ -2483,7 +2481,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
 
 	t = *table;
 	t.data = &tmptaint;
-	err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
+	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
 	if (err < 0)
 		return err;
 
@@ -2535,7 +2533,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
  * proc_dointvec_minmax - read a vector of integers with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2548,19 +2545,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_minmax(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct do_proc_dointvec_minmax_conv_param param = {
 		.min = (int *) table->extra1,
 		.max = (int *) table->extra2,
 	};
-	return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
 static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-				     struct file *filp,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
@@ -2665,21 +2661,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 }
 
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
-				     struct file *filp,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
 	return __do_proc_doulongvec_minmax(table->data, table, write,
-			filp, buffer, lenp, ppos, convmul, convdiv);
+			buffer, lenp, ppos, convmul, convdiv);
 }
 
 /**
  * proc_doulongvec_minmax - read a vector of long integers with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2692,17 +2686,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
  *
  * Returns 0 on success.
  */
-int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
+    return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
 }
 
 /**
  * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2717,11 +2710,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
  * Returns 0 on success.
  */
 int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
-				      struct file *filp,
 				      void __user *buffer,
 				      size_t *lenp, loff_t *ppos)
 {
-    return do_proc_doulongvec_minmax(table, write, filp, buffer,
+    return do_proc_doulongvec_minmax(table, write, buffer,
 				     lenp, ppos, HZ, 1000l);
 }
 
@@ -2797,7 +2789,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
  * proc_dointvec_jiffies - read a vector of integers as seconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2809,10 +2800,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
 		    	    do_proc_dointvec_jiffies_conv,NULL);
 }
 
@@ -2820,7 +2811,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
  * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: pointer to the file position
@@ -2832,10 +2822,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
 		    	    do_proc_dointvec_userhz_jiffies_conv,NULL);
 }
 
@@ -2843,7 +2833,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
  * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2856,14 +2845,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
  *
  * Returns 0 on success.
  */
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
 			     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
 				do_proc_dointvec_ms_jiffies_conv, NULL);
 }
 
-static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+static int proc_do_cad_pid(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct pid *new_pid;
@@ -2872,7 +2861,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
 
 	tmp = pid_vnr(cad_pid);
 
-	r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
+	r = __do_proc_dointvec(&tmp, table, write, buffer,
 			       lenp, ppos, NULL, NULL);
 	if (r || !write)
 		return r;
@@ -2887,50 +2876,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
 
 #else /* CONFIG_PROC_FS */
 
-int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+int proc_dostring(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
 			     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
 int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
-				      struct file *filp,
 				      void __user *buffer,
 				      size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 23df7771c937..a142579765bf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3015,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-		     struct file *file, void __user *buffer, size_t *lenp,
+		     void __user *buffer, size_t *lenp,
 		     loff_t *ppos)
 {
 	int ret;
@@ -3025,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 	mutex_lock(&ftrace_lock);
 
-	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+	ret  = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
 		goto out;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a1..8504ac71e4e8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
 
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
-		   struct file *file, void __user *buffer, size_t *lenp,
+		   void __user *buffer, size_t *lenp,
 		   loff_t *ppos)
 {
 	int ret;
 
 	mutex_lock(&stack_sysctl_mutex);
 
-	ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (ret || !write ||
 	    (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..69eae358a726 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
  *	Special case of dostring for the UTS structure. This has locks
  *	to observe. Should this be in kernel/sys.c ????
  */
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+static int proc_do_uts_string(ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table uts_table;
 	int r;
 	memcpy(&uts_table, table, sizeof(uts_table));
 	uts_table.data = get_uts(table, write);
-	r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
+	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
 	return r;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 815dbd4a6dcb..6f048fcc749c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 
 #ifdef CONFIG_SYSCTL
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void __user *buffer,
+			   void __user *buffer,
 			   size_t *length, loff_t *ppos)
 {
 	struct hstate *h = &default_hstate;
@@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 
 	table->data = &tmp;
 	table->maxlen = sizeof(unsigned long);
-	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write)
 		h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 }
 
 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
-			struct file *file, void __user *buffer,
+			void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	if (hugepages_treat_as_movable)
 		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
 	else
@@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
 }
 
 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
-			struct file *file, void __user *buffer,
+			void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
 	struct hstate *h = &default_hstate;
@@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 	table->data = &tmp;
 	table->maxlen = sizeof(unsigned long);
-	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write) {
 		spin_lock(&hugetlb_lock);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5f378dd58802..be197f71b096 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -155,37 +155,37 @@ static void update_completion_period(void)
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write)
 		dirty_background_bytes = 0;
 	return ret;
 }
 
 int dirty_background_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write)
 		dirty_background_ratio = 0;
 	return ret;
 }
 
 int dirty_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int old_ratio = vm_dirty_ratio;
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
 		update_completion_period();
 		vm_dirty_bytes = 0;
@@ -195,13 +195,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 
 
 int dirty_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	unsigned long old_bytes = vm_dirty_bytes;
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
 		update_completion_period();
 		vm_dirty_ratio = 0;
@@ -686,9 +686,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	return 0;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5717f27a0704..88248b3c20bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2373,7 +2373,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
  * sysctl handler for numa_zonelist_order
  */
 int numa_zonelist_order_handler(ctl_table *table, int write,
-		struct file *file, void __user *buffer, size_t *length,
+		void __user *buffer, size_t *length,
 		loff_t *ppos)
 {
 	char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2382,7 +2382,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 	if (write)
 		strncpy(saved_string, (char*)table->data,
 			NUMA_ZONELIST_ORDER_LEN);
-	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	ret = proc_dostring(table, write, buffer, length, ppos);
 	if (ret)
 		return ret;
 	if (write) {
@@ -4706,9 +4706,9 @@ module_init(init_per_zone_wmark_min)
  *	changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	if (write)
 		setup_per_zone_wmarks();
 	return 0;
@@ -4716,12 +4716,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
 	int rc;
 
-	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (rc)
 		return rc;
 
@@ -4732,12 +4732,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 }
 
 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
 	int rc;
 
-	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (rc)
 		return rc;
 
@@ -4758,9 +4758,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
  * if in function of the boot time zone sizes.
  */
 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
 	setup_per_zone_lowmem_reserve();
 	return 0;
 }
@@ -4772,13 +4772,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
  */
 
 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
 	unsigned int cpu;
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (!write || (ret == -EINVAL))
 		return ret;
 	for_each_populated_zone(zone) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2423782214ab..f444b7409085 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2844,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void)
 unsigned long scan_unevictable_pages;
 
 int scan_unevictable_handler(struct ctl_table *table, int write,
-			   struct file *file, void __user *buffer,
+			   void __user *buffer,
 			   size_t *length, loff_t *ppos)
 {
-	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write && *(unsigned long *)table->data)
 		scan_all_zones_unevictable_pages();
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 907a82e9023d..a16a2342f6bf 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -965,12 +965,12 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
 
 #ifdef CONFIG_SYSCTL
 static
-int brnf_sysctl_call_tables(ctl_table * ctl, int write, struct file *filp,
+int brnf_sysctl_call_tables(ctl_table * ctl, int write,
 			    void __user * buffer, size_t * lenp, loff_t * ppos)
 {
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write && *(int *)(ctl->data))
 		*(int *)(ctl->data) = 1;
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 1c6a5bb6f0c8..6e1f085db06a 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -164,7 +164,7 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU
 static int min_priority[1];
 static int max_priority[] = { 127 }; /* From DECnet spec */
 
-static int dn_forwarding_proc(ctl_table *, int, struct file *,
+static int dn_forwarding_proc(ctl_table *, int,
 			void __user *, size_t *, loff_t *);
 static int dn_forwarding_sysctl(ctl_table *table,
 			void __user *oldval, size_t __user *oldlenp,
@@ -274,7 +274,6 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
 }
 
 static int dn_forwarding_proc(ctl_table *table, int write,
-				struct file *filep,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
@@ -290,7 +289,7 @@ static int dn_forwarding_proc(ctl_table *table, int write,
 	dn_db = dev->dn_ptr;
 	old = dn_db->parms.forwarding;
 
-	err = proc_dointvec(table, write, filep, buffer, lenp, ppos);
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if ((err >= 0) && write) {
 		if (dn_db->parms.forwarding < 0)
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 5bcd592ae6dd..26b0ab1e9f56 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -165,7 +165,6 @@ static int dn_node_address_strategy(ctl_table *table,
 }
 
 static int dn_node_address_handler(ctl_table *table, int write,
-				struct file *filp,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
@@ -276,7 +275,6 @@ static int dn_def_dev_strategy(ctl_table *table,
 
 
 static int dn_def_dev_handler(ctl_table *table, int write,
-				struct file * filp,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 07336c6201f0..e92f1fd28aa5 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1270,10 +1270,10 @@ static void inet_forward_change(struct net *net)
 }
 
 static int devinet_conf_proc(ctl_table *ctl, int write,
-			     struct file *filp, void __user *buffer,
+			     void __user *buffer,
 			     size_t *lenp, loff_t *ppos)
 {
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write) {
 		struct ipv4_devconf *cnf = ctl->extra1;
@@ -1342,12 +1342,12 @@ static int devinet_conf_sysctl(ctl_table *table,
 }
 
 static int devinet_sysctl_forward(ctl_table *ctl, int write,
-				  struct file *filp, void __user *buffer,
+				  void __user *buffer,
 				  size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write && *valp != val) {
 		struct net *net = ctl->extra2;
@@ -1372,12 +1372,12 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 }
 
 int ipv4_doint_and_flush(ctl_table *ctl, int write,
-			 struct file *filp, void __user *buffer,
+			 void __user *buffer,
 			 size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 	struct net *net = ctl->extra2;
 
 	if (write && *valp != val)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index df9347314538..bb4199252026 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3036,7 +3036,7 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 
 #ifdef CONFIG_SYSCTL
 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
-					struct file *filp, void __user *buffer,
+					void __user *buffer,
 					size_t *lenp, loff_t *ppos)
 {
 	if (write) {
@@ -3046,7 +3046,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 
 		memcpy(&ctl, __ctl, sizeof(ctl));
 		ctl.data = &flush_delay;
-		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
+		proc_dointvec(&ctl, write, buffer, lenp, ppos);
 
 		net = (struct net *)__ctl->extra1;
 		rt_cache_flush(net, flush_delay);
@@ -3106,12 +3106,11 @@ static void rt_secret_reschedule(int old)
 }
 
 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
-					  struct file *filp,
 					  void __user *buffer, size_t *lenp,
 					  loff_t *ppos)
 {
 	int old = ip_rt_secret_interval;
-	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
 
 	rt_secret_reschedule(old);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4710d219f06a..2dcf04d9b005 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -36,7 +36,7 @@ static void set_local_port_range(int range[2])
 }
 
 /* Validate changes from /proc interface. */
-static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
+static int ipv4_local_port_range(ctl_table *table, int write,
 				 void __user *buffer,
 				 size_t *lenp, loff_t *ppos)
 {
@@ -51,7 +51,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
 	};
 
 	inet_get_local_port_range(range, range + 1);
-	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
 		if (range[1] < range[0])
@@ -91,7 +91,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table,
 }
 
 
-static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+static int proc_tcp_congestion_control(ctl_table *ctl, int write,
 				       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char val[TCP_CA_NAME_MAX];
@@ -103,7 +103,7 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 
 	tcp_get_default_congestion_control(val);
 
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	if (write && ret == 0)
 		ret = tcp_set_default_congestion_control(val);
 	return ret;
@@ -129,7 +129,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table,
 }
 
 static int proc_tcp_available_congestion_control(ctl_table *ctl,
-						 int write, struct file * filp,
+						 int write,
 						 void __user *buffer, size_t *lenp,
 						 loff_t *ppos)
 {
@@ -140,13 +140,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,
 	if (!tbl.data)
 		return -ENOMEM;
 	tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	kfree(tbl.data);
 	return ret;
 }
 
 static int proc_allowed_congestion_control(ctl_table *ctl,
-					   int write, struct file * filp,
+					   int write,
 					   void __user *buffer, size_t *lenp,
 					   loff_t *ppos)
 {
@@ -158,7 +158,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
 		return -ENOMEM;
 
 	tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	if (write && ret == 0)
 		ret = tcp_set_allowed_congestion_control(tbl.data);
 	kfree(tbl.data);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 55f486d89c88..1fd0a3d775d2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3986,14 +3986,14 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 #ifdef CONFIG_SYSCTL
 
 static
-int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_forward(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write)
 		ret = addrconf_fixup_forwarding(ctl, valp, val);
@@ -4090,14 +4090,14 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old)
 }
 
 static
-int addrconf_sysctl_disable(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_disable(ctl_table *ctl, int write,
 			    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write)
 		ret = addrconf_disable_ipv6(ctl, valp, val);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7015478797f6..498b9b0b0fad 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1735,7 +1735,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
 	}
 }
 
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct net_device *dev = ctl->extra1;
 	struct inet6_dev *idev;
@@ -1746,16 +1746,16 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
 		ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
 
 	if (strcmp(ctl->procname, "retrans_time") == 0)
-		ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+		ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	else if (strcmp(ctl->procname, "base_reachable_time") == 0)
 		ret = proc_dointvec_jiffies(ctl, write,
-					    filp, buffer, lenp, ppos);
+					    buffer, lenp, ppos);
 
 	else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
 		 (strcmp(ctl->procname, "base_reachable_time_ms") == 0))
 		ret = proc_dointvec_ms_jiffies(ctl, write,
-					       filp, buffer, lenp, ppos);
+					       buffer, lenp, ppos);
 	else
 		ret = -1;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 77aecbe8ff6c..d6fe7646a8ff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2524,13 +2524,13 @@ static const struct file_operations rt6_stats_seq_fops = {
 #ifdef CONFIG_SYSCTL
 
 static
-int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
+int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
 			      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct net *net = current->nsproxy->net_ns;
 	int delay = net->ipv6.sysctl.flush_delay;
 	if (write) {
-		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+		proc_dointvec(ctl, write, buffer, lenp, ppos);
 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
 		return 0;
 	} else
diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c
index 57f8817c3979..5c86567e5a78 100644
--- a/net/irda/irsysctl.c
+++ b/net/irda/irsysctl.c
@@ -73,12 +73,12 @@ static int min_lap_keepalive_time = 100;	/* 100us */
 /* For other sysctl, I've no idea of the range. Maybe Dag could help
  * us on that - Jean II */
 
-static int do_devname(ctl_table *table, int write, struct file *filp,
+static int do_devname(ctl_table *table, int write,
 		      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_dostring(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write) {
 		struct ias_value *val;
 
@@ -90,12 +90,12 @@ static int do_devname(ctl_table *table, int write, struct file *filp,
 }
 
 
-static int do_discovery(ctl_table *table, int write, struct file *filp,
+static int do_discovery(ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int ret;
 
-       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
        if (ret)
 	       return ret;
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index fba2892b99e1..446e9bd4b4bc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1496,14 +1496,14 @@ static int ip_vs_zero_all(void)
 
 
 static int
-proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+proc_do_defense_mode(ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = table->data;
 	int val = *valp;
 	int rc;
 
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (write && (*valp != val)) {
 		if ((*valp < 0) || (*valp > 3)) {
 			/* Restore the correct value */
@@ -1517,7 +1517,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
 
 
 static int
-proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+proc_do_sync_threshold(ctl_table *table, int write,
 		       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = table->data;
@@ -1527,7 +1527,7 @@ proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
 	/* backup the value first */
 	memcpy(val, valp, sizeof(val));
 
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
 		/* Restore the correct value */
 		memcpy(valp, val, sizeof(val));
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 4e620305f28c..c93494fef8ef 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -226,7 +226,7 @@ static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
 static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
 static struct ctl_table_header *nf_log_dir_header;
 
-static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
+static int nf_log_proc_dostring(ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	const struct nf_logger *logger;
@@ -260,7 +260,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
 			table->data = "NONE";
 		else
 			table->data = logger->name;
-		r = proc_dostring(table, write, filp, buffer, lenp, ppos);
+		r = proc_dostring(table, write, buffer, lenp, ppos);
 		mutex_unlock(&nf_log_mutex);
 	}
 
diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c
index 7b5749ee2765..2220f3322326 100644
--- a/net/phonet/sysctl.c
+++ b/net/phonet/sysctl.c
@@ -56,7 +56,7 @@ void phonet_get_local_port_range(int *min, int *max)
 	} while (read_seqretry(&local_port_range_lock, seq));
 }
 
-static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
+static int proc_local_port_range(ctl_table *table, int write,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
@@ -70,7 +70,7 @@ static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
 		.extra2 = &local_port_range_max,
 	};
 
-	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
 		if (range[1] < range[0])
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 5231f7aaac0e..42f9748ae093 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -56,7 +56,7 @@ rpc_unregister_sysctl(void)
 	}
 }
 
-static int proc_do_xprt(ctl_table *table, int write, struct file *file,
+static int proc_do_xprt(ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char tmpbuf[256];
@@ -71,7 +71,7 @@ static int proc_do_xprt(ctl_table *table, int write, struct file *file,
 }
 
 static int
-proc_dodebug(ctl_table *table, int write, struct file *file,
+proc_dodebug(ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char		tmpbuf[20], c, *s;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 87101177825b..35fb68b9c8ec 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -80,7 +80,7 @@ struct kmem_cache *svc_rdma_ctxt_cachep;
  * current value.
  */
 static int read_reset_stat(ctl_table *table, int write,
-			   struct file *filp, void __user *buffer, size_t *lenp,
+			   void __user *buffer, size_t *lenp,
 			   loff_t *ppos)
 {
 	atomic_t *stat = (atomic_t *)table->data;
diff --git a/security/min_addr.c b/security/min_addr.c
index 14cc7b3b8d03..c844eed7915d 100644
--- a/security/min_addr.c
+++ b/security/min_addr.c
@@ -28,12 +28,12 @@ static void update_mmap_min_addr(void)
  * sysctl handler which just sets dac_mmap_min_addr = the new value and then
  * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly
  */
-int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp,
+int mmap_min_addr_handler(struct ctl_table *table, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 
 	update_mmap_min_addr();
 
-- 
cgit v1.2.3-71-gd317


From 9064a6787aa1d8ceaf5ba16fe1dfedb0755dc7eb Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 23 Sep 2009 15:57:23 -0700
Subject: linux/futex.h: place kernel types behind __KERNEL__

The forward decls for some kernel types are only needed by the code behind
__KERNEL__, so don't bleed these types to userspace.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/futex.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 34956c8fdebf..8ec17997d94f 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -4,11 +4,6 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 
-struct inode;
-struct mm_struct;
-struct task_struct;
-union ktime;
-
 /* Second argument to futex syscall */
 
 
@@ -129,6 +124,11 @@ struct robust_list_head {
 #define FUTEX_BITSET_MATCH_ANY	0xffffffff
 
 #ifdef __KERNEL__
+struct inode;
+struct mm_struct;
+struct task_struct;
+union ktime;
+
 long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 
-- 
cgit v1.2.3-71-gd317


From 858f09930b32c11b40fd0c5c467982ba09b10894 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 23 Sep 2009 15:57:32 -0700
Subject: aio: ifdef fields in mm_struct

->ioctx_lock and ->ioctx_list are used only under CONFIG_AIO.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |  5 ++---
 kernel/fork.c            | 11 +++++++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0042090a4d70..6b7029ab9c8e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -259,11 +259,10 @@ struct mm_struct {
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
-
-	/* aio bits */
+#ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
 	struct hlist_head	ioctx_list;
-
+#endif
 #ifdef CONFIG_MM_OWNER
 	/*
 	 * "owner" points to a task that is regarded as the canonical
diff --git a/kernel/fork.c b/kernel/fork.c
index b51fd2ccb2f1..e49f181ba1ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -434,6 +434,14 @@ __setup("coredump_filter=", coredump_filter_setup);
 
 #include <linux/init_task.h>
 
+static void mm_init_aio(struct mm_struct *mm)
+{
+#ifdef CONFIG_AIO
+	spin_lock_init(&mm->ioctx_lock);
+	INIT_HLIST_HEAD(&mm->ioctx_list);
+#endif
+}
+
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -447,10 +455,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
-	spin_lock_init(&mm->ioctx_lock);
-	INIT_HLIST_HEAD(&mm->ioctx_list);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
+	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
-- 
cgit v1.2.3-71-gd317


From 2fa4341074cd02fb39aa23410740764948755635 Mon Sep 17 00:00:00 2001
From: Albin Tonnerre <albin.tonnerre@free-electrons.com>
Date: Wed, 23 Sep 2009 15:57:38 -0700
Subject: include/linux/unaligned/{l,b}e_byteshift.h: fix usage for compressed
 kernels

When unaligned accesses are required for uncompressing a kernel (such as
for LZO decompression on ARM in a patch that follows), including
<linux/kernel.h> causes issues as it brings in a lot of things that are
not available in the decompression environment.

linux/kernel.h brings at least:
extern int console_printk[];
extern const char hex_asc[];
which causes errors at link-time as they are not available when
compiling the pre-boot environement. There are also a few others:

  arch/arm/boot/compressed/misc.o: In function `valid_user_regs':
   arch/arm/include/asm/ptrace.h:158: undefined reference to `elf_hwcap'
  arch/arm/boot/compressed/misc.o: In function `console_silent':
   include/linux/kernel.h:292: undefined reference to `console_printk'
  arch/arm/boot/compressed/misc.o: In function `console_verbose':
   include/linux/kernel.h:297: undefined reference to `console_printk'
  arch/arm/boot/compressed/misc.o: In function `pack_hex_byte':
   include/linux/kernel.h:360: undefined reference to `hex_asc'
  arch/arm/boot/compressed/misc.o: In function `hweight_long':
   include/linux/bitops.h:45: undefined reference to `hweight32'
  arch/arm/boot/compressed/misc.o: In function `__cmpxchg_local_generic':
   include/asm-generic/cmpxchg-local.h:21: undefined reference to `wrong_size_cmpxchg'
   include/asm-generic/cmpxchg-local.h:42: undefined reference to `wrong_size_cmpxchg'
  arch/arm/boot/compressed/misc.o: In function `__xchg':
   arch/arm/include/asm/system.h:309: undefined reference to `__bad_xchg'

However, those files apparently use nothing from <linux/kernel.h>, all
they need is the declaration of types such as u32 or u64, so
<linux/types.h> should be enough

Signed-off-by: Albin Tonnerre <albin.tonnerre@free-electrons.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Phillip Lougher <phillip@lougher.demon.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/unaligned/be_byteshift.h | 2 +-
 include/linux/unaligned/le_byteshift.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/unaligned/be_byteshift.h b/include/linux/unaligned/be_byteshift.h
index 46dd12c5709e..9356b24223ac 100644
--- a/include/linux/unaligned/be_byteshift.h
+++ b/include/linux/unaligned/be_byteshift.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H
 #define _LINUX_UNALIGNED_BE_BYTESHIFT_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 static inline u16 __get_unaligned_be16(const u8 *p)
 {
diff --git a/include/linux/unaligned/le_byteshift.h b/include/linux/unaligned/le_byteshift.h
index 59777e951baf..be376fb79b64 100644
--- a/include/linux/unaligned/le_byteshift.h
+++ b/include/linux/unaligned/le_byteshift.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H
 #define _LINUX_UNALIGNED_LE_BYTESHIFT_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 static inline u16 __get_unaligned_le16(const u8 *p)
 {
-- 
cgit v1.2.3-71-gd317


From 801460d0cf5c5288153b722565773059b0f44348 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 23 Sep 2009 15:57:41 -0700
Subject: task_struct cleanup: move binfmt field to mm_struct

Because the binfmt is not different between threads in the same process,
it can be moved from task_struct to mm_struct.  And binfmt moudle is
handled per mm_struct instead of task_struct.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                | 10 ++++++----
 include/linux/mm_types.h |  2 ++
 include/linux/sched.h    |  1 -
 kernel/exit.c            |  2 --
 kernel/fork.c            | 13 +++++++------
 5 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 6dc92c39dd94..d49be6bc1793 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1397,10 +1397,12 @@ out_ret:
 
 void set_binfmt(struct linux_binfmt *new)
 {
-	if (current->binfmt)
-		module_put(current->binfmt->module);
+	struct mm_struct *mm = current->mm;
+
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
 
-	current->binfmt = new;
+	mm->binfmt = new;
 	if (new)
 		__module_get(new->module);
 }
@@ -1770,7 +1772,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 
 	audit_core_dumps(signr);
 
-	binfmt = current->binfmt;
+	binfmt = mm->binfmt;
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b7029ab9c8e..21d6aa45206a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,6 +240,8 @@ struct mm_struct {
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
+	struct linux_binfmt *binfmt;
+
 	cpumask_t cpu_vm_mask;
 
 	/* Architecture-specific MM context */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 811cd96524d7..8a16f6d11dcd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1271,7 +1271,6 @@ struct task_struct {
 	struct mm_struct *mm, *active_mm;
 
 /* task state */
-	struct linux_binfmt *binfmt;
 	int exit_state;
 	int exit_code, exit_signal;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 6c75ff83a8fe..5859f598c951 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -976,8 +976,6 @@ NORET_TYPE void do_exit(long code)
 		disassociate_ctty(1);
 
 	module_put(task_thread_info(tsk)->exec_domain->module);
-	if (tsk->binfmt)
-		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index e49f181ba1ca..266c6af6ef1b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -518,6 +518,8 @@ void mmput(struct mm_struct *mm)
 			spin_unlock(&mmlist_lock);
 		}
 		put_swap_token(mm);
+		if (mm->binfmt)
+			module_put(mm->binfmt->module);
 		mmdrop(mm);
 	}
 }
@@ -643,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->hiwater_rss = get_mm_rss(mm);
 	mm->hiwater_vm = mm->total_vm;
 
+	if (mm->binfmt && !try_module_get(mm->binfmt->module))
+		goto free_pt;
+
 	return mm;
 
 free_pt:
+	/* don't put binfmt in mmput, we haven't got module yet */
+	mm->binfmt = NULL;
 	mmput(mm);
 
 fail_nomem:
@@ -1037,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (!try_module_get(task_thread_info(p)->exec_domain->module))
 		goto bad_fork_cleanup_count;
 
-	if (p->binfmt && !try_module_get(p->binfmt->module))
-		goto bad_fork_cleanup_put_domain;
-
 	p->did_exec = 0;
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
 	copy_flags(clone_flags, p);
@@ -1327,9 +1331,6 @@ bad_fork_cleanup_cgroup:
 #endif
 	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
-	if (p->binfmt)
-		module_put(p->binfmt->module);
-bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
-- 
cgit v1.2.3-71-gd317


From 4a4962263f07d14660849ec134ee42b63e95ea9a Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 6 Jul 2009 14:50:42 +0100
Subject: module: reduce symbol table for loaded modules (v2)

Discard all symbols not interesting for kallsyms use: absolute,
section, and in the common case (!KALLSYMS_ALL) data ones.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 10 ++++--
 kernel/module.c        | 91 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 94 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 1c755b2f937d..1d3ccb173fd6 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -308,9 +308,13 @@ struct module
 #endif
 
 #ifdef CONFIG_KALLSYMS
-	/* We keep the symbol and string tables for kallsyms. */
-	Elf_Sym *symtab;
-	unsigned int num_symtab;
+	/*
+	 * We keep the symbol and string tables for kallsyms.
+	 * The core_* fields below are temporary, loader-only (they
+	 * could really be discarded after module init).
+	 */
+	Elf_Sym *symtab, *core_symtab;
+	unsigned int num_symtab, core_num_syms;
 	char *strtab;
 
 	/* Section attributes */
diff --git a/kernel/module.c b/kernel/module.c
index e6bc4b28aa62..97f4d5e15535 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1862,13 +1862,68 @@ static char elf_type(const Elf_Sym *sym,
 	return '?';
 }
 
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+                           unsigned int shnum)
+{
+	const Elf_Shdr *sec;
+
+	if (src->st_shndx == SHN_UNDEF
+	    || src->st_shndx >= shnum
+	    || !src->st_name)
+		return false;
+
+	sec = sechdrs + src->st_shndx;
+	if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+	    || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+	    || (sec->sh_entsize & INIT_OFFSET_MASK))
+		return false;
+
+	return true;
+}
+
+static unsigned long layout_symtab(struct module *mod,
+				   Elf_Shdr *sechdrs,
+				   unsigned int symindex,
+				   const Elf_Ehdr *hdr,
+				   const char *secstrings)
+{
+	unsigned long symoffs;
+	Elf_Shdr *symsect = sechdrs + symindex;
+	const Elf_Sym *src;
+	unsigned int i, nsrc, ndst;
+
+	/* Put symbol section at end of init part of module. */
+	symsect->sh_flags |= SHF_ALLOC;
+	symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
+					 symindex) | INIT_OFFSET_MASK;
+	DEBUGP("\t%s\n", secstrings + symsect->sh_name);
+
+	src = (void *)hdr + symsect->sh_offset;
+	nsrc = symsect->sh_size / sizeof(*src);
+	for (ndst = i = 1; i < nsrc; ++i, ++src)
+		if (is_core_symbol(src, sechdrs, hdr->e_shnum))
+			++ndst;
+
+	/* Append room for core symbols at end of core part. */
+	symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
+	mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
+
+	return symoffs;
+}
+
 static void add_kallsyms(struct module *mod,
 			 Elf_Shdr *sechdrs,
+			 unsigned int shnum,
 			 unsigned int symindex,
 			 unsigned int strindex,
+			 unsigned long symoffs,
 			 const char *secstrings)
 {
-	unsigned int i;
+	unsigned int i, ndst;
+	const Elf_Sym *src;
+	Elf_Sym *dst;
 
 	mod->symtab = (void *)sechdrs[symindex].sh_addr;
 	mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1878,12 +1933,32 @@ static void add_kallsyms(struct module *mod,
 	for (i = 0; i < mod->num_symtab; i++)
 		mod->symtab[i].st_info
 			= elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
+
+	mod->core_symtab = dst = mod->module_core + symoffs;
+	src = mod->symtab;
+	*dst = *src;
+	for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
+		if (!is_core_symbol(src, sechdrs, shnum))
+			continue;
+		dst[ndst] = *src;
+		++ndst;
+	}
+	mod->core_num_syms = ndst;
 }
 #else
+static inline unsigned long layout_symtab(struct module *mod,
+					  Elf_Shdr *sechdrs,
+					  unsigned int symindex,
+					  const Elf_Hdr *hdr,
+					  const char *secstrings)
+{
+}
 static inline void add_kallsyms(struct module *mod,
 				Elf_Shdr *sechdrs,
+				unsigned int shnum,
 				unsigned int symindex,
 				unsigned int strindex,
+				unsigned long symoffs,
 				const char *secstrings)
 {
 }
@@ -1959,6 +2034,9 @@ static noinline struct module *load_module(void __user *umod,
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+#ifdef CONFIG_KALLSYMS
+	unsigned long symoffs;
+#endif
 	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2041,8 +2119,7 @@ static noinline struct module *load_module(void __user *umod,
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 	sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #ifdef CONFIG_KALLSYMS
-	/* Keep symbol and string tables for decoding later. */
-	sechdrs[symindex].sh_flags |= SHF_ALLOC;
+	/* Keep string table for decoding later. */
 	sechdrs[strindex].sh_flags |= SHF_ALLOC;
 #endif
 
@@ -2109,6 +2186,7 @@ static noinline struct module *load_module(void __user *umod,
 	   this is done generically; there doesn't appear to be any
 	   special cases for the architectures. */
 	layout_sections(mod, hdr, sechdrs, secstrings);
+	symoffs = layout_symtab(mod, sechdrs, symindex, hdr, secstrings);
 
 	/* Do the allocs. */
 	ptr = module_alloc_update_bounds(mod->core_size);
@@ -2313,7 +2391,8 @@ static noinline struct module *load_module(void __user *umod,
 	percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
 		       sechdrs[pcpuindex].sh_size);
 
-	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+	add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
+		     symoffs, secstrings);
 
 	if (!mod->taints) {
 		struct _ddebug *debug;
@@ -2491,6 +2570,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	/* Drop initial reference. */
 	module_put(mod);
 	trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+	mod->num_symtab = mod->core_num_syms;
+	mod->symtab = mod->core_symtab;
+#endif
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
 	mod->init_size = 0;
-- 
cgit v1.2.3-71-gd317


From 554bdfe5acf3715e87c8d5e25a4f9a896ac9f014 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 6 Jul 2009 14:51:44 +0100
Subject: module: reduce string table for loaded modules (v2)

Also remove all parts of the string table (referenced by the symbol
table) that are not needed for kallsyms use (i.e. which were only
referenced by symbols discarded by the previous patch, or not
referenced at all for whatever reason).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h |  2 +-
 kernel/module.c        | 68 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 57 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 1d3ccb173fd6..aca980365956 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -315,7 +315,7 @@ struct module
 	 */
 	Elf_Sym *symtab, *core_symtab;
 	unsigned int num_symtab, core_num_syms;
-	char *strtab;
+	char *strtab, *core_strtab;
 
 	/* Section attributes */
 	struct module_sect_attrs *sect_attrs;
diff --git a/kernel/module.c b/kernel/module.c
index 97f4d5e15535..39827c3d9484 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1886,12 +1886,17 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
 static unsigned long layout_symtab(struct module *mod,
 				   Elf_Shdr *sechdrs,
 				   unsigned int symindex,
+				   unsigned int strindex,
 				   const Elf_Ehdr *hdr,
-				   const char *secstrings)
+				   const char *secstrings,
+				   unsigned long *pstroffs,
+				   unsigned long *strmap)
 {
 	unsigned long symoffs;
 	Elf_Shdr *symsect = sechdrs + symindex;
+	Elf_Shdr *strsect = sechdrs + strindex;
 	const Elf_Sym *src;
+	const char *strtab;
 	unsigned int i, nsrc, ndst;
 
 	/* Put symbol section at end of init part of module. */
@@ -1902,14 +1907,31 @@ static unsigned long layout_symtab(struct module *mod,
 
 	src = (void *)hdr + symsect->sh_offset;
 	nsrc = symsect->sh_size / sizeof(*src);
+	strtab = (void *)hdr + strsect->sh_offset;
 	for (ndst = i = 1; i < nsrc; ++i, ++src)
-		if (is_core_symbol(src, sechdrs, hdr->e_shnum))
+		if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
+			unsigned int j = src->st_name;
+
+			while(!__test_and_set_bit(j, strmap) && strtab[j])
+				++j;
 			++ndst;
+		}
 
 	/* Append room for core symbols at end of core part. */
 	symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
 	mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
 
+	/* Put string table section at end of init part of module. */
+	strsect->sh_flags |= SHF_ALLOC;
+	strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
+					 strindex) | INIT_OFFSET_MASK;
+	DEBUGP("\t%s\n", secstrings + strsect->sh_name);
+
+	/* Append room for core symbols' strings at end of core part. */
+	*pstroffs = mod->core_size;
+	__set_bit(0, strmap);
+	mod->core_size += bitmap_weight(strmap, strsect->sh_size);
+
 	return symoffs;
 }
 
@@ -1919,11 +1941,14 @@ static void add_kallsyms(struct module *mod,
 			 unsigned int symindex,
 			 unsigned int strindex,
 			 unsigned long symoffs,
-			 const char *secstrings)
+			 unsigned long stroffs,
+			 const char *secstrings,
+			 unsigned long *strmap)
 {
 	unsigned int i, ndst;
 	const Elf_Sym *src;
 	Elf_Sym *dst;
+	char *s;
 
 	mod->symtab = (void *)sechdrs[symindex].sh_addr;
 	mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1941,16 +1966,25 @@ static void add_kallsyms(struct module *mod,
 		if (!is_core_symbol(src, sechdrs, shnum))
 			continue;
 		dst[ndst] = *src;
+		dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
 		++ndst;
 	}
 	mod->core_num_syms = ndst;
+
+	mod->core_strtab = s = mod->module_core + stroffs;
+	for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
+		if (test_bit(i, strmap))
+			*++s = mod->strtab[i];
 }
 #else
 static inline unsigned long layout_symtab(struct module *mod,
 					  Elf_Shdr *sechdrs,
 					  unsigned int symindex,
+					  unsigned int strindex,
 					  const Elf_Hdr *hdr,
-					  const char *secstrings)
+					  const char *secstrings,
+					  unsigned long *pstroffs,
+					  unsigned long *strmap)
 {
 }
 static inline void add_kallsyms(struct module *mod,
@@ -1959,7 +1993,9 @@ static inline void add_kallsyms(struct module *mod,
 				unsigned int symindex,
 				unsigned int strindex,
 				unsigned long symoffs,
-				const char *secstrings)
+				unsigned long stroffs,
+				const char *secstrings,
+				const unsigned long *strmap)
 {
 }
 #endif /* CONFIG_KALLSYMS */
@@ -2035,7 +2071,7 @@ static noinline struct module *load_module(void __user *umod,
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
 #ifdef CONFIG_KALLSYMS
-	unsigned long symoffs;
+	unsigned long symoffs, stroffs, *strmap;
 #endif
 	mm_segment_t old_fs;
 
@@ -2118,10 +2154,6 @@ static noinline struct module *load_module(void __user *umod,
 	/* Don't keep modinfo and version sections. */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 	sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
-#ifdef CONFIG_KALLSYMS
-	/* Keep string table for decoding later. */
-	sechdrs[strindex].sh_flags |= SHF_ALLOC;
-#endif
 
 	/* Check module struct version now, before we try to use module. */
 	if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2157,6 +2189,13 @@ static noinline struct module *load_module(void __user *umod,
 		goto free_hdr;
 	}
 
+	strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
+			 * sizeof(long), GFP_KERNEL);
+	if (!strmap) {
+		err = -ENOMEM;
+		goto free_mod;
+	}
+
 	if (find_module(mod->name)) {
 		err = -EEXIST;
 		goto free_mod;
@@ -2186,7 +2225,8 @@ static noinline struct module *load_module(void __user *umod,
 	   this is done generically; there doesn't appear to be any
 	   special cases for the architectures. */
 	layout_sections(mod, hdr, sechdrs, secstrings);
-	symoffs = layout_symtab(mod, sechdrs, symindex, hdr, secstrings);
+	symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
+				secstrings, &stroffs, strmap);
 
 	/* Do the allocs. */
 	ptr = module_alloc_update_bounds(mod->core_size);
@@ -2392,7 +2432,9 @@ static noinline struct module *load_module(void __user *umod,
 		       sechdrs[pcpuindex].sh_size);
 
 	add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
-		     symoffs, secstrings);
+		     symoffs, stroffs, secstrings, strmap);
+	kfree(strmap);
+	strmap = NULL;
 
 	if (!mod->taints) {
 		struct _ddebug *debug;
@@ -2481,6 +2523,7 @@ static noinline struct module *load_module(void __user *umod,
 		percpu_modfree(percpu);
  free_mod:
 	kfree(args);
+	kfree(strmap);
  free_hdr:
 	vfree(hdr);
 	return ERR_PTR(err);
@@ -2573,6 +2616,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 #ifdef CONFIG_KALLSYMS
 	mod->num_symtab = mod->core_num_syms;
 	mod->symtab = mod->core_symtab;
+	mod->strtab = mod->core_strtab;
 #endif
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
-- 
cgit v1.2.3-71-gd317


From 1d7015caa082d465faeae5d6fd1be077ee6dfa87 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 25 Sep 2009 00:32:58 -0600
Subject: module: preferred way to use MODULE_AUTHOR

For the longest time now we've been using multiple MODULE_AUTHOR()
statements when a module has more than one author, but the comment here
disagrees.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Luciano Coelho <luciano.coelho@nokia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index aca980365956..482efc865acf 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -128,7 +128,10 @@ extern struct module __this_module;
  */
 #define MODULE_LICENSE(_license) MODULE_INFO(license, _license)
 
-/* Author, ideally of form NAME[, NAME]*[ and NAME] */
+/*
+ * Author(s), use "Name <email>" or just "Name", for multiple
+ * authors use multiple MODULE_AUTHOR() statements/lines.
+ */
 #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author)
   
 /* What your module does. */
-- 
cgit v1.2.3-71-gd317