From 0da2f50944976e890ccc9436ab88c0da87788d02 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: ide: remove unnecessary blk_queue_flushing() test in do_ide_request()

Unplugging from a request function doesn't really help much (it's
already in the request_fn) and soon block layer will be updated to mix
barrier sequence with other commands, so there's no need to treat
queue flushing any differently.

ide was the only user of blk_queue_flushing().  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c54906f678f..015375c7d031 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -521,7 +521,6 @@ enum {
 #define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
-#define blk_queue_flushing(q)	((q)->ordseq)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
-- 
cgit v1.2.3-71-gd317


From 6958f145459ca7ad9715024de97445addacb8510 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: kill QUEUE_ORDERED_BY_TAG

Nobody is making meaningful use of ORDERED_BY_TAG now and queue
draining for barrier requests will be removed soon which will render
the advantage of tag ordering moot.  Kill ORDERED_BY_TAG.  The
following users are affected.

* brd: converted to ORDERED_DRAIN.
* virtio_blk: ORDERED_TAG path was already marked deprecated.  Removed.
* xen-blkfront: ORDERED_TAG case dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c          | 35 +++++++----------------------------
 drivers/block/brd.c          |  2 +-
 drivers/block/virtio_blk.c   |  9 ---------
 drivers/block/xen-blkfront.c |  8 +++-----
 drivers/scsi/sd.c            |  4 +---
 include/linux/blkdev.h       | 17 +----------------
 6 files changed, 13 insertions(+), 62 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f0faefca032f..c807e9ca3a68 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -26,10 +26,7 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered)
 	if (ordered != QUEUE_ORDERED_NONE &&
 	    ordered != QUEUE_ORDERED_DRAIN &&
 	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
-	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
-	    ordered != QUEUE_ORDERED_TAG &&
-	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
-	    ordered != QUEUE_ORDERED_TAG_FUA) {
+	    ordered != QUEUE_ORDERED_DRAIN_FUA) {
 		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
 		return -EINVAL;
 	}
@@ -155,21 +152,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	 * For an empty barrier, there's no actual BAR request, which
 	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
 	 */
-	if (!blk_rq_sectors(rq)) {
+	if (!blk_rq_sectors(rq))
 		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
 				QUEUE_ORDERED_DO_POSTFLUSH);
-		/*
-		 * Empty barrier on a write-through device w/ ordered
-		 * tag has no command to issue and without any command
-		 * to issue, ordering by tag can't be used.  Drain
-		 * instead.
-		 */
-		if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
-		    !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
-			q->ordered &= ~QUEUE_ORDERED_BY_TAG;
-			q->ordered |= QUEUE_ORDERED_BY_DRAIN;
-		}
-	}
 
 	/* stash away the original request */
 	blk_dequeue_request(rq);
@@ -210,7 +195,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	} else
 		skip |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
+	if (queue_in_flight(q))
 		rq = NULL;
 	else
 		skip |= QUEUE_ORDSEQ_DRAIN;
@@ -257,16 +242,10 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
 		return true;
 
-	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
-		/* Ordered by tag.  Blocking the next barrier is enough. */
-		if (is_barrier && rq != &q->bar_rq)
-			*rqp = NULL;
-	} else {
-		/* Ordered by draining.  Wait for turn. */
-		WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
-		if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-			*rqp = NULL;
-	}
+	/* Ordered by draining.  Wait for turn. */
+	WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
+	if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
+		*rqp = NULL;
 
 	return true;
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1c7f63792ff8..47a41272d26b 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -482,7 +482,7 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
+	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_DRAIN);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2aafafca2b13..79652809eee8 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -395,15 +395,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		 * to implement write barrier support.
 		 */
 		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
-	} else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
-		/*
-		 * If the BARRIER feature is supported the host expects us
-		 * to order request by tags.  This implies there is not
-		 * volatile write cache on the host, and that the host
-		 * never re-orders outstanding I/O.  This feature is not
-		 * useful for real life scenarious and deprecated.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_TAG);
 	} else {
 		/*
 		 * If the FLUSH feature is not supported we must assume that
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index ac1b682edecb..50ec6f834996 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -424,8 +424,7 @@ static int xlvbd_barrier(struct blkfront_info *info)
 	const char *barrier;
 
 	switch (info->feature_barrier) {
-	case QUEUE_ORDERED_DRAIN:	barrier = "enabled (drain)"; break;
-	case QUEUE_ORDERED_TAG:		barrier = "enabled (tag)"; break;
+	case QUEUE_ORDERED_DRAIN:	barrier = "enabled"; break;
 	case QUEUE_ORDERED_NONE:	barrier = "disabled"; break;
 	default:			return -EINVAL;
 	}
@@ -1078,8 +1077,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	 * we're dealing with a very old backend which writes
 	 * synchronously; draining will do what needs to get done.
 	 *
-	 * If there are barriers, then we can do full queued writes
-	 * with tagged barriers.
+	 * If there are barriers, then we use flush.
 	 *
 	 * If barriers are not supported, then there's no much we can
 	 * do, so just set ordering to NONE.
@@ -1087,7 +1085,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	if (err)
 		info->feature_barrier = QUEUE_ORDERED_DRAIN;
 	else if (barrier)
-		info->feature_barrier = QUEUE_ORDERED_TAG;
+		info->feature_barrier = QUEUE_ORDERED_DRAIN_FLUSH;
 	else
 		info->feature_barrier = QUEUE_ORDERED_NONE;
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 2714becc2eaf..cdfc51ab9cf2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2151,9 +2151,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 
 	/*
 	 * We now have all cache related info, determine how we deal
-	 * with ordered requests.  Note that as the current SCSI
-	 * dispatch function can alter request order, we cannot use
-	 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+	 * with ordered requests.
 	 */
 	if (sdkp->WCE)
 		ordered = sdkp->DPOFUA
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 015375c7d031..7077bc0d6138 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -470,12 +470,7 @@ enum {
 	 * DRAIN	: ordering by draining is enough
 	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
 	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
-	 * TAG		: ordering by tag is enough
-	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
-	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
 	 */
-	QUEUE_ORDERED_BY_DRAIN		= 0x01,
-	QUEUE_ORDERED_BY_TAG		= 0x02,
 	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
 	QUEUE_ORDERED_DO_BAR		= 0x20,
 	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
@@ -483,8 +478,7 @@ enum {
 
 	QUEUE_ORDERED_NONE		= 0x00,
 
-	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN |
-					  QUEUE_ORDERED_DO_BAR,
+	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_DO_BAR,
 	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_POSTFLUSH,
@@ -492,15 +486,6 @@ enum {
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_FUA,
 
-	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG |
-					  QUEUE_ORDERED_DO_BAR,
-	QUEUE_ORDERED_TAG_FLUSH		= QUEUE_ORDERED_TAG |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_POSTFLUSH,
-	QUEUE_ORDERED_TAG_FUA		= QUEUE_ORDERED_TAG |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_FUA,
-
 	/*
 	 * Ordered operation sequence
 	 */
-- 
cgit v1.2.3-71-gd317


From 4913efe456c987057e5d36a3f0a55422a9072cae Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: deprecate barrier and replace blk_queue_ordered() with
 blk_queue_flush()

Barrier is deemed too heavy and will soon be replaced by FLUSH/FUA
requests.  Deprecate barrier.  All REQ_HARDBARRIERs are failed with
-EOPNOTSUPP and blk_queue_ordered() is replaced with simpler
blk_queue_flush().

blk_queue_flush() takes combinations of REQ_FLUSH and FUA.  If a
device has write cache and can flush it, it should set REQ_FLUSH.  If
the device can handle FUA writes, it should also set REQ_FUA.

All blk_queue_ordered() users are converted.

* ORDERED_DRAIN is mapped to 0 which is the default value.
* ORDERED_DRAIN_FLUSH is mapped to REQ_FLUSH.
* ORDERED_DRAIN_FLUSH_FUA is mapped to REQ_FLUSH | REQ_FUA.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Pierre Ossman <drzeus@drzeus.cx>
Cc: Stefan Weinhuber <wein@de.ibm.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c          | 29 -----------------------------
 block/blk-core.c             |  6 ++++--
 block/blk-settings.c         | 20 ++++++++++++++++++++
 drivers/block/brd.c          |  1 -
 drivers/block/loop.c         |  2 +-
 drivers/block/osdblk.c       |  2 +-
 drivers/block/ps3disk.c      |  2 +-
 drivers/block/virtio_blk.c   | 25 +++++++++----------------
 drivers/block/xen-blkfront.c | 43 ++++++++++++-------------------------------
 drivers/ide/ide-disk.c       | 13 ++++++-------
 drivers/md/dm.c              |  2 +-
 drivers/mmc/card/queue.c     |  1 -
 drivers/s390/block/dasd.c    |  1 -
 drivers/scsi/sd.c            | 16 ++++++++--------
 include/linux/blkdev.h       |  6 ++++--
 15 files changed, 67 insertions(+), 102 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index c807e9ca3a68..ed0aba5463ab 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -9,35 +9,6 @@
 
 #include "blk.h"
 
-/**
- * blk_queue_ordered - does this queue support ordered writes
- * @q:        the request queue
- * @ordered:  one of QUEUE_ORDERED_*
- *
- * Description:
- *   For journalled file systems, doing ordered writes on a commit
- *   block instead of explicitly doing wait_on_buffer (which is bad
- *   for performance) can be a big win. Block drivers supporting this
- *   feature should call this function and indicate so.
- *
- **/
-int blk_queue_ordered(struct request_queue *q, unsigned ordered)
-{
-	if (ordered != QUEUE_ORDERED_NONE &&
-	    ordered != QUEUE_ORDERED_DRAIN &&
-	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
-	    ordered != QUEUE_ORDERED_DRAIN_FUA) {
-		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
-		return -EINVAL;
-	}
-
-	q->ordered = ordered;
-	q->next_ordered = ordered;
-
-	return 0;
-}
-EXPORT_SYMBOL(blk_queue_ordered);
-
 /*
  * Cache flushing for ordered writes handling
  */
diff --git a/block/blk-core.c b/block/blk-core.c
index ee1a1e7e63cc..f06354183b29 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1203,11 +1203,13 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
-	if ((bio->bi_rw & REQ_HARDBARRIER) &&
-	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
+	/* REQ_HARDBARRIER is no more */
+	if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
+		"block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
+
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..9b18afcfe925 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -794,6 +794,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
+/**
+ * blk_queue_flush - configure queue's cache flush capability
+ * @q:		the request queue for the device
+ * @flush:	0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
+ *
+ * Tell block layer cache flush capability of @q.  If it supports
+ * flushing, REQ_FLUSH should be set.  If it supports bypassing
+ * write cache for individual writes, REQ_FUA should be set.
+ */
+void blk_queue_flush(struct request_queue *q, unsigned int flush)
+{
+	WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
+
+	if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
+		flush &= ~REQ_FUA;
+
+	q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
+}
+EXPORT_SYMBOL_GPL(blk_queue_flush);
+
 static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 47a41272d26b..fa33f97722ba 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -482,7 +482,6 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_DRAIN);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c3a4a2e176da..953d1e12f4d4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -832,7 +832,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->lo_queue->unplug_fn = loop_unplug;
 
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN_FLUSH);
+		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 2284b4f05c62..72d62462433d 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -439,7 +439,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
 	blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
 
 	blk_queue_prep_rq(q, blk_queue_start_tag);
-	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_flush(q, REQ_FLUSH);
 
 	disk->queue = q;
 
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index e9da874d0419..4911f9e57bc7 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
 	blk_queue_logical_block_size(queue, dev->blk_size);
 
-	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_flush(queue, REQ_FLUSH);
 
 	blk_queue_max_segments(queue, -1);
 	blk_queue_max_segment_size(queue, dev->bounce_size);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 79652809eee8..d10b635b3946 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -388,22 +388,15 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	vblk->disk->driverfs_dev = &vdev->dev;
 	index++;
 
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
-		/*
-		 * If the FLUSH feature is supported we do have support for
-		 * flushing a volatile write cache on the host.  Use that
-		 * to implement write barrier support.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
-	} else {
-		/*
-		 * If the FLUSH feature is not supported we must assume that
-		 * the host does not perform any kind of volatile write
-		 * caching. We still need to drain the queue to provider
-		 * proper barrier semantics.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
-	}
+	/*
+	 * If the FLUSH feature is supported we do have support for
+	 * flushing a volatile write cache on the host.  Use that to
+	 * implement write barrier support; otherwise, we must assume
+	 * that the host does not perform any kind of volatile write
+	 * caching.
+	 */
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+		blk_queue_flush(q, REQ_FLUSH);
 
 	/* If disk is read-only in the host, the guest should obey */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 50ec6f834996..0b1eea643262 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -95,7 +95,7 @@ struct blkfront_info
 	struct gnttab_free_callback callback;
 	struct blk_shadow shadow[BLK_RING_SIZE];
 	unsigned long shadow_free;
-	int feature_barrier;
+	unsigned int feature_flush;
 	int is_ready;
 };
 
@@ -418,25 +418,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 }
 
 
-static int xlvbd_barrier(struct blkfront_info *info)
+static void xlvbd_flush(struct blkfront_info *info)
 {
-	int err;
-	const char *barrier;
-
-	switch (info->feature_barrier) {
-	case QUEUE_ORDERED_DRAIN:	barrier = "enabled"; break;
-	case QUEUE_ORDERED_NONE:	barrier = "disabled"; break;
-	default:			return -EINVAL;
-	}
-
-	err = blk_queue_ordered(info->rq, info->feature_barrier);
-
-	if (err)
-		return err;
-
+	blk_queue_flush(info->rq, info->feature_flush);
 	printk(KERN_INFO "blkfront: %s: barriers %s\n",
-	       info->gd->disk_name, barrier);
-	return 0;
+	       info->gd->disk_name,
+	       info->feature_flush ? "enabled" : "disabled");
 }
 
 
@@ -515,7 +502,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	info->rq = gd->queue;
 	info->gd = gd;
 
-	xlvbd_barrier(info);
+	xlvbd_flush(info);
 
 	if (vdisk_info & VDISK_READONLY)
 		set_disk_ro(gd, 1);
@@ -661,8 +648,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
 				       info->gd->disk_name);
 				error = -EOPNOTSUPP;
-				info->feature_barrier = QUEUE_ORDERED_NONE;
-				xlvbd_barrier(info);
+				info->feature_flush = 0;
+				xlvbd_flush(info);
 			}
 			/* fall through */
 		case BLKIF_OP_READ:
@@ -1075,19 +1062,13 @@ static void blkfront_connect(struct blkfront_info *info)
 	/*
 	 * If there's no "feature-barrier" defined, then it means
 	 * we're dealing with a very old backend which writes
-	 * synchronously; draining will do what needs to get done.
+	 * synchronously; nothing to do.
 	 *
 	 * If there are barriers, then we use flush.
-	 *
-	 * If barriers are not supported, then there's no much we can
-	 * do, so just set ordering to NONE.
 	 */
-	if (err)
-		info->feature_barrier = QUEUE_ORDERED_DRAIN;
-	else if (barrier)
-		info->feature_barrier = QUEUE_ORDERED_DRAIN_FLUSH;
-	else
-		info->feature_barrier = QUEUE_ORDERED_NONE;
+	info->feature_flush = 0;
+	if (!err && barrier)
+		info->feature_flush = REQ_FLUSH;
 
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 7433e07de30e..7c5b01ce51d2 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -516,10 +516,10 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect)
 	return ide_no_data_taskfile(drive, &cmd);
 }
 
-static void update_ordered(ide_drive_t *drive)
+static void update_flush(ide_drive_t *drive)
 {
 	u16 *id = drive->id;
-	unsigned ordered = QUEUE_ORDERED_NONE;
+	unsigned flush = 0;
 
 	if (drive->dev_flags & IDE_DFLAG_WCACHE) {
 		unsigned long long capacity;
@@ -543,13 +543,12 @@ static void update_ordered(ide_drive_t *drive)
 		       drive->name, barrier ? "" : "not ");
 
 		if (barrier) {
-			ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+			flush = REQ_FLUSH;
 			blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
 		}
-	} else
-		ordered = QUEUE_ORDERED_DRAIN;
+	}
 
-	blk_queue_ordered(drive->queue, ordered);
+	blk_queue_flush(drive->queue, flush);
 }
 
 ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
@@ -572,7 +571,7 @@ static int set_wcache(ide_drive_t *drive, int arg)
 		}
 	}
 
-	update_ordered(drive);
+	update_flush(drive);
 
 	return err;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ac384b2a6a33..b1d92be8f990 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2245,7 +2245,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_flush(md->queue, REQ_FLUSH);
 
 	elv_register_queue(md->queue);
 
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index e876678176be..9c0b42bfe089 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -128,7 +128,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock
 	mq->req = NULL;
 
 	blk_queue_prep_rq(mq->queue, mmc_prep_request);
-	blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
 	if (mmc_can_erase(card)) {
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 8373ca0de8e0..9b106d83b0cd 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2197,7 +2197,6 @@ static void dasd_setup_queue(struct dasd_block *block)
 	 */
 	blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
 	blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
-	blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
 }
 
 /*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index cdfc51ab9cf2..63bd01ae534f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2109,7 +2109,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	unsigned char *buffer;
-	unsigned ordered;
+	unsigned flush = 0;
 
 	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
 				      "sd_revalidate_disk\n"));
@@ -2151,15 +2151,15 @@ static int sd_revalidate_disk(struct gendisk *disk)
 
 	/*
 	 * We now have all cache related info, determine how we deal
-	 * with ordered requests.
+	 * with flush requests.
 	 */
-	if (sdkp->WCE)
-		ordered = sdkp->DPOFUA
-			? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
-	else
-		ordered = QUEUE_ORDERED_DRAIN;
+	if (sdkp->WCE) {
+		flush |= REQ_FLUSH;
+		if (sdkp->DPOFUA)
+			flush |= REQ_FUA;
+	}
 
-	blk_queue_ordered(sdkp->disk->queue, ordered);
+	blk_queue_flush(sdkp->disk->queue, flush);
 
 	set_capacity(disk, sdkp->capacity);
 	kfree(buffer);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7077bc0d6138..e97911d4dec3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -355,8 +355,10 @@ struct request_queue
 	struct blk_trace	*blk_trace;
 #endif
 	/*
-	 * reserved for flush operations
+	 * for flush operations
 	 */
+	unsigned int		flush_flags;
+
 	unsigned int		ordered, next_ordered, ordseq;
 	int			orderr, ordcolor;
 	struct request		pre_flush_rq, bar_rq, post_flush_rq;
@@ -865,8 +867,8 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
+extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern int blk_queue_ordered(struct request_queue *, unsigned);
 extern bool blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
-- 
cgit v1.2.3-71-gd317


From dd831006d5be7f74c3fe7aef82380c51c3637960 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: misc cleanups in barrier code

Make the following cleanups in preparation of barrier/flush update.

* blk_do_ordered() declaration is moved from include/linux/blkdev.h to
  block/blk.h.

* blk_do_ordered() now returns pointer to struct request, with %NULL
  meaning "try the next request" and ERR_PTR(-EAGAIN) "try again
  later".  The third case will be dropped with further changes.

* In the initialization of proxy barrier request, data direction is
  already set by init_request_from_bio().  Drop unnecessary explicit
  REQ_WRITE setting and move init_request_from_bio() above REQ_FUA
  flag setting.

* add_request() is collapsed into __make_request().

These changes don't make any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c    | 32 ++++++++++++++------------------
 block/blk-core.c       | 21 ++++-----------------
 block/blk.h            |  7 +++++--
 include/linux/blkdev.h |  1 -
 4 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index ed0aba5463ab..f1be85ba2bb5 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -110,9 +110,9 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static inline bool start_ordered(struct request_queue *q, struct request **rqp)
+static inline struct request *start_ordered(struct request_queue *q,
+					    struct request *rq)
 {
-	struct request *rq = *rqp;
 	unsigned skip = 0;
 
 	q->orderr = 0;
@@ -149,11 +149,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 
 		/* initialize proxy request and queue it */
 		blk_rq_init(q, rq);
-		if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
-			rq->cmd_flags |= REQ_WRITE;
+		init_request_from_bio(rq, q->orig_bar_rq->bio);
 		if (q->ordered & QUEUE_ORDERED_DO_FUA)
 			rq->cmd_flags |= REQ_FUA;
-		init_request_from_bio(rq, q->orig_bar_rq->bio);
 		rq->end_io = bar_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
@@ -171,27 +169,26 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	else
 		skip |= QUEUE_ORDSEQ_DRAIN;
 
-	*rqp = rq;
-
 	/*
 	 * Complete skipped sequences.  If whole sequence is complete,
-	 * return false to tell elevator that this request is gone.
+	 * return %NULL to tell elevator that this request is gone.
 	 */
-	return !blk_ordered_complete_seq(q, skip, 0);
+	if (blk_ordered_complete_seq(q, skip, 0))
+		rq = NULL;
+	return rq;
 }
 
-bool blk_do_ordered(struct request_queue *q, struct request **rqp)
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 {
-	struct request *rq = *rqp;
 	const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
 				(rq->cmd_flags & REQ_HARDBARRIER);
 
 	if (!q->ordseq) {
 		if (!is_barrier)
-			return true;
+			return rq;
 
 		if (q->next_ordered != QUEUE_ORDERED_NONE)
-			return start_ordered(q, rqp);
+			return start_ordered(q, rq);
 		else {
 			/*
 			 * Queue ordering not supported.  Terminate
@@ -199,8 +196,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 			 */
 			blk_dequeue_request(rq);
 			__blk_end_request_all(rq, -EOPNOTSUPP);
-			*rqp = NULL;
-			return false;
+			return NULL;
 		}
 	}
 
@@ -211,14 +207,14 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 	/* Special requests are not subject to ordering rules. */
 	if (rq->cmd_type != REQ_TYPE_FS &&
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-		return true;
+		return rq;
 
 	/* Ordered by draining.  Wait for turn. */
 	WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
 	if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-		*rqp = NULL;
+		rq = ERR_PTR(-EAGAIN);
 
-	return true;
+	return rq;
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/blk-core.c b/block/blk-core.c
index f06354183b29..f8d37a8e2c55 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1037,22 +1037,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_insert_request);
 
-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue *q, struct request *req)
-{
-	drive_stat_acct(req, 1);
-
-	/*
-	 * elevator indicated where it wants this request to be
-	 * inserted at elevator_merge time
-	 */
-	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
@@ -1316,7 +1300,10 @@ get_rq:
 		req->cpu = blk_cpu_to_group(smp_processor_id());
 	if (queue_should_plug(q) && elv_queue_empty(q))
 		blk_plug_device(q);
-	add_request(q, req);
+
+	/* insert the request into the elevator */
+	drive_stat_acct(req, 1);
+	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
 out:
 	if (unplug || !queue_should_plug(q))
 		__generic_unplug_device(q);
diff --git a/block/blk.h b/block/blk.h
index 6e7dc87141e4..874eb4ea8093 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
@@ -58,8 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			if (blk_do_ordered(q, &rq))
-				return rq;
+			rq = blk_do_ordered(q, rq);
+			if (rq)
+				return !IS_ERR(rq) ? rq : NULL;
 		}
 
 		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e97911d4dec3..996549d71923 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -869,7 +869,6 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern bool blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
 extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
-- 
cgit v1.2.3-71-gd317


From 28e7d1845216538303bb95d679d8fd4de50e2f1a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: drop barrier ordering by queue draining

Filesystems will take all the responsibilities for ordering requests
around commit writes and will only indicate how the commit writes
themselves should be handled by block layers.  This patch drops
barrier ordering by queue draining from block layer.  Ordering by
draining implementation was somewhat invasive to request handling.
List of notable changes follow.

* Each queue has 1 bit color which is flipped on each barrier issue.
  This is used to track whether a given request is issued before the
  current barrier or not.  REQ_ORDERED_COLOR flag and coloring
  implementation in __elv_add_request() are removed.

* Requests which shouldn't be processed yet for draining were stalled
  by returning -EAGAIN from blk_do_ordered() according to the test
  result between blk_ordered_req_seq() and blk_blk_ordered_cur_seq().
  This logic is removed.

* Draining completion logic in elv_completed_request() removed.

* All barrier sequence requests were queued to request queue and then
  trckled to lower layer according to progress and thus maintaining
  request orders during requeue was necessary.  This is replaced by
  queueing the next request in the barrier sequence only after the
  current one is complete from blk_ordered_complete_seq(), which
  removes the need for multiple proxy requests in struct request_queue
  and the request sorting logic in the ELEVATOR_INSERT_REQUEUE path of
  elv_insert().

* As barriers no longer have ordering constraints, there's no need to
  dump the whole elevator onto the dispatch queue on each barrier.
  Insert barriers at the front instead.

* If other barrier requests come to the front of the dispatch queue
  while one is already in progress, they are stored in
  q->pending_barriers and restored to dispatch queue one-by-one after
  each barrier completion from blk_ordered_complete_seq().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c       | 220 ++++++++++++++++++----------------------------
 block/blk-core.c          |  11 ++-
 block/blk.h               |   2 +-
 block/elevator.c          |  79 ++---------------
 include/linux/blk_types.h |   2 -
 include/linux/blkdev.h    |  19 ++--
 6 files changed, 113 insertions(+), 220 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f1be85ba2bb5..e8b2e5c091b1 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -9,6 +9,8 @@
 
 #include "blk.h"
 
+static struct request *queue_next_ordseq(struct request_queue *q);
+
 /*
  * Cache flushing for ordered writes handling
  */
@@ -19,38 +21,10 @@ unsigned blk_ordered_cur_seq(struct request_queue *q)
 	return 1 << ffz(q->ordseq);
 }
 
-unsigned blk_ordered_req_seq(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-
-	BUG_ON(q->ordseq == 0);
-
-	if (rq == &q->pre_flush_rq)
-		return QUEUE_ORDSEQ_PREFLUSH;
-	if (rq == &q->bar_rq)
-		return QUEUE_ORDSEQ_BAR;
-	if (rq == &q->post_flush_rq)
-		return QUEUE_ORDSEQ_POSTFLUSH;
-
-	/*
-	 * !fs requests don't need to follow barrier ordering.  Always
-	 * put them at the front.  This fixes the following deadlock.
-	 *
-	 * http://thread.gmane.org/gmane.linux.kernel/537473
-	 */
-	if (rq->cmd_type != REQ_TYPE_FS)
-		return QUEUE_ORDSEQ_DRAIN;
-
-	if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
-	    (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
-		return QUEUE_ORDSEQ_DRAIN;
-	else
-		return QUEUE_ORDSEQ_DONE;
-}
-
-bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+static struct request *blk_ordered_complete_seq(struct request_queue *q,
+						unsigned seq, int error)
 {
-	struct request *rq;
+	struct request *next_rq = NULL;
 
 	if (error && !q->orderr)
 		q->orderr = error;
@@ -58,16 +32,22 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 	BUG_ON(q->ordseq & seq);
 	q->ordseq |= seq;
 
-	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
-		return false;
-
-	/*
-	 * Okay, sequence complete.
-	 */
-	q->ordseq = 0;
-	rq = q->orig_bar_rq;
-	__blk_end_request_all(rq, q->orderr);
-	return true;
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
+		/* not complete yet, queue the next ordered sequence */
+		next_rq = queue_next_ordseq(q);
+	} else {
+		/* complete this barrier request */
+		__blk_end_request_all(q->orig_bar_rq, q->orderr);
+		q->orig_bar_rq = NULL;
+		q->ordseq = 0;
+
+		/* dispatch the next barrier if there's one */
+		if (!list_empty(&q->pending_barriers)) {
+			next_rq = list_entry_rq(q->pending_barriers.next);
+			list_move(&next_rq->queuelist, &q->queue_head);
+		}
+	}
+	return next_rq;
 }
 
 static void pre_flush_end_io(struct request *rq, int error)
@@ -88,133 +68,105 @@ static void post_flush_end_io(struct request *rq, int error)
 	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
 }
 
-static void queue_flush(struct request_queue *q, unsigned which)
+static void queue_flush(struct request_queue *q, struct request *rq,
+			rq_end_io_fn *end_io)
 {
-	struct request *rq;
-	rq_end_io_fn *end_io;
-
-	if (which == QUEUE_ORDERED_DO_PREFLUSH) {
-		rq = &q->pre_flush_rq;
-		end_io = pre_flush_end_io;
-	} else {
-		rq = &q->post_flush_rq;
-		end_io = post_flush_end_io;
-	}
-
 	blk_rq_init(q, rq);
 	rq->cmd_type = REQ_TYPE_FS;
-	rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
+	rq->cmd_flags = REQ_FLUSH;
 	rq->rq_disk = q->orig_bar_rq->rq_disk;
 	rq->end_io = end_io;
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static inline struct request *start_ordered(struct request_queue *q,
-					    struct request *rq)
+static struct request *queue_next_ordseq(struct request_queue *q)
 {
-	unsigned skip = 0;
-
-	q->orderr = 0;
-	q->ordered = q->next_ordered;
-	q->ordseq |= QUEUE_ORDSEQ_STARTED;
-
-	/*
-	 * For an empty barrier, there's no actual BAR request, which
-	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
-	 */
-	if (!blk_rq_sectors(rq))
-		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
-				QUEUE_ORDERED_DO_POSTFLUSH);
-
-	/* stash away the original request */
-	blk_dequeue_request(rq);
-	q->orig_bar_rq = rq;
-	rq = NULL;
-
-	/*
-	 * Queue ordered sequence.  As we stack them at the head, we
-	 * need to queue in reverse order.  Note that we rely on that
-	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
-	 * request gets inbetween ordered sequence.
-	 */
-	if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
-		rq = &q->post_flush_rq;
-	} else
-		skip |= QUEUE_ORDSEQ_POSTFLUSH;
+	struct request *rq = &q->bar_rq;
 
-	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
-		rq = &q->bar_rq;
+	switch (blk_ordered_cur_seq(q)) {
+	case QUEUE_ORDSEQ_PREFLUSH:
+		queue_flush(q, rq, pre_flush_end_io);
+		break;
 
+	case QUEUE_ORDSEQ_BAR:
 		/* initialize proxy request and queue it */
 		blk_rq_init(q, rq);
 		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		rq->cmd_flags &= ~REQ_HARDBARRIER;
 		if (q->ordered & QUEUE_ORDERED_DO_FUA)
 			rq->cmd_flags |= REQ_FUA;
 		rq->end_io = bar_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-	} else
-		skip |= QUEUE_ORDSEQ_BAR;
+		break;
 
-	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
-		rq = &q->pre_flush_rq;
-	} else
-		skip |= QUEUE_ORDSEQ_PREFLUSH;
+	case QUEUE_ORDSEQ_POSTFLUSH:
+		queue_flush(q, rq, post_flush_end_io);
+		break;
 
-	if (queue_in_flight(q))
-		rq = NULL;
-	else
-		skip |= QUEUE_ORDSEQ_DRAIN;
-
-	/*
-	 * Complete skipped sequences.  If whole sequence is complete,
-	 * return %NULL to tell elevator that this request is gone.
-	 */
-	if (blk_ordered_complete_seq(q, skip, 0))
-		rq = NULL;
+	default:
+		BUG();
+	}
 	return rq;
 }
 
 struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 {
-	const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
-				(rq->cmd_flags & REQ_HARDBARRIER);
-
-	if (!q->ordseq) {
-		if (!is_barrier)
-			return rq;
-
-		if (q->next_ordered != QUEUE_ORDERED_NONE)
-			return start_ordered(q, rq);
-		else {
-			/*
-			 * Queue ordering not supported.  Terminate
-			 * with prejudice.
-			 */
-			blk_dequeue_request(rq);
-			__blk_end_request_all(rq, -EOPNOTSUPP);
-			return NULL;
-		}
+	unsigned skip = 0;
+
+	if (!(rq->cmd_flags & REQ_HARDBARRIER))
+		return rq;
+
+	if (q->ordseq) {
+		/*
+		 * Barrier is already in progress and they can't be
+		 * processed in parallel.  Queue for later processing.
+		 */
+		list_move_tail(&rq->queuelist, &q->pending_barriers);
+		return NULL;
+	}
+
+	if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
+		/*
+		 * Queue ordering not supported.  Terminate
+		 * with prejudice.
+		 */
+		blk_dequeue_request(rq);
+		__blk_end_request_all(rq, -EOPNOTSUPP);
+		return NULL;
 	}
 
 	/*
-	 * Ordered sequence in progress
+	 * Start a new ordered sequence
 	 */
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
 
-	/* Special requests are not subject to ordering rules. */
-	if (rq->cmd_type != REQ_TYPE_FS &&
-	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-		return rq;
+	/*
+	 * For an empty barrier, there's no actual BAR request, which
+	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
+	 */
+	if (!blk_rq_sectors(rq))
+		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
+				QUEUE_ORDERED_DO_POSTFLUSH);
 
-	/* Ordered by draining.  Wait for turn. */
-	WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
-	if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-		rq = ERR_PTR(-EAGAIN);
+	/* stash away the original request */
+	blk_dequeue_request(rq);
+	q->orig_bar_rq = rq;
 
-	return rq;
+	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
+		skip |= QUEUE_ORDSEQ_PREFLUSH;
+
+	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
+		skip |= QUEUE_ORDSEQ_BAR;
+
+	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
+		skip |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	/* complete skipped sequences and return the first sequence */
+	return blk_ordered_complete_seq(q, skip, 0);
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/blk-core.c b/block/blk-core.c
index f8d37a8e2c55..d316662682c8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
+	INIT_LIST_HEAD(&q->pending_barriers);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1185,6 +1186,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	const bool sync = (bio->bi_rw & REQ_SYNC);
 	const bool unplug = (bio->bi_rw & REQ_UNPLUG);
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	int where = ELEVATOR_INSERT_SORT;
 	int rw_flags;
 
 	/* REQ_HARDBARRIER is no more */
@@ -1203,7 +1205,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
+	if (bio->bi_rw & REQ_HARDBARRIER) {
+		where = ELEVATOR_INSERT_FRONT;
+		goto get_rq;
+	}
+
+	if (elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1303,7 +1310,7 @@ get_rq:
 
 	/* insert the request into the elevator */
 	drive_stat_acct(req, 1);
-	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
+	__elv_add_request(q, req, where, 0);
 out:
 	if (unplug || !queue_should_plug(q))
 		__generic_unplug_device(q);
diff --git a/block/blk.h b/block/blk.h
index 874eb4ea8093..08081e4b294e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -62,7 +62,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			rq = list_entry_rq(q->queue_head.next);
 			rq = blk_do_ordered(q, rq);
 			if (rq)
-				return !IS_ERR(rq) ? rq : NULL;
+				return rq;
 		}
 
 		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
diff --git a/block/elevator.c b/block/elevator.c
index ec585c9554d3..241c69c45c5f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -617,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q)
 
 void elv_insert(struct request_queue *q, struct request *rq, int where)
 {
-	struct list_head *pos;
-	unsigned ordseq;
 	int unplug_it = 1;
 
 	trace_block_rq_insert(q, rq);
@@ -626,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 	rq->q = q;
 
 	switch (where) {
+	case ELEVATOR_INSERT_REQUEUE:
+		/*
+		 * Most requeues happen because of a busy condition,
+		 * don't force unplug of the queue for that case.
+		 * Clear unplug_it and fall through.
+		 */
+		unplug_it = 0;
+
 	case ELEVATOR_INSERT_FRONT:
 		rq->cmd_flags |= REQ_SOFTBARRIER;
-
 		list_add(&rq->queuelist, &q->queue_head);
 		break;
 
@@ -668,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
-	case ELEVATOR_INSERT_REQUEUE:
-		/*
-		 * If ordered flush isn't in progress, we do front
-		 * insertion; otherwise, requests should be requeued
-		 * in ordseq order.
-		 */
-		rq->cmd_flags |= REQ_SOFTBARRIER;
-
-		/*
-		 * Most requeues happen because of a busy condition,
-		 * don't force unplug of the queue for that case.
-		 */
-		unplug_it = 0;
-
-		if (q->ordseq == 0) {
-			list_add(&rq->queuelist, &q->queue_head);
-			break;
-		}
-
-		ordseq = blk_ordered_req_seq(rq);
-
-		list_for_each(pos, &q->queue_head) {
-			struct request *pos_rq = list_entry_rq(pos);
-			if (ordseq <= blk_ordered_req_seq(pos_rq))
-				break;
-		}
-
-		list_add_tail(&rq->queuelist, pos);
-		break;
-
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __func__, where);
@@ -716,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		       int plug)
 {
-	if (q->ordcolor)
-		rq->cmd_flags |= REQ_ORDERED_COLOR;
-
 	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
-		/*
-		 * toggle ordered color
-		 */
-		if (rq->cmd_flags & REQ_HARDBARRIER)
-			q->ordcolor ^= 1;
-
-		/*
-		 * barriers implicitly indicate back insertion
-		 */
-		if (where == ELEVATOR_INSERT_SORT)
-			where = ELEVATOR_INSERT_BACK;
-
-		/*
-		 * this request is scheduling boundary, update
-		 * end_sector
-		 */
+		/* barriers are scheduling boundary, update end_sector */
 		if (rq->cmd_type == REQ_TYPE_FS ||
 		    (rq->cmd_flags & REQ_DISCARD)) {
 			q->end_sector = rq_end_sector(rq);
@@ -855,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 		    e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
-
-	/*
-	 * Check if the queue is waiting for fs requests to be
-	 * drained for flush sequence.
-	 */
-	if (unlikely(q->ordseq)) {
-		struct request *next = NULL;
-
-		if (!list_empty(&q->queue_head))
-			next = list_entry_rq(q->queue_head.next);
-
-		if (!queue_in_flight(q) &&
-		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
-		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
-			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
-			__blk_run_queue(q);
-		}
-	}
 }
 
 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ca83a97c9715..9192282b4259 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -143,7 +143,6 @@ enum rq_flag_bits {
 	__REQ_FAILED,		/* set if the request failed */
 	__REQ_QUIET,		/* don't worry about errors */
 	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
-	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
@@ -184,7 +183,6 @@ enum rq_flag_bits {
 #define REQ_FAILED		(1 << __REQ_FAILED)
 #define REQ_QUIET		(1 << __REQ_QUIET)
 #define REQ_PREEMPT		(1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 #define REQ_ALLOCED		(1 << __REQ_ALLOCED)
 #define REQ_COPY_USER		(1 << __REQ_COPY_USER)
 #define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 996549d71923..20a3710a481b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -360,9 +360,10 @@ struct request_queue
 	unsigned int		flush_flags;
 
 	unsigned int		ordered, next_ordered, ordseq;
-	int			orderr, ordcolor;
-	struct request		pre_flush_rq, bar_rq, post_flush_rq;
+	int			orderr;
+	struct request		bar_rq;
 	struct request		*orig_bar_rq;
+	struct list_head	pending_barriers;
 
 	struct mutex		sysfs_lock;
 
@@ -491,12 +492,11 @@ enum {
 	/*
 	 * Ordered operation sequence
 	 */
-	QUEUE_ORDSEQ_STARTED	= 0x01,	/* flushing in progress */
-	QUEUE_ORDSEQ_DRAIN	= 0x02,	/* waiting for the queue to be drained */
-	QUEUE_ORDSEQ_PREFLUSH	= 0x04,	/* pre-flushing in progress */
-	QUEUE_ORDSEQ_BAR	= 0x08,	/* original barrier req in progress */
-	QUEUE_ORDSEQ_POSTFLUSH	= 0x10,	/* post-flushing in progress */
-	QUEUE_ORDSEQ_DONE	= 0x20,
+	QUEUE_ORDSEQ_STARTED	= (1 << 0), /* flushing in progress */
+	QUEUE_ORDSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
+	QUEUE_ORDSEQ_BAR	= (1 << 2), /* barrier write in progress */
+	QUEUE_ORDSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
+	QUEUE_ORDSEQ_DONE	= (1 << 4),
 };
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -869,9 +869,6 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern unsigned blk_ordered_cur_seq(struct request_queue *);
-extern unsigned blk_ordered_req_seq(struct request *);
-extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-- 
cgit v1.2.3-71-gd317


From dd4c133f387c48f526022860ad70354637a80f4c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: rename barrier/ordered to flush

With ordering requirements dropped, barrier and ordered are misnomers.
Now all block layer does is sequencing FLUSH and FUA.  Rename them to
flush.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       | 21 ++++++-----
 block/blk-flush.c      | 98 +++++++++++++++++++++++++-------------------------
 block/blk.h            |  4 +--
 include/linux/blkdev.h | 24 ++++++-------
 4 files changed, 72 insertions(+), 75 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index d316662682c8..8870ae40179d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 {
 	struct request_queue *q = rq->q;
 
-	if (&q->bar_rq != rq) {
+	if (&q->flush_rq != rq) {
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -160,13 +160,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 		if (bio->bi_size == 0)
 			bio_endio(bio, error);
 	} else {
-
 		/*
-		 * Okay, this is the barrier request in progress, just
-		 * record the error;
+		 * Okay, this is the sequenced flush request in
+		 * progress, just record the error;
 		 */
-		if (error && !q->orderr)
-			q->orderr = error;
+		if (error && !q->flush_err)
+			q->flush_err = error;
 	}
 }
 
@@ -520,7 +519,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
-	INIT_LIST_HEAD(&q->pending_barriers);
+	INIT_LIST_HEAD(&q->pending_flushes);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1764,11 +1763,11 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 static void blk_account_io_done(struct request *req)
 {
 	/*
-	 * Account IO completion.  bar_rq isn't accounted as a normal
-	 * IO on queueing nor completion.  Accounting the containing
-	 * request is enough.
+	 * Account IO completion.  flush_rq isn't accounted as a
+	 * normal IO on queueing nor completion.  Accounting the
+	 * containing request is enough.
 	 */
-	if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
+	if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e8b2e5c091b1..dd873225da97 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -9,41 +9,38 @@
 
 #include "blk.h"
 
-static struct request *queue_next_ordseq(struct request_queue *q);
+static struct request *queue_next_fseq(struct request_queue *q);
 
-/*
- * Cache flushing for ordered writes handling
- */
-unsigned blk_ordered_cur_seq(struct request_queue *q)
+unsigned blk_flush_cur_seq(struct request_queue *q)
 {
-	if (!q->ordseq)
+	if (!q->flush_seq)
 		return 0;
-	return 1 << ffz(q->ordseq);
+	return 1 << ffz(q->flush_seq);
 }
 
-static struct request *blk_ordered_complete_seq(struct request_queue *q,
-						unsigned seq, int error)
+static struct request *blk_flush_complete_seq(struct request_queue *q,
+					      unsigned seq, int error)
 {
 	struct request *next_rq = NULL;
 
-	if (error && !q->orderr)
-		q->orderr = error;
+	if (error && !q->flush_err)
+		q->flush_err = error;
 
-	BUG_ON(q->ordseq & seq);
-	q->ordseq |= seq;
+	BUG_ON(q->flush_seq & seq);
+	q->flush_seq |= seq;
 
-	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
-		/* not complete yet, queue the next ordered sequence */
-		next_rq = queue_next_ordseq(q);
+	if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
+		/* not complete yet, queue the next flush sequence */
+		next_rq = queue_next_fseq(q);
 	} else {
-		/* complete this barrier request */
-		__blk_end_request_all(q->orig_bar_rq, q->orderr);
-		q->orig_bar_rq = NULL;
-		q->ordseq = 0;
-
-		/* dispatch the next barrier if there's one */
-		if (!list_empty(&q->pending_barriers)) {
-			next_rq = list_entry_rq(q->pending_barriers.next);
+		/* complete this flush request */
+		__blk_end_request_all(q->orig_flush_rq, q->flush_err);
+		q->orig_flush_rq = NULL;
+		q->flush_seq = 0;
+
+		/* dispatch the next flush if there's one */
+		if (!list_empty(&q->pending_flushes)) {
+			next_rq = list_entry_rq(q->pending_flushes.next);
 			list_move(&next_rq->queuelist, &q->queue_head);
 		}
 	}
@@ -53,19 +50,19 @@ static struct request *blk_ordered_complete_seq(struct request_queue *q,
 static void pre_flush_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_PREFLUSH, error);
 }
 
-static void bar_end_io(struct request *rq, int error)
+static void flush_data_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_DATA, error);
 }
 
 static void post_flush_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
 }
 
 static void queue_flush(struct request_queue *q, struct request *rq,
@@ -74,34 +71,34 @@ static void queue_flush(struct request_queue *q, struct request *rq,
 	blk_rq_init(q, rq);
 	rq->cmd_type = REQ_TYPE_FS;
 	rq->cmd_flags = REQ_FLUSH;
-	rq->rq_disk = q->orig_bar_rq->rq_disk;
+	rq->rq_disk = q->orig_flush_rq->rq_disk;
 	rq->end_io = end_io;
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static struct request *queue_next_ordseq(struct request_queue *q)
+static struct request *queue_next_fseq(struct request_queue *q)
 {
-	struct request *rq = &q->bar_rq;
+	struct request *rq = &q->flush_rq;
 
-	switch (blk_ordered_cur_seq(q)) {
-	case QUEUE_ORDSEQ_PREFLUSH:
+	switch (blk_flush_cur_seq(q)) {
+	case QUEUE_FSEQ_PREFLUSH:
 		queue_flush(q, rq, pre_flush_end_io);
 		break;
 
-	case QUEUE_ORDSEQ_BAR:
+	case QUEUE_FSEQ_DATA:
 		/* initialize proxy request and queue it */
 		blk_rq_init(q, rq);
-		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		init_request_from_bio(rq, q->orig_flush_rq->bio);
 		rq->cmd_flags &= ~REQ_HARDBARRIER;
 		if (q->ordered & QUEUE_ORDERED_DO_FUA)
 			rq->cmd_flags |= REQ_FUA;
-		rq->end_io = bar_end_io;
+		rq->end_io = flush_data_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 		break;
 
-	case QUEUE_ORDSEQ_POSTFLUSH:
+	case QUEUE_FSEQ_POSTFLUSH:
 		queue_flush(q, rq, post_flush_end_io);
 		break;
 
@@ -111,19 +108,20 @@ static struct request *queue_next_ordseq(struct request_queue *q)
 	return rq;
 }
 
-struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
+struct request *blk_do_flush(struct request_queue *q, struct request *rq)
 {
 	unsigned skip = 0;
 
 	if (!(rq->cmd_flags & REQ_HARDBARRIER))
 		return rq;
 
-	if (q->ordseq) {
+	if (q->flush_seq) {
 		/*
-		 * Barrier is already in progress and they can't be
-		 * processed in parallel.  Queue for later processing.
+		 * Sequenced flush is already in progress and they
+		 * can't be processed in parallel.  Queue for later
+		 * processing.
 		 */
-		list_move_tail(&rq->queuelist, &q->pending_barriers);
+		list_move_tail(&rq->queuelist, &q->pending_flushes);
 		return NULL;
 	}
 
@@ -138,11 +136,11 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 	}
 
 	/*
-	 * Start a new ordered sequence
+	 * Start a new flush sequence
 	 */
-	q->orderr = 0;
+	q->flush_err = 0;
 	q->ordered = q->next_ordered;
-	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+	q->flush_seq |= QUEUE_FSEQ_STARTED;
 
 	/*
 	 * For an empty barrier, there's no actual BAR request, which
@@ -154,19 +152,19 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 
 	/* stash away the original request */
 	blk_dequeue_request(rq);
-	q->orig_bar_rq = rq;
+	q->orig_flush_rq = rq;
 
 	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
-		skip |= QUEUE_ORDSEQ_PREFLUSH;
+		skip |= QUEUE_FSEQ_PREFLUSH;
 
 	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
-		skip |= QUEUE_ORDSEQ_BAR;
+		skip |= QUEUE_FSEQ_DATA;
 
 	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
-		skip |= QUEUE_ORDSEQ_POSTFLUSH;
+		skip |= QUEUE_FSEQ_POSTFLUSH;
 
 	/* complete skipped sequences and return the first sequence */
-	return blk_ordered_complete_seq(q, skip, 0);
+	return blk_flush_complete_seq(q, skip, 0);
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/blk.h b/block/blk.h
index 08081e4b294e..24b92bd78f37 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,7 +51,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
-struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+struct request *blk_do_flush(struct request_queue *q, struct request *rq);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
@@ -60,7 +60,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			rq = blk_do_ordered(q, rq);
+			rq = blk_do_flush(q, rq);
 			if (rq)
 				return rq;
 		}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 20a3710a481b..1cd83ec077db 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -357,13 +357,13 @@ struct request_queue
 	/*
 	 * for flush operations
 	 */
+	unsigned int		ordered, next_ordered;
 	unsigned int		flush_flags;
-
-	unsigned int		ordered, next_ordered, ordseq;
-	int			orderr;
-	struct request		bar_rq;
-	struct request		*orig_bar_rq;
-	struct list_head	pending_barriers;
+	unsigned int		flush_seq;
+	int			flush_err;
+	struct request		flush_rq;
+	struct request		*orig_flush_rq;
+	struct list_head	pending_flushes;
 
 	struct mutex		sysfs_lock;
 
@@ -490,13 +490,13 @@ enum {
 					  QUEUE_ORDERED_DO_FUA,
 
 	/*
-	 * Ordered operation sequence
+	 * FLUSH/FUA sequences.
 	 */
-	QUEUE_ORDSEQ_STARTED	= (1 << 0), /* flushing in progress */
-	QUEUE_ORDSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
-	QUEUE_ORDSEQ_BAR	= (1 << 2), /* barrier write in progress */
-	QUEUE_ORDSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
-	QUEUE_ORDSEQ_DONE	= (1 << 4),
+	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
+	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
+	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
+	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
+	QUEUE_FSEQ_DONE		= (1 << 4),
 };
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
-- 
cgit v1.2.3-71-gd317


From 4fed947cb311e5aa51781d316cefca836352f6ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: implement REQ_FLUSH/FUA based interface for FLUSH/FUA requests

Now that the backend conversion is complete, export sequenced
FLUSH/FUA capability through REQ_FLUSH/FUA flags.  REQ_FLUSH means the
device cache should be flushed before executing the request.  REQ_FUA
means that the data in the request should be on non-volatile media on
completion.

Block layer will choose the correct way of implementing the semantics
and execute it.  The request may be passed to the device directly if
the device can handle it; otherwise, it will be sequenced using one or
more proxy requests.  Devices will never see REQ_FLUSH and/or FUA
which it doesn't support.

Also, unlike the original REQ_HARDBARRIER, REQ_FLUSH/FUA requests are
never failed with -EOPNOTSUPP.  If the underlying device doesn't
support FLUSH/FUA, the block layer simply make those noop.  IOW, it no
longer distinguishes between writeback cache which doesn't support
cache flush and writethrough/no cache.  Devices which have WB cache
w/o flush are very difficult to come by these days and there's nothing
much we can do anyway, so it doesn't make sense to require everyone to
implement -EOPNOTSUPP handling.  This will simplify filesystems and
block drivers as they can drop -EOPNOTSUPP retry logic for barriers.

* QUEUE_ORDERED_* are removed and QUEUE_FSEQ_* are moved into
  blk-flush.c.

* REQ_FLUSH w/o data can also be directly passed to drivers without
  sequencing but some drivers assume that zero length requests don't
  have rq->bio which isn't true for these requests requiring the use
  of proxy requests.

* REQ_COMMON_MASK now includes REQ_FLUSH | REQ_FUA so that they are
  copied from bio to request.

* WRITE_BARRIER is marked deprecated and WRITE_FLUSH, WRITE_FUA and
  WRITE_FLUSH_FUA are added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c            |  2 +-
 block/blk-flush.c           | 85 ++++++++++++++++++++++++---------------------
 block/blk.h                 |  3 ++
 include/linux/blk_types.h   |  2 +-
 include/linux/blkdev.h      | 38 ++------------------
 include/linux/buffer_head.h |  2 +-
 include/linux/fs.h          | 19 ++++++----
 7 files changed, 67 insertions(+), 84 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 8870ae40179d..18455c4f618a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1204,7 +1204,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (bio->bi_rw & REQ_HARDBARRIER) {
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
 		where = ELEVATOR_INSERT_FRONT;
 		goto get_rq;
 	}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index dd873225da97..452c552e9ead 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,5 +1,5 @@
 /*
- * Functions related to barrier IO handling
+ * Functions to sequence FLUSH and FUA writes.
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -9,6 +9,15 @@
 
 #include "blk.h"
 
+/* FLUSH/FUA sequences */
+enum {
+	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
+	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
+	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
+	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
+	QUEUE_FSEQ_DONE		= (1 << 4),
+};
+
 static struct request *queue_next_fseq(struct request_queue *q);
 
 unsigned blk_flush_cur_seq(struct request_queue *q)
@@ -79,6 +88,7 @@ static void queue_flush(struct request_queue *q, struct request *rq,
 
 static struct request *queue_next_fseq(struct request_queue *q)
 {
+	struct request *orig_rq = q->orig_flush_rq;
 	struct request *rq = &q->flush_rq;
 
 	switch (blk_flush_cur_seq(q)) {
@@ -87,12 +97,11 @@ static struct request *queue_next_fseq(struct request_queue *q)
 		break;
 
 	case QUEUE_FSEQ_DATA:
-		/* initialize proxy request and queue it */
+		/* initialize proxy request, inherit FLUSH/FUA and queue it */
 		blk_rq_init(q, rq);
-		init_request_from_bio(rq, q->orig_flush_rq->bio);
-		rq->cmd_flags &= ~REQ_HARDBARRIER;
-		if (q->ordered & QUEUE_ORDERED_DO_FUA)
-			rq->cmd_flags |= REQ_FUA;
+		init_request_from_bio(rq, orig_rq->bio);
+		rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
+		rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
 		rq->end_io = flush_data_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
@@ -110,60 +119,58 @@ static struct request *queue_next_fseq(struct request_queue *q)
 
 struct request *blk_do_flush(struct request_queue *q, struct request *rq)
 {
+	unsigned int fflags = q->flush_flags; /* may change, cache it */
+	bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
+	bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
+	bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
 	unsigned skip = 0;
 
-	if (!(rq->cmd_flags & REQ_HARDBARRIER))
+	/*
+	 * Special case.  If there's data but flush is not necessary,
+	 * the request can be issued directly.
+	 *
+	 * Flush w/o data should be able to be issued directly too but
+	 * currently some drivers assume that rq->bio contains
+	 * non-zero data if it isn't NULL and empty FLUSH requests
+	 * getting here usually have bio's without data.
+	 */
+	if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
+		rq->cmd_flags &= ~REQ_FLUSH;
+		if (!has_fua)
+			rq->cmd_flags &= ~REQ_FUA;
 		return rq;
+	}
 
+	/*
+	 * Sequenced flushes can't be processed in parallel.  If
+	 * another one is already in progress, queue for later
+	 * processing.
+	 */
 	if (q->flush_seq) {
-		/*
-		 * Sequenced flush is already in progress and they
-		 * can't be processed in parallel.  Queue for later
-		 * processing.
-		 */
 		list_move_tail(&rq->queuelist, &q->pending_flushes);
 		return NULL;
 	}
 
-	if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
-		/*
-		 * Queue ordering not supported.  Terminate
-		 * with prejudice.
-		 */
-		blk_dequeue_request(rq);
-		__blk_end_request_all(rq, -EOPNOTSUPP);
-		return NULL;
-	}
-
 	/*
 	 * Start a new flush sequence
 	 */
 	q->flush_err = 0;
-	q->ordered = q->next_ordered;
 	q->flush_seq |= QUEUE_FSEQ_STARTED;
 
-	/*
-	 * For an empty barrier, there's no actual BAR request, which
-	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
-	 */
-	if (!blk_rq_sectors(rq))
-		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
-				QUEUE_ORDERED_DO_POSTFLUSH);
-
-	/* stash away the original request */
+	/* adjust FLUSH/FUA of the original request and stash it away */
+	rq->cmd_flags &= ~REQ_FLUSH;
+	if (!has_fua)
+		rq->cmd_flags &= ~REQ_FUA;
 	blk_dequeue_request(rq);
 	q->orig_flush_rq = rq;
 
-	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
+	/* skip unneded sequences and return the first one */
+	if (!do_preflush)
 		skip |= QUEUE_FSEQ_PREFLUSH;
-
-	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
+	if (!blk_rq_sectors(rq))
 		skip |= QUEUE_FSEQ_DATA;
-
-	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
+	if (!do_postflush)
 		skip |= QUEUE_FSEQ_POSTFLUSH;
-
-	/* complete skipped sequences and return the first sequence */
 	return blk_flush_complete_seq(q, skip, 0);
 }
 
diff --git a/block/blk.h b/block/blk.h
index 24b92bd78f37..a09c18b19116 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,6 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
+			if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
+			    rq == &q->flush_rq)
+				return rq;
 			rq = blk_do_flush(q, rq);
 			if (rq)
 				return rq;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9192282b4259..179799479e6f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -167,7 +167,7 @@ enum rq_flag_bits {
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
-	 REQ_META| REQ_DISCARD | REQ_NOIDLE)
+	 REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
 
 #define REQ_UNPLUG		(1 << __REQ_UNPLUG)
 #define REQ_RAHEAD		(1 << __REQ_RAHEAD)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1cd83ec077db..8ef705f800ab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -357,7 +357,6 @@ struct request_queue
 	/*
 	 * for flush operations
 	 */
-	unsigned int		ordered, next_ordered;
 	unsigned int		flush_flags;
 	unsigned int		flush_seq;
 	int			flush_err;
@@ -465,40 +464,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	__clear_bit(flag, &q->queue_flags);
 }
 
-enum {
-	/*
-	 * Hardbarrier is supported with one of the following methods.
-	 *
-	 * NONE		: hardbarrier unsupported
-	 * DRAIN	: ordering by draining is enough
-	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
-	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
-	 */
-	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
-	QUEUE_ORDERED_DO_BAR		= 0x20,
-	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
-	QUEUE_ORDERED_DO_FUA		= 0x80,
-
-	QUEUE_ORDERED_NONE		= 0x00,
-
-	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_DO_BAR,
-	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_POSTFLUSH,
-	QUEUE_ORDERED_DRAIN_FUA		= QUEUE_ORDERED_DRAIN |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_FUA,
-
-	/*
-	 * FLUSH/FUA sequences.
-	 */
-	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
-	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
-	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
-	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
-	QUEUE_FSEQ_DONE		= (1 << 4),
-};
-
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
@@ -578,7 +543,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int sync)
  * it already be started by driver.
  */
 #define RQ_NOMERGE_FLAGS	\
-	(REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
+	(REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
+	 REQ_FLUSH | REQ_FUA)
 #define rq_mergeable(rq)	\
 	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
 	 (((rq)->cmd_flags & REQ_DISCARD) || \
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index ec94c12f21da..fc999f583fda 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -32,7 +32,7 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
-	BH_Eopnotsupp,	/* operation not supported (barrier) */
+	BH_Eopnotsupp,	/* DEPRECATED: operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
 	BH_Quiet,	/* Buffer Error Prinks to be quiet */
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76041b614758..352c48627381 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,12 +135,13 @@ struct inodes_stat_t {
  *			immediately after submission. The write equivalent
  *			of READ_SYNC.
  * WRITE_ODIRECT_PLUG	Special case write for O_DIRECT only.
- * WRITE_BARRIER	Like WRITE_SYNC, but tells the block layer that all
- *			previously submitted writes must be safely on storage
- *			before this one is started. Also guarantees that when
- *			this write is complete, it itself is also safely on
- *			storage. Prevents reordering of writes on both sides
- *			of this IO.
+ * WRITE_BARRIER	DEPRECATED. Always fails. Use FLUSH/FUA instead.
+ * WRITE_FLUSH		Like WRITE_SYNC but with preceding cache flush.
+ * WRITE_FUA		Like WRITE_SYNC but data is guaranteed to be on
+ *			non-volatile media on completion.
+ * WRITE_FLUSH_FUA	Combination of WRITE_FLUSH and FUA. The IO is preceded
+ *			by a cache flush and data is guaranteed to be on
+ *			non-volatile media on completion.
  *
  */
 #define RW_MASK			REQ_WRITE
@@ -158,6 +159,12 @@ struct inodes_stat_t {
 #define WRITE_META		(WRITE | REQ_META)
 #define WRITE_BARRIER		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 				 REQ_HARDBARRIER)
+#define WRITE_FLUSH		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+				 REQ_FLUSH)
+#define WRITE_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+				 REQ_FUA)
+#define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+				 REQ_FLUSH | REQ_FUA)
 
 /*
  * These aren't really reads or writes, they pass down information about
-- 
cgit v1.2.3-71-gd317


From 2cf6d26a354ab6362e301b5a323832b02867df47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:10 -0400
Subject: block: pass gfp_mask and flags to sb_issue_discard

We'll need to get rid of the BLKDEV_IFL_BARRIER flag, and to facilitate
that and to make the interface less confusing pass all flags explicitly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ext4/mballoc.c      |  3 ++-
 fs/fat/fatent.c        |  4 +++-
 include/linux/blkdev.h | 11 +++++------
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..df44b345f662 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2566,7 +2566,8 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
+			       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..3a56a82f5658 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,9 @@ int fat_free_clusters(struct inode *inode, int cluster)
 
 				sb_issue_discard(sb,
 					fat_clus_to_blknr(sbi, first_cl),
-					nr_clus * sbi->sec_per_clus);
+					nr_clus * sbi->sec_per_clus,
+					GFP_NOFS,
+					BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 
 				first_cl = cluster;
 			}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ef705f800ab..6b305eb4a343 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -881,13 +881,12 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
-static inline int sb_issue_discard(struct super_block *sb,
-				   sector_t block, sector_t nr_blocks)
+static inline int sb_issue_discard(struct super_block *sb, sector_t block,
+		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
-	block <<= (sb->s_blocksize_bits - 9);
-	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
-				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+	return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
+				    nr_blocks << (sb->s_blocksize_bits - 9),
+				    gfp_mask, flags);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
-- 
cgit v1.2.3-71-gd317


From 8c5553678237b7121355108e03c36086037d8975 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:22 -0400
Subject: block: remove the BLKDEV_IFL_BARRIER flag

Remove support for barriers on discards, which is unused now.  Also
remove the DISCARD_NOBARRIER I/O type in favour of just setting the
rw flags up locally in blkdev_issue_discard.

tj: Also remove DISCARD_SECURE and use REQ_SECURE directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-lib.c        | 18 ++----------------
 include/linux/blkdev.h |  2 --
 include/linux/fs.h     |  8 --------
 3 files changed, 2 insertions(+), 26 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index c392029a104e..fe2e6ed0f510 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q = bdev_get_queue(bdev);
-	int type = flags & BLKDEV_IFL_BARRIER ?
-		DISCARD_BARRIER : DISCARD_NOBARRIER;
+	int type = REQ_WRITE | REQ_DISCARD;
 	unsigned int max_discard_sectors;
 	struct bio *bio;
 	int ret = 0;
@@ -65,7 +64,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (flags & BLKDEV_IFL_SECURE) {
 		if (!blk_queue_secdiscard(q))
 			return -EOPNOTSUPP;
-		type |= DISCARD_SECURE;
+		type |= REQ_SECURE;
 	}
 
 	while (nr_sects && !ret) {
@@ -162,12 +161,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	bb.wait = &wait;
 	bb.end_io = NULL;
 
-	if (flags & BLKDEV_IFL_BARRIER) {
-		/* issue async barrier before the data */
-		ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
-		if (ret)
-			return ret;
-	}
 submit:
 	ret = 0;
 	while (nr_sects != 0) {
@@ -199,13 +192,6 @@ submit:
 		issued++;
 		submit_bio(WRITE, bio);
 	}
-	/*
-	 * When all data bios are in flight. Send final barrier if requeted.
-	 */
-	if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
-		ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
-					flags & BLKDEV_IFL_WAIT);
-
 
 	if (flags & BLKDEV_IFL_WAIT)
 		/* Wait for bios in-flight */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6b305eb4a343..cfcb3a610605 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -869,11 +869,9 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 enum{
 	BLKDEV_WAIT,	/* wait for completion */
-	BLKDEV_BARRIER,	/* issue request with barrier */
 	BLKDEV_SECURE,	/* secure discard */
 };
 #define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
-#define BLKDEV_IFL_BARRIER	(1 << BLKDEV_BARRIER)
 #define BLKDEV_IFL_SECURE	(1 << BLKDEV_SECURE)
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
 			unsigned long);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d6add69bc170..6b0f6e9993a3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -163,14 +163,6 @@ struct inodes_stat_t {
 #define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 				 REQ_FLUSH | REQ_FUA)
 
-/*
- * These aren't really reads or writes, they pass down information about
- * parts of device that are now unused by the file system.
- */
-#define DISCARD_NOBARRIER	(WRITE | REQ_DISCARD)
-#define DISCARD_BARRIER		(WRITE | REQ_DISCARD | REQ_HARDBARRIER)
-#define DISCARD_SECURE		(DISCARD_NOBARRIER | REQ_SECURE)
-
 #define SEL_IN		1
 #define SEL_OUT		2
 #define SEL_EX		4
-- 
cgit v1.2.3-71-gd317


From 13f05c8d8e98bbdce89158bfdb2e380940695a88 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 10 Sep 2010 20:50:10 +0200
Subject: block/scsi: Provide a limit on the number of integrity segments

Some controllers have a hardware limit on the number of protection
information scatter-gather list segments they can handle.

Introduce a max_integrity_segments limit in the block layer and provide
a new scsi_host_template setting that allows HBA drivers to provide a
value suitable for the hardware.

Add support for honoring the integrity segment limit when merging both
bios and requests.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@carl.home.kernel.dk>
---
 block/blk-integrity.c     | 93 ++++++++++++++++++++++++++++++++++++-----------
 block/blk-merge.c         | 23 +++++++-----
 block/blk-settings.c      |  3 ++
 block/blk-sysfs.c         | 11 ++++++
 block/blk.h               |  8 ----
 drivers/scsi/hosts.c      |  1 +
 drivers/scsi/scsi_lib.c   | 26 +++++++++----
 drivers/scsi/scsi_sysfs.c |  2 +
 include/linux/bio.h       |  4 ++
 include/linux/blkdev.h    | 33 +++++++++++++++--
 include/scsi/scsi.h       |  6 +++
 include/scsi/scsi_host.h  |  7 ++++
 12 files changed, 167 insertions(+), 50 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index edce1ef7933d..885cbb59967e 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep;
 
 /**
  * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
- * @rq:		request with integrity metadata attached
+ * @q:		request queue
+ * @bio:	bio with integrity metadata attached
  *
  * Description: Returns the number of elements required in a
- * scatterlist corresponding to the integrity metadata in a request.
+ * scatterlist corresponding to the integrity metadata in a bio.
  */
-int blk_rq_count_integrity_sg(struct request *rq)
+int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
 {
-	struct bio_vec *iv, *ivprv;
-	struct req_iterator iter;
-	unsigned int segments;
+	struct bio_vec *iv, *ivprv = NULL;
+	unsigned int segments = 0;
+	unsigned int seg_size = 0;
+	unsigned int i = 0;
 
-	ivprv = NULL;
-	segments = 0;
+	bio_for_each_integrity_vec(iv, bio, i) {
 
-	rq_for_each_integrity_segment(iv, rq, iter) {
+		if (ivprv) {
+			if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+				goto new_segment;
+
+			if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
+				goto new_segment;
+
+			if (seg_size + iv->bv_len > queue_max_segment_size(q))
+				goto new_segment;
 
-		if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+			seg_size += iv->bv_len;
+		} else {
+new_segment:
 			segments++;
+			seg_size = iv->bv_len;
+		}
 
 		ivprv = iv;
 	}
@@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);
 
 /**
  * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
- * @rq:		request with integrity metadata attached
+ * @q:		request queue
+ * @bio:	bio with integrity metadata attached
  * @sglist:	target scatterlist
  *
  * Description: Map the integrity vectors in request into a
  * scatterlist.  The scatterlist must be big enough to hold all
  * elements.  I.e. sized using blk_rq_count_integrity_sg().
  */
-int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
+int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
+			    struct scatterlist *sglist)
 {
-	struct bio_vec *iv, *ivprv;
-	struct req_iterator iter;
-	struct scatterlist *sg;
-	unsigned int segments;
+	struct bio_vec *iv, *ivprv = NULL;
+	struct scatterlist *sg = NULL;
+	unsigned int segments = 0;
+	unsigned int i = 0;
 
-	ivprv = NULL;
-	sg = NULL;
-	segments = 0;
-
-	rq_for_each_integrity_segment(iv, rq, iter) {
+	bio_for_each_integrity_vec(iv, bio, i) {
 
 		if (ivprv) {
 			if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
 				goto new_segment;
 
+			if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
+				goto new_segment;
+
+			if (sg->length + iv->bv_len > queue_max_segment_size(q))
+				goto new_segment;
+
 			sg->length += iv->bv_len;
 		} else {
 new_segment:
@@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
 }
 EXPORT_SYMBOL(blk_integrity_compare);
 
+int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
+			   struct request *next)
+{
+	if (blk_integrity_rq(req) != blk_integrity_rq(next))
+		return -1;
+
+	if (req->nr_integrity_segments + next->nr_integrity_segments >
+	    q->limits.max_integrity_segments)
+		return -1;
+
+	return 0;
+}
+EXPORT_SYMBOL(blk_integrity_merge_rq);
+
+int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
+			    struct bio *bio)
+{
+	int nr_integrity_segs;
+	struct bio *next = bio->bi_next;
+
+	bio->bi_next = NULL;
+	nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
+	bio->bi_next = next;
+
+	if (req->nr_integrity_segments + nr_integrity_segs >
+	    q->limits.max_integrity_segments)
+		return -1;
+
+	req->nr_integrity_segments += nr_integrity_segs;
+
+	return 0;
+}
+EXPORT_SYMBOL(blk_integrity_merge_bio);
+
 struct integrity_sysfs_entry {
 	struct attribute attr;
 	ssize_t (*show)(struct blk_integrity *, char *);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3b0cd4249671..6a725461654d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 {
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) {
-		req->cmd_flags |= REQ_NOMERGE;
-		if (req == q->last_merge)
-			q->last_merge = NULL;
-		return 0;
-	}
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
+		goto no_merge;
+
+	if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
+		goto no_merge;
 
 	/*
 	 * This will form the start of a new hw segment.  Bump both
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 	 */
 	req->nr_phys_segments += nr_phys_segs;
 	return 1;
+
+no_merge:
+	req->cmd_flags |= REQ_NOMERGE;
+	if (req == q->last_merge)
+		q->last_merge = NULL;
+	return 0;
 }
 
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (total_phys_segments > queue_max_segments(q))
 		return 0;
 
+	if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
+		return 0;
+
 	/* Merge is OK... */
 	req->nr_phys_segments = total_phys_segments;
 	return 1;
@@ -372,9 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	    || next->special)
 		return 0;
 
-	if (blk_integrity_rq(req) != blk_integrity_rq(next))
-		return 0;
-
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8d592b559bd3..f8f2ddf20613 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
 void blk_set_default_limits(struct queue_limits *lim)
 {
 	lim->max_segments = BLK_MAX_SEGMENTS;
+	lim->max_integrity_segments = 0;
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -509,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 					    b->seg_boundary_mask);
 
 	t->max_segments = min_not_zero(t->max_segments, b->max_segments);
+	t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
+						 b->max_integrity_segments);
 
 	t->max_segment_size = min_not_zero(t->max_segment_size,
 					   b->max_segment_size);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 001ab18078f5..b014f7739e98 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
 	return queue_var_show(queue_max_segments(q), (page));
 }
 
+static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.max_integrity_segments, (page));
+}
+
 static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
 {
 	if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
@@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
 	.show = queue_max_segments_show,
 };
 
+static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
+	.attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
+	.show = queue_max_integrity_segments_show,
+};
+
 static struct queue_sysfs_entry queue_max_segment_size_entry = {
 	.attr = {.name = "max_segment_size", .mode = S_IRUGO },
 	.show = queue_max_segment_size_show,
@@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = {
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
 	&queue_max_segments_entry.attr,
+	&queue_max_integrity_segments_entry.attr,
 	&queue_max_segment_size_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
diff --git a/block/blk.h b/block/blk.h
index 6e7dc87141e4..6738831ba447 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -132,14 +132,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 	return q->nr_congestion_off;
 }
 
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-#define rq_for_each_integrity_segment(bvl, _rq, _iter)		\
-	__rq_for_each_bio(_iter.bio, _rq)			\
-		bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
-
-#endif /* BLK_DEV_INTEGRITY */
-
 static inline int blk_cpu_to_group(int cpu)
 {
 #ifdef CONFIG_SCHED_MC
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 8a8f803439e1..10478153641b 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -376,6 +376,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	shost->this_id = sht->this_id;
 	shost->can_queue = sht->can_queue;
 	shost->sg_tablesize = sht->sg_tablesize;
+	shost->sg_prot_tablesize = sht->sg_prot_tablesize;
 	shost->cmd_per_lun = sht->cmd_per_lun;
 	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
 	shost->use_clustering = sht->use_clustering;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9ade720422c6..861c0b937ac9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -968,11 +968,13 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
  */
 int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
 {
-	int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
+	struct request *rq = cmd->request;
+
+	int error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask);
 	if (error)
 		goto err_exit;
 
-	if (blk_bidi_rq(cmd->request)) {
+	if (blk_bidi_rq(rq)) {
 		struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
 			scsi_sdb_cache, GFP_ATOMIC);
 		if (!bidi_sdb) {
@@ -980,28 +982,28 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
 			goto err_exit;
 		}
 
-		cmd->request->next_rq->special = bidi_sdb;
-		error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
-								    GFP_ATOMIC);
+		rq->next_rq->special = bidi_sdb;
+		error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC);
 		if (error)
 			goto err_exit;
 	}
 
-	if (blk_integrity_rq(cmd->request)) {
+	if (blk_integrity_rq(rq)) {
 		struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
 		int ivecs, count;
 
 		BUG_ON(prot_sdb == NULL);
-		ivecs = blk_rq_count_integrity_sg(cmd->request);
+		ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
 
 		if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
 			error = BLKPREP_DEFER;
 			goto err_exit;
 		}
 
-		count = blk_rq_map_integrity_sg(cmd->request,
+		count = blk_rq_map_integrity_sg(rq->q, rq->bio,
 						prot_sdb->table.sgl);
 		BUG_ON(unlikely(count > ivecs));
+		BUG_ON(unlikely(count > queue_max_integrity_segments(rq->q)));
 
 		cmd->prot_sdb = prot_sdb;
 		cmd->prot_sdb->table.nents = count;
@@ -1625,6 +1627,14 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
 	blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
 					SCSI_MAX_SG_CHAIN_SEGMENTS));
 
+	if (scsi_host_prot_dma(shost)) {
+		shost->sg_prot_tablesize =
+			min_not_zero(shost->sg_prot_tablesize,
+				     (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
+		BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
+		blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
+	}
+
 	blk_queue_max_hw_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index c3f67373a4f8..20ad59dff730 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -251,6 +251,7 @@ shost_rd_attr(host_busy, "%hu\n");
 shost_rd_attr(cmd_per_lun, "%hd\n");
 shost_rd_attr(can_queue, "%hd\n");
 shost_rd_attr(sg_tablesize, "%hu\n");
+shost_rd_attr(sg_prot_tablesize, "%hu\n");
 shost_rd_attr(unchecked_isa_dma, "%d\n");
 shost_rd_attr(prot_capabilities, "%u\n");
 shost_rd_attr(prot_guard_type, "%hd\n");
@@ -262,6 +263,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = {
 	&dev_attr_cmd_per_lun.attr,
 	&dev_attr_can_queue.attr,
 	&dev_attr_sg_tablesize.attr,
+	&dev_attr_sg_prot_tablesize.attr,
 	&dev_attr_unchecked_isa_dma.attr,
 	&dev_attr_proc_name.attr,
 	&dev_attr_scan.attr,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5274103434ad..2c3fd7421607 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -496,6 +496,10 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 #define bip_for_each_vec(bvl, bip, i)					\
 	__bip_for_each_vec(bvl, bip, i, (bip)->bip_idx)
 
+#define bio_for_each_integrity_vec(_bvl, _bio, _iter)			\
+	for_each_bio(_bio)						\
+		bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
+
 #define bio_integrity(bio) (bio->bi_integrity != NULL)
 
 extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c54906f678f..7e661106270a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -124,6 +124,9 @@ struct request {
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+	unsigned short nr_integrity_segments;
+#endif
 
 	unsigned short ioprio;
 
@@ -243,6 +246,7 @@ struct queue_limits {
 
 	unsigned short		logical_block_size;
 	unsigned short		max_segments;
+	unsigned short		max_integrity_segments;
 
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
@@ -1213,8 +1217,13 @@ struct blk_integrity {
 extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
 extern void blk_integrity_unregister(struct gendisk *);
 extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
-extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
-extern int blk_rq_count_integrity_sg(struct request *);
+extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+				   struct scatterlist *);
+extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
+				  struct request *);
+extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
+				   struct bio *);
 
 static inline
 struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@ -1235,16 +1244,32 @@ static inline int blk_integrity_rq(struct request *rq)
 	return bio_integrity(rq->bio);
 }
 
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+						    unsigned int segs)
+{
+	q->limits.max_integrity_segments = segs;
+}
+
+static inline unsigned short
+queue_max_integrity_segments(struct request_queue *q)
+{
+	return q->limits.max_integrity_segments;
+}
+
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 #define blk_integrity_rq(rq)			(0)
-#define blk_rq_count_integrity_sg(a)		(0)
-#define blk_rq_map_integrity_sg(a, b)		(0)
+#define blk_rq_count_integrity_sg(a, b)		(0)
+#define blk_rq_map_integrity_sg(a, b, c)	(0)
 #define bdev_get_integrity(a)			(0)
 #define blk_get_integrity(a)			(0)
 #define blk_integrity_compare(a, b)		(0)
 #define blk_integrity_register(a, b)		(0)
 #define blk_integrity_unregister(a)		do { } while (0);
+#define blk_queue_max_integrity_segments(a, b)	do { } while (0);
+#define queue_max_integrity_segments(a)		(0)
+#define blk_integrity_merge_rq(a, b, c)		(0)
+#define blk_integrity_merge_bio(a, b, c)	(0)
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 8fcb6e0e9e72..d63533a4a59e 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -31,6 +31,12 @@ struct scsi_cmnd;
 #define SCSI_MAX_SG_CHAIN_SEGMENTS	SCSI_MAX_SG_SEGMENTS
 #endif
 
+/*
+ * DIX-capable adapters effectively support infinite chaining for the
+ * protection information scatterlist
+ */
+#define SCSI_MAX_PROT_SG_SEGMENTS	0xFFFF
+
 /*
  * Special value for scanning to specify scanning or rescanning of all
  * possible channels, (target) ids, or luns on a given shost.
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index b7bdecb7b76e..d0a6a845f204 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -388,6 +388,7 @@ struct scsi_host_template {
 	 * of scatter-gather.
 	 */
 	unsigned short sg_tablesize;
+	unsigned short sg_prot_tablesize;
 
 	/*
 	 * Set this if the host adapter has limitations beside segment count.
@@ -599,6 +600,7 @@ struct Scsi_Host {
 	int can_queue;
 	short cmd_per_lun;
 	short unsigned int sg_tablesize;
+	short unsigned int sg_prot_tablesize;
 	short unsigned int max_sectors;
 	unsigned long dma_boundary;
 	/* 
@@ -823,6 +825,11 @@ static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost)
 	return shost->prot_capabilities;
 }
 
+static inline int scsi_host_prot_dma(struct Scsi_Host *shost)
+{
+	return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION;
+}
+
 static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type)
 {
 	static unsigned char cap[] = { 0,
-- 
cgit v1.2.3-71-gd317


From 144177991ca624841ddbd1e7edff958fc0f6d1fe Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 15 Sep 2010 13:08:27 +0200
Subject: block: fix an address space warning in blk-map.c

Change type of 2nd parameter of blk_rq_aligned() into unsigned long
and remove unnecessary casting. Now we can call it with 'uaddr'
instead of 'ubuf' in __blk_rq_map_user() so that it can remove
following warnings from sparse:

 block/blk-map.c:57:31: warning: incorrect type in argument 2 (different address spaces)
 block/blk-map.c:57:31:    expected void *addr
 block/blk-map.c:57:31:    got void [noderef] <asn:1>*ubuf

However blk_rq_map_kern() needs one more local variable to handle it.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-map.c        | 5 +++--
 include/linux/blkdev.h | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-map.c b/block/blk-map.c
index c65d7593f7f1..ac0f7d46db63 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 	 * direct dma. else, set up kernel bounce buffers
 	 */
 	uaddr = (unsigned long) ubuf;
-	if (blk_rq_aligned(q, ubuf, len) && !map_data)
+	if (blk_rq_aligned(q, uaddr, len) && !map_data)
 		bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
 	else
 		bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		    unsigned int len, gfp_t gfp_mask)
 {
 	int reading = rq_data_dir(rq) == READ;
+	unsigned long addr = (unsigned long) kbuf;
 	int do_copy = 0;
 	struct bio *bio;
 	int ret;
@@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (!len || !kbuf)
 		return -EINVAL;
 
-	do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf);
+	do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
 	if (do_copy)
 		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
 	else
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7e661106270a..780824edac16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1097,11 +1097,11 @@ static inline int queue_dma_alignment(struct request_queue *q)
 	return q ? q->dma_alignment : 511;
 }
 
-static inline int blk_rq_aligned(struct request_queue *q, void *addr,
+static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
 	unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-	return !((unsigned long)addr & alignment) && !(len & alignment);
+	return !(addr & alignment) && !(len & alignment);
 }
 
 /* assumes size > 256 */
-- 
cgit v1.2.3-71-gd317


From e43473b7f223ec866f7db273697e76c337c390f9 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:35 -0400
Subject: blkio: Core implementation of throttle policy

o Actual implementation of throttling policy in block layer. Currently it
  implements READ and WRITE bytes per second throttling logic. IOPS throttling
  comes in later patches.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/Kconfig             |  12 +
 block/Makefile            |   1 +
 block/blk-core.c          |  24 ++
 block/blk-throttle.c      | 909 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk_types.h |   3 +
 include/linux/blkdev.h    |  24 ++
 init/Kconfig              |   9 +-
 7 files changed, 979 insertions(+), 3 deletions(-)
 create mode 100644 block/blk-throttle.c

(limited to 'include/linux/blkdev.h')

diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6c9213ef15a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
+config BLK_DEV_THROTTLING
+	bool "Block layer bio throttling support"
+	depends on BLK_CGROUP=y && EXPERIMENTAL
+	default n
+	---help---
+	Block layer bio throttling support. It can be used to limit
+	the IO rate to a device. IO rate policies are per cgroup and
+	one needs to mount and use blkio cgroup controller for creating
+	cgroups and specifying per device IO rate policies.
+
+	See Documentation/cgroups/blkio-controller.txt for more information.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..c850d5ef80a2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
+obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 8d07c1b7e701..797d5095eb83 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -382,6 +382,7 @@ void blk_sync_queue(struct request_queue *q)
 	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
 	cancel_work_sync(&q->unplug_work);
+	throtl_shutdown_timer_wq(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -459,6 +460,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	if (q->elevator)
 		elevator_exit(q->elevator);
 
+	blk_throtl_exit(q);
+
 	blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
@@ -515,6 +518,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 		return NULL;
 	}
 
+	if (blk_throtl_init(q)) {
+		kmem_cache_free(blk_requestq_cachep, q);
+		return NULL;
+	}
+
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	init_timer(&q->unplug_timer);
@@ -1522,6 +1530,15 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
+		blk_throtl_bio(q, &bio);
+
+		/*
+		 * If bio = NULL, bio has been throttled and will be submitted
+		 * later.
+		 */
+		if (!bio)
+			break;
+
 		trace_block_bio_queue(q, bio);
 
 		ret = q->make_request_fn(q, bio);
@@ -2580,6 +2597,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
+int kblockd_schedule_delayed_work(struct request_queue *q,
+			struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work(kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work);
+
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..4b492011e0de
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,909 @@
+/*
+ * Interface for controlling IO bandwidth on a request queue
+ *
+ * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/blktrace_api.h>
+#include "blk-cgroup.h"
+
+/* Max dispatch from a group in 1 round */
+static int throtl_grp_quantum = 8;
+
+/* Total max dispatch from all groups in one round */
+static int throtl_quantum = 32;
+
+/* Throttling is performed over 100ms slice and after that slice is renewed */
+static unsigned long throtl_slice = HZ/10;	/* 100 ms */
+
+struct throtl_rb_root {
+	struct rb_root rb;
+	struct rb_node *left;
+	unsigned int count;
+	unsigned long min_disptime;
+};
+
+#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
+			.count = 0, .min_disptime = 0}
+
+#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
+
+struct throtl_grp {
+	/* List of throtl groups on the request queue*/
+	struct hlist_node tg_node;
+
+	/* active throtl group service_tree member */
+	struct rb_node rb_node;
+
+	/*
+	 * Dispatch time in jiffies. This is the estimated time when group
+	 * will unthrottle and is ready to dispatch more bio. It is used as
+	 * key to sort active groups in service tree.
+	 */
+	unsigned long disptime;
+
+	struct blkio_group blkg;
+	atomic_t ref;
+	unsigned int flags;
+
+	/* Two lists for READ and WRITE */
+	struct bio_list bio_lists[2];
+
+	/* Number of queued bios on READ and WRITE lists */
+	unsigned int nr_queued[2];
+
+	/* bytes per second rate limits */
+	uint64_t bps[2];
+
+	/* Number of bytes disptached in current slice */
+	uint64_t bytes_disp[2];
+
+	/* When did we start a new slice */
+	unsigned long slice_start[2];
+	unsigned long slice_end[2];
+};
+
+struct throtl_data
+{
+	/* List of throtl groups */
+	struct hlist_head tg_list;
+
+	/* service tree for active throtl groups */
+	struct throtl_rb_root tg_service_tree;
+
+	struct throtl_grp root_tg;
+	struct request_queue *queue;
+
+	/* Total Number of queued bios on READ and WRITE lists */
+	unsigned int nr_queued[2];
+
+	/*
+	 * number of total undestroyed groups (excluding root group)
+	 */
+	unsigned int nr_undestroyed_grps;
+
+	/* Work for dispatching throttled bios */
+	struct delayed_work throtl_work;
+};
+
+enum tg_state_flags {
+	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
+};
+
+#define THROTL_TG_FNS(name)						\
+static inline void throtl_mark_tg_##name(struct throtl_grp *tg)		\
+{									\
+	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
+}									\
+static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
+{									\
+	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
+}									\
+static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
+{									\
+	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
+}
+
+THROTL_TG_FNS(on_rr);
+
+#define throtl_log_tg(td, tg, fmt, args...)				\
+	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
+				blkg_path(&(tg)->blkg), ##args);      	\
+
+#define throtl_log(td, fmt, args...)	\
+	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
+
+static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
+{
+	if (blkg)
+		return container_of(blkg, struct throtl_grp, blkg);
+
+	return NULL;
+}
+
+static inline int total_nr_queued(struct throtl_data *td)
+{
+	return (td->nr_queued[0] + td->nr_queued[1]);
+}
+
+static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
+{
+	atomic_inc(&tg->ref);
+	return tg;
+}
+
+static void throtl_put_tg(struct throtl_grp *tg)
+{
+	BUG_ON(atomic_read(&tg->ref) <= 0);
+	if (!atomic_dec_and_test(&tg->ref))
+		return;
+	kfree(tg);
+}
+
+static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
+			struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct throtl_grp *tg = NULL;
+	void *key = td;
+	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+	unsigned int major, minor;
+
+	/*
+	 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
+	 * tree of blkg (instead of traversing through hash list all
+	 * the time.
+	 */
+	tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+	/* Fill in device details for root group */
+	if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		tg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+
+	if (tg)
+		goto done;
+
+	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+	if (!tg)
+		goto done;
+
+	INIT_HLIST_NODE(&tg->tg_node);
+	RB_CLEAR_NODE(&tg->rb_node);
+	bio_list_init(&tg->bio_lists[0]);
+	bio_list_init(&tg->bio_lists[1]);
+
+	/*
+	 * Take the initial reference that will be released on destroy
+	 * This can be thought of a joint reference by cgroup and
+	 * request queue which will be dropped by either request queue
+	 * exit or cgroup deletion path depending on who is exiting first.
+	 */
+	atomic_set(&tg->ref, 1);
+
+	/* Add group onto cgroup list */
+	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
+				MKDEV(major, minor), BLKIO_POLICY_THROTL);
+
+	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
+	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
+
+	hlist_add_head(&tg->tg_node, &td->tg_list);
+	td->nr_undestroyed_grps++;
+done:
+	return tg;
+}
+
+static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+{
+	struct cgroup *cgroup;
+	struct throtl_grp *tg = NULL;
+
+	rcu_read_lock();
+	cgroup = task_cgroup(current, blkio_subsys_id);
+	tg = throtl_find_alloc_tg(td, cgroup);
+	if (!tg)
+		tg = &td->root_tg;
+	rcu_read_unlock();
+	return tg;
+}
+
+static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
+{
+	/* Service tree is empty */
+	if (!root->count)
+		return NULL;
+
+	if (!root->left)
+		root->left = rb_first(&root->rb);
+
+	if (root->left)
+		return rb_entry_tg(root->left);
+
+	return NULL;
+}
+
+static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+{
+	rb_erase(n, root);
+	RB_CLEAR_NODE(n);
+}
+
+static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
+{
+	if (root->left == n)
+		root->left = NULL;
+	rb_erase_init(n, &root->rb);
+	--root->count;
+}
+
+static void update_min_dispatch_time(struct throtl_rb_root *st)
+{
+	struct throtl_grp *tg;
+
+	tg = throtl_rb_first(st);
+	if (!tg)
+		return;
+
+	st->min_disptime = tg->disptime;
+}
+
+static void
+tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
+{
+	struct rb_node **node = &st->rb.rb_node;
+	struct rb_node *parent = NULL;
+	struct throtl_grp *__tg;
+	unsigned long key = tg->disptime;
+	int left = 1;
+
+	while (*node != NULL) {
+		parent = *node;
+		__tg = rb_entry_tg(parent);
+
+		if (time_before(key, __tg->disptime))
+			node = &parent->rb_left;
+		else {
+			node = &parent->rb_right;
+			left = 0;
+		}
+	}
+
+	if (left)
+		st->left = &tg->rb_node;
+
+	rb_link_node(&tg->rb_node, parent, node);
+	rb_insert_color(&tg->rb_node, &st->rb);
+}
+
+static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	struct throtl_rb_root *st = &td->tg_service_tree;
+
+	tg_service_tree_add(st, tg);
+	throtl_mark_tg_on_rr(tg);
+	st->count++;
+}
+
+static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	if (!throtl_tg_on_rr(tg))
+		__throtl_enqueue_tg(td, tg);
+}
+
+static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
+	throtl_clear_tg_on_rr(tg);
+}
+
+static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	if (throtl_tg_on_rr(tg))
+		__throtl_dequeue_tg(td, tg);
+}
+
+static void throtl_schedule_next_dispatch(struct throtl_data *td)
+{
+	struct throtl_rb_root *st = &td->tg_service_tree;
+
+	/*
+	 * If there are more bios pending, schedule more work.
+	 */
+	if (!total_nr_queued(td))
+		return;
+
+	BUG_ON(!st->count);
+
+	update_min_dispatch_time(st);
+
+	if (time_before_eq(st->min_disptime, jiffies))
+		throtl_schedule_delayed_work(td->queue, 0);
+	else
+		throtl_schedule_delayed_work(td->queue,
+				(st->min_disptime - jiffies));
+}
+
+static inline void
+throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+{
+	tg->bytes_disp[rw] = 0;
+	tg->slice_start[rw] = jiffies;
+	tg->slice_end[rw] = jiffies + throtl_slice;
+	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
+			rw == READ ? 'R' : 'W', tg->slice_start[rw],
+			tg->slice_end[rw], jiffies);
+}
+
+static inline void throtl_extend_slice(struct throtl_data *td,
+		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+{
+	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+			rw == READ ? 'R' : 'W', tg->slice_start[rw],
+			tg->slice_end[rw], jiffies);
+}
+
+/* Determine if previously allocated or extended slice is complete or not */
+static bool
+throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+{
+	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
+		return 0;
+
+	return 1;
+}
+
+/* Trim the used slices and adjust slice start accordingly */
+static inline void
+throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+{
+	unsigned long nr_slices, bytes_trim, time_elapsed;
+
+	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
+
+	/*
+	 * If bps are unlimited (-1), then time slice don't get
+	 * renewed. Don't try to trim the slice if slice is used. A new
+	 * slice will start when appropriate.
+	 */
+	if (throtl_slice_used(td, tg, rw))
+		return;
+
+	time_elapsed = jiffies - tg->slice_start[rw];
+
+	nr_slices = time_elapsed / throtl_slice;
+
+	if (!nr_slices)
+		return;
+
+	bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
+
+	if (!bytes_trim)
+		return;
+
+	if (tg->bytes_disp[rw] >= bytes_trim)
+		tg->bytes_disp[rw] -= bytes_trim;
+	else
+		tg->bytes_disp[rw] = 0;
+
+	tg->slice_start[rw] += nr_slices * throtl_slice;
+
+	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu"
+			" start=%lu end=%lu jiffies=%lu",
+			rw == READ ? 'R' : 'W', nr_slices, bytes_trim,
+			tg->slice_start[rw], tg->slice_end[rw], jiffies);
+}
+
+/*
+ * Returns whether one can dispatch a bio or not. Also returns approx number
+ * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ */
+static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
+				struct bio *bio, unsigned long *wait)
+{
+	bool rw = bio_data_dir(bio);
+	u64 bytes_allowed, extra_bytes;
+	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+
+	/*
+	 * Currently whole state machine of group depends on first bio
+	 * queued in the group bio list. So one should not be calling
+	 * this function with a different bio if there are other bios
+	 * queued.
+	 */
+	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
+
+	/* If tg->bps = -1, then BW is unlimited */
+	if (tg->bps[rw] == -1) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	/*
+	 * If previous slice expired, start a new one otherwise renew/extend
+	 * existing slice to make sure it is at least throtl_slice interval
+	 * long since now.
+	 */
+	if (throtl_slice_used(td, tg, rw))
+		throtl_start_new_slice(td, tg, rw);
+	else {
+		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
+			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+	}
+
+	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
+
+	/* Slice has just started. Consider one slice interval */
+	if (!jiffy_elapsed)
+		jiffy_elapsed_rnd = throtl_slice;
+
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+
+	bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
+				/ MSEC_PER_SEC;
+
+	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	/* Calc approx time to dispatch */
+	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
+	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+
+	if (!jiffy_wait)
+		jiffy_wait = 1;
+
+	/*
+	 * This wait time is without taking into consideration the rounding
+	 * up we did. Add that time also.
+	 */
+	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
+
+	if (wait)
+		*wait = jiffy_wait;
+
+	if (time_before(tg->slice_end[rw], jiffies + jiffy_wait))
+		throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait);
+
+	return 0;
+}
+
+static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
+{
+	bool rw = bio_data_dir(bio);
+	bool sync = bio->bi_rw & REQ_SYNC;
+
+	/* Charge the bio to the group */
+	tg->bytes_disp[rw] += bio->bi_size;
+
+	/*
+	 * TODO: This will take blkg->stats_lock. Figure out a way
+	 * to avoid this cost.
+	 */
+	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
+
+}
+
+static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
+			struct bio *bio)
+{
+	bool rw = bio_data_dir(bio);
+
+	bio_list_add(&tg->bio_lists[rw], bio);
+	/* Take a bio reference on tg */
+	throtl_ref_get_tg(tg);
+	tg->nr_queued[rw]++;
+	td->nr_queued[rw]++;
+	throtl_enqueue_tg(td, tg);
+}
+
+static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
+{
+	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+	struct bio *bio;
+
+	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
+		tg_may_dispatch(td, tg, bio, &read_wait);
+
+	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
+		tg_may_dispatch(td, tg, bio, &write_wait);
+
+	min_wait = min(read_wait, write_wait);
+	disptime = jiffies + min_wait;
+
+	/*
+	 * If group is already on active tree, then update dispatch time
+	 * only if it is lesser than existing dispatch time. Otherwise
+	 * always update the dispatch time
+	 */
+
+	if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
+		return;
+
+	/* Update dispatch time */
+	throtl_dequeue_tg(td, tg);
+	tg->disptime = disptime;
+	throtl_enqueue_tg(td, tg);
+}
+
+static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
+				bool rw, struct bio_list *bl)
+{
+	struct bio *bio;
+
+	bio = bio_list_pop(&tg->bio_lists[rw]);
+	tg->nr_queued[rw]--;
+	/* Drop bio reference on tg */
+	throtl_put_tg(tg);
+
+	BUG_ON(td->nr_queued[rw] <= 0);
+	td->nr_queued[rw]--;
+
+	throtl_charge_bio(tg, bio);
+	bio_list_add(bl, bio);
+	bio->bi_rw |= REQ_THROTTLED;
+
+	throtl_trim_slice(td, tg, rw);
+}
+
+static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
+				struct bio_list *bl)
+{
+	unsigned int nr_reads = 0, nr_writes = 0;
+	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
+	unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
+	struct bio *bio;
+
+	/* Try to dispatch 75% READS and 25% WRITES */
+
+	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
+		&& tg_may_dispatch(td, tg, bio, NULL)) {
+
+		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+		nr_reads++;
+
+		if (nr_reads >= max_nr_reads)
+			break;
+	}
+
+	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
+		&& tg_may_dispatch(td, tg, bio, NULL)) {
+
+		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+		nr_writes++;
+
+		if (nr_writes >= max_nr_writes)
+			break;
+	}
+
+	return nr_reads + nr_writes;
+}
+
+static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
+{
+	unsigned int nr_disp = 0;
+	struct throtl_grp *tg;
+	struct throtl_rb_root *st = &td->tg_service_tree;
+
+	while (1) {
+		tg = throtl_rb_first(st);
+
+		if (!tg)
+			break;
+
+		if (time_before(jiffies, tg->disptime))
+			break;
+
+		throtl_dequeue_tg(td, tg);
+
+		nr_disp += throtl_dispatch_tg(td, tg, bl);
+
+		if (tg->nr_queued[0] || tg->nr_queued[1]) {
+			tg_update_disptime(td, tg);
+			throtl_enqueue_tg(td, tg);
+		}
+
+		if (nr_disp >= throtl_quantum)
+			break;
+	}
+
+	return nr_disp;
+}
+
+/* Dispatch throttled bios. Should be called without queue lock held. */
+static int throtl_dispatch(struct request_queue *q)
+{
+	struct throtl_data *td = q->td;
+	unsigned int nr_disp = 0;
+	struct bio_list bio_list_on_stack;
+	struct bio *bio;
+
+	spin_lock_irq(q->queue_lock);
+
+	if (!total_nr_queued(td))
+		goto out;
+
+	bio_list_init(&bio_list_on_stack);
+
+	throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
+			total_nr_queued(td), td->nr_queued[READ],
+			td->nr_queued[WRITE]);
+
+	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
+
+	if (nr_disp)
+		throtl_log(td, "bios disp=%u", nr_disp);
+
+	throtl_schedule_next_dispatch(td);
+out:
+	spin_unlock_irq(q->queue_lock);
+
+	/*
+	 * If we dispatched some requests, unplug the queue to make sure
+	 * immediate dispatch
+	 */
+	if (nr_disp) {
+		while((bio = bio_list_pop(&bio_list_on_stack)))
+			generic_make_request(bio);
+		blk_unplug(q);
+	}
+	return nr_disp;
+}
+
+void blk_throtl_work(struct work_struct *work)
+{
+	struct throtl_data *td = container_of(work, struct throtl_data,
+					throtl_work.work);
+	struct request_queue *q = td->queue;
+
+	throtl_dispatch(q);
+}
+
+/* Call with queue lock held */
+void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
+{
+
+	struct throtl_data *td = q->td;
+	struct delayed_work *dwork = &td->throtl_work;
+
+	if (total_nr_queued(td) > 0) {
+		/*
+		 * We might have a work scheduled to be executed in future.
+		 * Cancel that and schedule a new one.
+		 */
+		__cancel_delayed_work(dwork);
+		kblockd_schedule_delayed_work(q, dwork, delay);
+		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
+				delay, jiffies);
+	}
+}
+EXPORT_SYMBOL(throtl_schedule_delayed_work);
+
+static void
+throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	/* Something wrong if we are trying to remove same group twice */
+	BUG_ON(hlist_unhashed(&tg->tg_node));
+
+	hlist_del_init(&tg->tg_node);
+
+	/*
+	 * Put the reference taken at the time of creation so that when all
+	 * queues are gone, group can be destroyed.
+	 */
+	throtl_put_tg(tg);
+	td->nr_undestroyed_grps--;
+}
+
+static void throtl_release_tgs(struct throtl_data *td)
+{
+	struct hlist_node *pos, *n;
+	struct throtl_grp *tg;
+
+	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+		/*
+		 * If cgroup removal path got to blk_group first and removed
+		 * it from cgroup list, then it will take care of destroying
+		 * cfqg also.
+		 */
+		if (!blkiocg_del_blkio_group(&tg->blkg))
+			throtl_destroy_tg(td, tg);
+	}
+}
+
+static void throtl_td_free(struct throtl_data *td)
+{
+	kfree(td);
+}
+
+/*
+ * Blk cgroup controller notification saying that blkio_group object is being
+ * delinked as associated cgroup object is going away. That also means that
+ * no new IO will come in this group. So get rid of this group as soon as
+ * any pending IO in the group is finished.
+ *
+ * This function is called under rcu_read_lock(). key is the rcu protected
+ * pointer. That means "key" is a valid throtl_data pointer as long as we are
+ * rcu read lock.
+ *
+ * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
+ * it should not be NULL as even if queue was going away, cgroup deltion
+ * path got to it first.
+ */
+void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
+{
+	unsigned long flags;
+	struct throtl_data *td = key;
+
+	spin_lock_irqsave(td->queue->queue_lock, flags);
+	throtl_destroy_tg(td, tg_of_blkg(blkg));
+	spin_unlock_irqrestore(td->queue->queue_lock, flags);
+}
+
+static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg,
+			u64 read_bps)
+{
+	tg_of_blkg(blkg)->bps[READ] = read_bps;
+}
+
+static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
+			u64 write_bps)
+{
+	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
+}
+
+void throtl_shutdown_timer_wq(struct request_queue *q)
+{
+	struct throtl_data *td = q->td;
+
+	cancel_delayed_work_sync(&td->throtl_work);
+}
+
+static struct blkio_policy_type blkio_policy_throtl = {
+	.ops = {
+		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
+		.blkio_update_group_read_bps_fn =
+					throtl_update_blkio_group_read_bps,
+		.blkio_update_group_write_bps_fn =
+					throtl_update_blkio_group_write_bps,
+	},
+};
+
+int blk_throtl_bio(struct request_queue *q, struct bio **biop)
+{
+	struct throtl_data *td = q->td;
+	struct throtl_grp *tg;
+	struct bio *bio = *biop;
+	bool rw = bio_data_dir(bio), update_disptime = true;
+
+	if (bio->bi_rw & REQ_THROTTLED) {
+		bio->bi_rw &= ~REQ_THROTTLED;
+		return 0;
+	}
+
+	spin_lock_irq(q->queue_lock);
+	tg = throtl_get_tg(td);
+
+	if (tg->nr_queued[rw]) {
+		/*
+		 * There is already another bio queued in same dir. No
+		 * need to update dispatch time.
+		 */
+		update_disptime = false;
+		goto queue_bio;
+	}
+
+	/* Bio is with-in rate limit of group */
+	if (tg_may_dispatch(td, tg, bio, NULL)) {
+		throtl_charge_bio(tg, bio);
+		goto out;
+	}
+
+queue_bio:
+	throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu"
+			" queued=%d/%d", rw == READ ? 'R' : 'W',
+			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+			tg->nr_queued[READ], tg->nr_queued[WRITE]);
+
+	throtl_add_bio_tg(q->td, tg, bio);
+	*biop = NULL;
+
+	if (update_disptime) {
+		tg_update_disptime(td, tg);
+		throtl_schedule_next_dispatch(td);
+	}
+
+out:
+	spin_unlock_irq(q->queue_lock);
+	return 0;
+}
+
+int blk_throtl_init(struct request_queue *q)
+{
+	struct throtl_data *td;
+	struct throtl_grp *tg;
+
+	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+	if (!td)
+		return -ENOMEM;
+
+	INIT_HLIST_HEAD(&td->tg_list);
+	td->tg_service_tree = THROTL_RB_ROOT;
+
+	/* Init root group */
+	tg = &td->root_tg;
+	INIT_HLIST_NODE(&tg->tg_node);
+	RB_CLEAR_NODE(&tg->rb_node);
+	bio_list_init(&tg->bio_lists[0]);
+	bio_list_init(&tg->bio_lists[1]);
+
+	/* Practically unlimited BW */
+	tg->bps[0] = tg->bps[1] = -1;
+	atomic_set(&tg->ref, 1);
+
+	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+
+	rcu_read_lock();
+	blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
+					0, BLKIO_POLICY_THROTL);
+	rcu_read_unlock();
+
+	/* Attach throtl data to request queue */
+	td->queue = q;
+	q->td = td;
+	return 0;
+}
+
+void blk_throtl_exit(struct request_queue *q)
+{
+	struct throtl_data *td = q->td;
+	bool wait = false;
+
+	BUG_ON(!td);
+
+	throtl_shutdown_timer_wq(q);
+
+	spin_lock_irq(q->queue_lock);
+	throtl_release_tgs(td);
+	blkiocg_del_blkio_group(&td->root_tg.blkg);
+
+	/* If there are other groups */
+	if (td->nr_undestroyed_grps >= 1)
+		wait = true;
+
+	spin_unlock_irq(q->queue_lock);
+
+	/*
+	 * Wait for tg->blkg->key accessors to exit their grace periods.
+	 * Do this wait only if there are other undestroyed groups out
+	 * there (other than root group). This can happen if cgroup deletion
+	 * path claimed the responsibility of cleaning up a group before
+	 * queue cleanup code get to the group.
+	 *
+	 * Do not call synchronize_rcu() unconditionally as there are drivers
+	 * which create/delete request queue hundreds of times during scan/boot
+	 * and synchronize_rcu() can take significant time and slow down boot.
+	 */
+	if (wait)
+		synchronize_rcu();
+	throtl_td_free(td);
+}
+
+static int __init throtl_init(void)
+{
+	blkio_policy_register(&blkio_policy_throtl);
+	return 0;
+}
+
+module_init(throtl_init);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ca83a97c9715..10a0c291b55a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -130,6 +130,8 @@ enum rq_flag_bits {
 	/* bio only flags */
 	__REQ_UNPLUG,		/* unplug the immediately after submission */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+	__REQ_THROTTLED,	/* This bio has already been subjected to
+				 * throttling rules. Don't do it again. */
 
 	/* request only flags */
 	__REQ_SORTED,		/* elevator knows about this request */
@@ -172,6 +174,7 @@ enum rq_flag_bits {
 
 #define REQ_UNPLUG		(1 << __REQ_UNPLUG)
 #define REQ_RAHEAD		(1 << __REQ_RAHEAD)
+#define REQ_THROTTLED		(1 << __REQ_THROTTLED)
 
 #define REQ_SORTED		(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER		(1 << __REQ_SOFTBARRIER)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 780824edac16..1341df5806df 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -371,6 +371,11 @@ struct request_queue
 #if defined(CONFIG_BLK_DEV_BSG)
 	struct bsg_class_device bsg_dev;
 #endif
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	/* Throttle data */
+	struct throtl_data *td;
+#endif
 };
 
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
@@ -1131,6 +1136,7 @@ static inline void put_dev_sector(Sector p)
 
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
+int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*
@@ -1174,6 +1180,24 @@ static inline uint64_t rq_io_start_time_ns(struct request *req)
 }
 #endif
 
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern int blk_throtl_init(struct request_queue *q);
+extern void blk_throtl_exit(struct request_queue *q);
+extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
+extern void throtl_shutdown_timer_wq(struct request_queue *q);
+#else /* CONFIG_BLK_DEV_THROTTLING */
+static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+{
+	return 0;
+}
+
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
+static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
+static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
diff --git a/init/Kconfig b/init/Kconfig
index 2de5b1cbadd9..950ba26f7233 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -634,11 +634,14 @@ config BLK_CGROUP
 
 	Currently, CFQ IO scheduler uses it to recognize task groups and
 	control disk bandwidth allocation (proportional time slice allocation)
-	to such task groups.
+	to such task groups. It is also used by bio throttling logic in
+	block layer to implement upper limit in IO rates on a device.
 
 	This option only enables generic Block IO controller infrastructure.
-	One needs to also enable actual IO controlling logic in CFQ for it
-	to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+	One needs to also enable actual IO controlling logic/policy. For
+	enabling proportional weight division of disk bandwidth in CFQ seti
+	CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set
+	CONFIG_BLK_THROTTLE=y.
 
 	See Documentation/cgroups/blkio-controller.txt for more information.
 
-- 
cgit v1.2.3-71-gd317


From dd3932eddf428571762596e17b65f5dc92ca361b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 16 Sep 2010 20:51:46 +0200
Subject: block: remove BLKDEV_IFL_WAIT

All the blkdev_issue_* helpers can only sanely be used for synchronous
caller.  To issue cache flushes or barriers asynchronously the caller needs
to set up a bio by itself with a completion callback to move the asynchronous
state machine ahead.  So drop the BLKDEV_IFL_WAIT flag that is always
specified when calling blkdev_issue_* and also remove the now unused flags
argument to blkdev_issue_flush and blkdev_issue_zeroout.  For
blkdev_issue_discard we need to keep it for the secure discard flag, which
gains a more descriptive name and loses the bitops vs flag confusion.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c                  | 25 +++++++++++--------------
 block/blk-lib.c                    | 21 ++++++++-------------
 block/ioctl.c                      |  4 ++--
 drivers/block/drbd/drbd_int.h      |  3 +--
 drivers/block/drbd/drbd_receiver.c |  2 +-
 fs/block_dev.c                     |  2 +-
 fs/btrfs/extent-tree.c             |  3 +--
 fs/ext3/fsync.c                    |  3 +--
 fs/ext4/fsync.c                    |  5 ++---
 fs/ext4/mballoc.c                  |  3 +--
 fs/fat/fatent.c                    |  3 +--
 fs/gfs2/rgrp.c                     |  5 ++---
 fs/jbd2/checkpoint.c               |  3 +--
 fs/jbd2/commit.c                   |  6 ++----
 fs/nilfs2/the_nilfs.c              |  4 ++--
 fs/reiserfs/file.c                 |  3 +--
 fs/xfs/linux-2.6/xfs_super.c       |  3 +--
 include/linux/blkdev.h             | 14 +++++---------
 mm/swapfile.c                      |  6 +++---
 19 files changed, 47 insertions(+), 71 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 62b7df9bca9d..54b123d6563e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -205,7 +205,6 @@ static void bio_end_flush(struct bio *bio, int err)
  * @bdev:	blockdev to issue flush for
  * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
- * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
@@ -214,7 +213,7 @@ static void bio_end_flush(struct bio *bio, int err)
  *    request was pushed in some internal queue for later handling.
  */
 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
-		sector_t *error_sector, unsigned long flags)
+		sector_t *error_sector)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
@@ -240,21 +239,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_end_io = bio_end_flush;
 	bio->bi_bdev = bdev;
-	if (test_bit(BLKDEV_WAIT, &flags))
-		bio->bi_private = &wait;
+	bio->bi_private = &wait;
 
 	bio_get(bio);
 	submit_bio(WRITE_FLUSH, bio);
-	if (test_bit(BLKDEV_WAIT, &flags)) {
-		wait_for_completion(&wait);
-		/*
-		 * The driver must store the error location in ->bi_sector, if
-		 * it supports it. For non-stacked drivers, this should be
-		 * copied from blk_rq_pos(rq).
-		 */
-		if (error_sector)
-			*error_sector = bio->bi_sector;
-	}
+	wait_for_completion(&wait);
+
+	/*
+	 * The driver must store the error location in ->bi_sector, if
+	 * it supports it. For non-stacked drivers, this should be
+	 * copied from blk_rq_pos(rq).
+	 */
+	if (error_sector)
+               *error_sector = bio->bi_sector;
 
 	if (!bio_flagged(bio, BIO_UPTODATE))
 		ret = -EIO;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index fe2e6ed0f510..1a320d2406b0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -61,7 +61,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		max_discard_sectors &= ~(disc_sects - 1);
 	}
 
-	if (flags & BLKDEV_IFL_SECURE) {
+	if (flags & BLKDEV_DISCARD_SECURE) {
 		if (!blk_queue_secdiscard(q))
 			return -EOPNOTSUPP;
 		type |= REQ_SECURE;
@@ -77,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio->bi_sector = sector;
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
-		if (flags & BLKDEV_IFL_WAIT)
-			bio->bi_private = &wait;
+		bio->bi_private = &wait;
 
 		if (nr_sects > max_discard_sectors) {
 			bio->bi_size = max_discard_sectors << 9;
@@ -92,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio_get(bio);
 		submit_bio(type, bio);
 
-		if (flags & BLKDEV_IFL_WAIT)
-			wait_for_completion(&wait);
+		wait_for_completion(&wait);
 
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
 			ret = -EOPNOTSUPP;
@@ -139,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *  Generate and issue number of bios with zerofiled pages.
@@ -148,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err)
  */
 
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+			sector_t nr_sects, gfp_t gfp_mask)
 {
 	int ret;
 	struct bio *bio;
@@ -174,8 +171,7 @@ submit:
 		bio->bi_sector = sector;
 		bio->bi_bdev   = bdev;
 		bio->bi_end_io = bio_batch_end_io;
-		if (flags & BLKDEV_IFL_WAIT)
-			bio->bi_private = &bb;
+		bio->bi_private = &bb;
 
 		while (nr_sects != 0) {
 			sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -193,10 +189,9 @@ submit:
 		submit_bio(WRITE, bio);
 	}
 
-	if (flags & BLKDEV_IFL_WAIT)
-		/* Wait for bios in-flight */
-		while ( issued != atomic_read(&bb.done))
-			wait_for_completion(&wait);
+	/* Wait for bios in-flight */
+	while (issued != atomic_read(&bb.done))
+		wait_for_completion(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		/* One of bios in the batch was completed with error.*/
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..cb2b9099862b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev)
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
 {
-	unsigned long flags = BLKDEV_IFL_WAIT;
+	unsigned long flags = 0;
 
 	if (start & 511)
 		return -EINVAL;
@@ -128,7 +128,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
 	if (secure)
-		flags |= BLKDEV_IFL_SECURE;
+		flags |= BLKDEV_DISCARD_SECURE;
 	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
 }
 
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b0f92f..c2ef476f5711 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2321,8 +2321,7 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
 	if (test_bit(MD_NO_BARRIER, &mdev->flags))
 		return;
 
-	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
 	if (r) {
 		set_bit(MD_NO_BARRIER, &mdev->flags);
 		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..df15e7f0e7b7 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -975,7 +975,7 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
 
 	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
 		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
-					NULL, BLKDEV_IFL_WAIT);
+					NULL);
 		if (rv) {
 			dev_err(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..b737451e2e9d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -370,7 +370,7 @@ int blkdev_fsync(struct file *filp, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 43dc9ea9aef6..0b81ecdb101c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1695,8 +1695,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			BLKDEV_IFL_WAIT);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
 	 * storage
 	 */
 	if (needs_barrier)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-				BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..3f3ff5ee8f9d 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -128,10 +128,9 @@ int ext4_sync_file(struct file *file, int datasync)
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
 			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-					NULL, BLKDEV_IFL_WAIT);
+					NULL);
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a22bfef3da95..19aa0d44d822 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2566,8 +2566,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
-			       BLKDEV_IFL_WAIT);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index f9a0b7ae8648..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -578,8 +578,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 				sb_issue_discard(sb,
 					fat_clus_to_blknr(sbi, first_cl),
 					nr_clus * sbi->sec_per_clus,
-					GFP_NOFS,
-					BLKDEV_IFL_WAIT);
+					GFP_NOFS, 0);
 
 				first_cl = cluster;
 			}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 379316472918..38b3ea1abacc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    BLKDEV_IFL_WAIT);
+							    0);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -868,8 +868,7 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 BLKDEV_IFL_WAIT);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
 		if (rv)
 			goto fail;
 	}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6571a056e55d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -532,8 +532,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 */
 	if ((journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f204e27f44d1..cb43c605cfaa 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -684,8 +684,7 @@ start_journal_io:
 	if (commit_transaction->t_flushed_data_blocks &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -810,8 +809,7 @@ wait_for_iobuf:
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 	    journal->j_flags & JBD2_BARRIER) {
-		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-				   BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
 	}
 
 	if (err)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 400b2caef4d8..d97310f07bef 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -774,7 +774,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 			ret = blkdev_issue_discard(nilfs->ns_bdev,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
-						   GFP_NOFS, BLKDEV_IFL_WAIT);
+						   GFP_NOFS, 0);
 			if (ret < 0)
 				return ret;
 			nblocks = 0;
@@ -784,7 +784,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS, BLKDEV_IFL_WAIT);
+					   GFP_NOFS, 0);
 	return ret;
 }
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 15c35b62ff14..5fa7a30cc3f0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -693,8 +693,7 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
 }
 
 STATIC void
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cfcb3a610605..accbd0e5c893 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -867,18 +867,14 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 		return NULL;
 	return bqt->tag_index[tag];
 }
-enum{
-	BLKDEV_WAIT,	/* wait for completion */
-	BLKDEV_SECURE,	/* secure discard */
-};
-#define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
-#define BLKDEV_IFL_SECURE	(1 << BLKDEV_SECURE)
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
-			unsigned long);
+
+#define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
+
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+			sector_t nr_sects, gfp_t gfp_mask);
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 68cda164dff6..e132e1708acc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -141,7 +141,7 @@ static int discard_swap(struct swap_info_struct *si)
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+				nr_blocks, GFP_KERNEL, 0);
 		if (err)
 			return err;
 		cond_resched();
@@ -152,7 +152,7 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+				nr_blocks, GFP_KERNEL, 0);
 		if (err)
 			break;
 
@@ -191,7 +191,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
+				    nr_blocks, GFP_NOIO, 0))
 				break;
 		}
 
-- 
cgit v1.2.3-71-gd317


From 892b6f90db81cccb723d5d92f4fddc2d68b206e1 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 13 Oct 2010 21:18:03 +0200
Subject: block: Ensure physical block size is unsigned int

Physical block size was declared unsigned int to accomodate the maximum
size reported by READ CAPACITY(16).  Make sure we use the right type in
the related functions.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-settings.c   | 2 +-
 include/linux/blkdev.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index a3600a7ab8bb..315b88c8cbbb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
  *   hardware can operate on without reverting to read-modify-write
  *   operations.
  */
-void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
+void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
 {
 	q->limits.physical_block_size = size;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1341df5806df..8f3dd981b973 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -860,7 +860,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
-extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
@@ -1013,7 +1013,7 @@ static inline unsigned int queue_physical_block_size(struct request_queue *q)
 	return q->limits.physical_block_size;
 }
 
-static inline int bdev_physical_block_size(struct block_device *bdev)
+static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
 {
 	return queue_physical_block_size(bdev_get_queue(bdev));
 }
-- 
cgit v1.2.3-71-gd317


From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Date: Tue, 19 Oct 2010 09:05:00 +0200
Subject: block: fix accounting bug on cross partition merges

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partition, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belongs to sda1 is issued and is merged into the request mentioned on
   step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
   from sda2 region to sda1 region. However the two partition's
   hd_struct->in_flight are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
   sda2's hd_struct->in_flight, not a sda1's one, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.

When reloading partition tables, quiesce IO to ensure that no
request references to the partition struct exists. When it is safe
to free the partition table, the IO for that device is restarted
again.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c         | 24 ++++++++++++++++--------
 block/blk-merge.c        |  2 +-
 block/blk.h              |  4 ----
 block/genhd.c            | 14 ++++++++++++++
 fs/partitions/check.c    | 12 ++++++++++++
 include/linux/blkdev.h   |  1 +
 include/linux/elevator.h |  2 ++
 include/linux/genhd.h    |  1 +
 8 files changed, 47 insertions(+), 13 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 797d5095eb83..ddc68332d655 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-	if (!new_io)
+	if (!new_io) {
+		part = rq->part;
 		part_stat_inc(cpu, part, merges[rw]);
-	else {
+	} else {
+		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
+		rq->part = part;
 	}
 
 	part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
+	rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -804,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->starved[is_sync] = 0;
 
 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	if (priv)
+	if (priv) {
 		rl->elvpriv++;
 
-	if (blk_queue_io_stat(q))
-		rw_flags |= REQ_IO_STAT;
+		/*
+		 * Don't do stats for non-priv requests
+		 */
+		if (blk_queue_io_stat(q))
+			rw_flags |= REQ_IO_STAT;
+	}
+
 	spin_unlock_irq(q->queue_lock);
 
 	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1777,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1797,7 +1805,7 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6a725461654d..38ff234012a4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
diff --git a/block/blk.h b/block/blk.h
index 6738831ba447..1340cce5721a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
-
 /*
  * Return the threshold (number of used requests) at which the queue is
  * considered to be congested.  It include a little hysteresis to keep the
diff --git a/block/genhd.c b/block/genhd.c
index 7923e720ddf5..8313834596db 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -932,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
 {
 	struct disk_part_tbl *ptbl =
 		container_of(head, struct disk_part_tbl, rcu_head);
+	struct gendisk *disk = ptbl->disk;
+	struct request_queue *q = disk->queue;
+	unsigned long flags;
 
 	kfree(ptbl);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	elv_quiesce_end(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 /**
@@ -951,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 				  struct disk_part_tbl *new_ptbl)
 {
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
+	struct request_queue *q = disk->queue;
 
 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
 
 	if (old_ptbl) {
 		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
+
+		spin_lock_irq(q->queue_lock);
+		elv_quiesce_start(q);
+		spin_unlock_irq(q->queue_lock);
+
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
 	}
 }
@@ -996,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
 		return -ENOMEM;
 
 	new_ptbl->len = target;
+	new_ptbl->disk = disk;
 
 	for (i = 0; i < len; i++)
 		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6dfbee03ccc6..30f46c2cb9d5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -365,17 +365,25 @@ struct device_type part_type = {
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
 	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+	struct gendisk *disk = part_to_disk(part);
+	struct request_queue *q = disk->queue;
+	unsigned long flags;
 
 	part->start_sect = 0;
 	part->nr_sects = 0;
 	part_stat_set_all(part, 0);
 	put_device(part_to_dev(part));
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	elv_quiesce_end(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 void delete_partition(struct gendisk *disk, int partno)
 {
 	struct disk_part_tbl *ptbl = disk->part_tbl;
 	struct hd_struct *part;
+	struct request_queue *q = disk->queue;
 
 	if (partno >= ptbl->len)
 		return;
@@ -390,6 +398,10 @@ void delete_partition(struct gendisk *disk, int partno)
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
+	spin_lock_irq(q->queue_lock);
+	elv_quiesce_start(q);
+	spin_unlock_irq(q->queue_lock);
+
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8f3dd981b973..16f7f1be1acf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
 	void *elevator_private3;
 
 	struct gendisk *rq_disk;
+	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2c958f4fce1e..df1ee866d715 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -121,6 +121,8 @@ extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
 extern void elv_put_request(struct request_queue *, struct request *);
 extern void elv_drain_elevator(struct request_queue *);
+extern void elv_quiesce_start(struct request_queue *);
+extern void elv_quiesce_end(struct request_queue *);
 
 /*
  * io scheduler registration
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 66e26b5a1537..57647ecfc1bd 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -140,6 +140,7 @@ struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
 	struct hd_struct *last_lookup;
+	struct gendisk *disk;
 	struct hd_struct *part[];
 };
 
-- 
cgit v1.2.3-71-gd317


From f253b86b4ad1b3220544e75880510fd455ebd23f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Sun, 24 Oct 2010 22:06:02 +0200
Subject: Revert "block: fix accounting bug on cross partition merges"

This reverts commit 7681bfeeccff5efa9eb29bf09249a3c400b15327.

Conflicts:

	include/linux/genhd.h

It has numerous issues with the cleanup path and non-elevator
devices. Revert it for now so we can come up with a clean
version without rushing things.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c         | 24 ++++++++----------------
 block/blk-merge.c        |  2 +-
 block/blk.h              |  4 ++++
 block/genhd.c            | 14 --------------
 fs/partitions/check.c    | 12 ------------
 include/linux/blkdev.h   |  1 -
 include/linux/elevator.h |  2 --
 include/linux/genhd.h    |  1 -
 8 files changed, 13 insertions(+), 47 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 51efd835d4cf..f8548876d7ea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,15 +64,13 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
+	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-	if (!new_io) {
-		part = rq->part;
+	if (!new_io)
 		part_stat_inc(cpu, part, merges[rw]);
-	} else {
-		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+	else {
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
-		rq->part = part;
 	}
 
 	part_stat_unlock();
@@ -130,7 +128,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
-	rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -805,16 +802,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->starved[is_sync] = 0;
 
 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	if (priv) {
+	if (priv)
 		rl->elvpriv++;
 
-		/*
-		 * Don't do stats for non-priv requests
-		 */
-		if (blk_queue_io_stat(q))
-			rw_flags |= REQ_IO_STAT;
-	}
-
+	if (blk_queue_io_stat(q))
+		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
 	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1791,7 +1783,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = req->part;
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1811,7 +1803,7 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = req->part;
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 0a2fd8a48a38..77b7c26df6b5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = req->part;
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
diff --git a/block/blk.h b/block/blk.h
index 1e675e5ade02..2db8f32838e7 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -116,6 +116,10 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
+void elv_quiesce_start(struct request_queue *q);
+void elv_quiesce_end(struct request_queue *q);
+
+
 /*
  * Return the threshold (number of used requests) at which the queue is
  * considered to be congested.  It include a little hysteresis to keep the
diff --git a/block/genhd.c b/block/genhd.c
index a8adf96a4b41..5fa2b44a72ff 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -929,15 +929,8 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
 {
 	struct disk_part_tbl *ptbl =
 		container_of(head, struct disk_part_tbl, rcu_head);
-	struct gendisk *disk = ptbl->disk;
-	struct request_queue *q = disk->queue;
-	unsigned long flags;
 
 	kfree(ptbl);
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	elv_quiesce_end(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 /**
@@ -955,17 +948,11 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 				  struct disk_part_tbl *new_ptbl)
 {
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
-	struct request_queue *q = disk->queue;
 
 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
 
 	if (old_ptbl) {
 		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
-
-		spin_lock_irq(q->queue_lock);
-		elv_quiesce_start(q);
-		spin_unlock_irq(q->queue_lock);
-
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
 	}
 }
@@ -1006,7 +993,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
 		return -ENOMEM;
 
 	new_ptbl->len = target;
-	new_ptbl->disk = disk;
 
 	for (i = 0; i < len; i++)
 		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index b81bfc016a05..0a8b0ad0c7e2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -365,25 +365,17 @@ struct device_type part_type = {
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
 	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
-	struct gendisk *disk = part_to_disk(part);
-	struct request_queue *q = disk->queue;
-	unsigned long flags;
 
 	part->start_sect = 0;
 	part->nr_sects = 0;
 	part_stat_set_all(part, 0);
 	put_device(part_to_dev(part));
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	elv_quiesce_end(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 void delete_partition(struct gendisk *disk, int partno)
 {
 	struct disk_part_tbl *ptbl = disk->part_tbl;
 	struct hd_struct *part;
-	struct request_queue *q = disk->queue;
 
 	if (partno >= ptbl->len)
 		return;
@@ -398,10 +390,6 @@ void delete_partition(struct gendisk *disk, int partno)
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
-	spin_lock_irq(q->queue_lock);
-	elv_quiesce_start(q);
-	spin_unlock_irq(q->queue_lock);
-
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 009b80e49f53..646b462d04df 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,7 +115,6 @@ struct request {
 	void *elevator_private3;
 
 	struct gendisk *rq_disk;
-	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 80a0ece8f7e4..4fd978e7eb83 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -122,8 +122,6 @@ extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
 extern void elv_put_request(struct request_queue *, struct request *);
 extern void elv_drain_elevator(struct request_queue *);
-extern void elv_quiesce_start(struct request_queue *);
-extern void elv_quiesce_end(struct request_queue *);
 
 /*
  * io scheduler registration
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 557c3927e70f..7a7b9c1644e4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -140,7 +140,6 @@ struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
 	struct hd_struct __rcu *last_lookup;
-	struct gendisk *disk;
 	struct hd_struct __rcu *part[];
 };
 
-- 
cgit v1.2.3-71-gd317


From e6fa0be699449d28a20e815bfe9ce26725ec4962 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:04 -0400
Subject: Add helper function for blkdev_issue_zeroout (sb_issue_discard)

This is done the same way as helper sb_issue_discard for
blkdev_issue_discard.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 include/linux/blkdev.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c54906f678f..e5cb4d029689 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -941,6 +941,14 @@ static inline int sb_issue_discard(struct super_block *sb,
 	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
 				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
+static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
+		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
+{
+	return blkdev_issue_zeroout(sb->s_bdev,
+				    block << (sb->s_blocksize_bits - 9),
+				    nr_blocks << (sb->s_blocksize_bits - 9),
+				    gfp_mask, flags);
+}
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 
-- 
cgit v1.2.3-71-gd317