From: Jens Axboe <axboe@suse.de>


DESC
dm plug buglet
EDESC
From: Jens Axboe <axboe@suse.de>

At SUSE I reproduced a problem with dm and ide disks on a 4-way machine,
where the ide request_fn would discover the queue plugged all of a sudden
and then give up on doing I/O.  The problem is that dm now sets
QUEUE_FLAG_PLUGGED without holding the target queue lock.  I think the best
fix is simply not to set the PLUGGED bit outside of the queue lock, and to
let __generic_unplug_device() always call down into the request_fn() even
if the queue wasn't plugged.  This should be a very rare occurrence.
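
In outline, the plugged state becomes an atomic queue flag, so
blk_plug_device() and blk_remove_plug() no longer need the global
blk_plug_lock (this is the core of the ll_rw_blk.c change below):

	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);

and correspondingly on unplug:

	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
		return 0;

	del_timer(&q->unplug_timer);
	return 1;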
DESC
per-backing-dev unplugging: fix BIO_RW_SYNC handling
EDESC

It's a bitshift, not a bitmask.
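
BIO_RW_SYNC is a bit number (bit 4), so it must be shifted before being
OR'd into the bi_rw mask.  The READ_SYNC/WRITE_SYNC helpers below do this;
for illustration, OR'ing in the unshifted value would set bit 2, which is
BIO_RW_BARRIER:

	#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
	#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))

	submit_bio(rw | BIO_RW_SYNC, bio);		/* wrong: sets BIO_RW_BARRIER */
	submit_bio(rw | (1 << BIO_RW_SYNC), bio);	/* right */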
DESC
per-backing dev unplugging oops fix #42
EDESC
From: Chris Mason <mason@suse.com>

Hmpf, one more.  If one proc does a wait_on_buffer while another does a
discard_buffer, bh->b_bdev might be NULL by the time __wait_on_buffer uses
it.

Someone hit this with reiserfs, but it should be possible to trigger it
anywhere.
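
The fix in the fs/buffer.c hunk below is to take a local copy of
bh->b_bdev after the memory barrier and only dereference that:

	struct block_device *bd;

	smp_mb();
	bd = bh->b_bdev;
	if (bd)
		blk_run_address_space(bd->bd_inode->i_mapping);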
DESC
fix md for per-address_space unplugging
EDESC
From: Jens Axboe <axboe@suse.de>


DESC
more backing_dev unplug functions
EDESC

shmem.c and rd.c have standalone backing_dev_infos.
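
They just get the no-op unplug function, e.g. for shmem (rd.c's
rd_backing_dev_info is wired up the same way):

	static struct backing_dev_info shmem_backing_dev_info = {
		.ra_pages	= 0,	/* No readahead */
		.memory_backed	= 1,	/* Does not contribute to dirty memory */
		.unplug_io_fn	= default_unplug_io_fn,
	};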
DESC
plugged bit
EDESC
From: Jens Axboe <axboe@suse.de>

Following some consideration, I think it's better to simply always invoke
the request_fn on unplug, even if the device wasn't plugged.  This solves
the problem of md + dm setting the plugged bit unconditionally _outside_ of
the queue lock, thus confusing some drivers that check this bit for sanity
in their request_fn.
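
With that, the head of __generic_unplug_device() looks like this (the tail
that fires the request_fn when the queue has work is unchanged):

	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
		return;

	/*
	 * always call down, since we can race now with setting the plugged
	 * bit outside of the queue lock
	 */
	blk_remove_plug(q);

	/* then fire request_fn if the queue has stuff to do, as before */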


---

 25-akpm/drivers/block/ll_rw_blk.c    |  108 ++++++++++++-----------------------
 25-akpm/drivers/block/loop.c         |   15 ++++
 25-akpm/drivers/block/rd.c           |    1 
 25-akpm/drivers/block/umem.c         |    3 
 25-akpm/drivers/md/dm-crypt.c        |    2 
 25-akpm/drivers/md/dm-table.c        |   16 +++++
 25-akpm/drivers/md/dm.c              |   23 ++++++-
 25-akpm/drivers/md/dm.h              |    1 
 25-akpm/drivers/md/md.c              |   32 +++++++++-
 25-akpm/drivers/md/raid1.c           |    3 
 25-akpm/drivers/md/raid5.c           |    4 -
 25-akpm/drivers/md/raid6main.c       |    3 
 25-akpm/drivers/mtd/devices/blkmtd.c |    6 -
 25-akpm/fs/buffer.c                  |   12 ++-
 25-akpm/fs/direct-io.c               |    4 -
 25-akpm/fs/jfs/jfs_logmgr.c          |    6 -
 25-akpm/fs/ntfs/compress.c           |    3 
 25-akpm/fs/ufs/truncate.c            |    3 
 25-akpm/fs/xfs/linux/xfs_buf.c       |   24 ++-----
 25-akpm/include/linux/backing-dev.h  |    3 
 25-akpm/include/linux/bio.h          |    3 
 25-akpm/include/linux/blkdev.h       |   23 +++++--
 25-akpm/include/linux/fs.h           |    2 
 25-akpm/include/linux/raid/md.h      |    1 
 25-akpm/include/linux/raid/md_k.h    |   26 --------
 25-akpm/include/linux/swap.h         |    2 
 25-akpm/kernel/power/disk.c          |    1 
 25-akpm/kernel/power/pmdisk.c        |    3 
 25-akpm/kernel/power/swsusp.c        |    5 -
 25-akpm/mm/filemap.c                 |    4 -
 25-akpm/mm/mempool.c                 |    2 
 25-akpm/mm/readahead.c               |    8 +-
 25-akpm/mm/shmem.c                   |    1 
 25-akpm/mm/swap_state.c              |    1 
 25-akpm/mm/swapfile.c                |   65 ++++++++++++++++++++-
 35 files changed, 259 insertions(+), 160 deletions(-)

diff -puN drivers/block/ll_rw_blk.c~per-backing_dev-unplugging drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~per-backing_dev-unplugging	2004-03-26 12:35:28.193934880 -0800
+++ 25-akpm/drivers/block/ll_rw_blk.c	2004-03-26 12:35:28.254925608 -0800
@@ -41,12 +41,6 @@ static void blk_unplug_timeout(unsigned 
  */
 static kmem_cache_t *request_cachep;
 
-/*
- * plug management
- */
-static LIST_HEAD(blk_plug_list);
-static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -236,9 +230,13 @@ void blk_queue_make_request(request_queu
 	blk_queue_dma_alignment(q, 511);
 
 	q->unplug_thresh = 4;		/* hmm */
+#if 0
 	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
 	if (q->unplug_delay == 0)
 		q->unplug_delay = 1;
+#else
+	q->unplug_delay = HZ;
+#endif
 
 	INIT_WORK(&q->unplug_work, blk_unplug_work, q);
 
@@ -250,8 +248,6 @@ void blk_queue_make_request(request_queu
 	 */
 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
-	INIT_LIST_HEAD(&q->plug_list);
-
 	blk_queue_activity_fn(q, NULL, NULL);
 }
 
@@ -1103,13 +1099,11 @@ void blk_plug_device(request_queue_t *q)
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
 	 * which will restart the queueing
 	 */
-	if (!blk_queue_plugged(q)
-	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
-		spin_lock(&blk_plug_lock);
-		list_add_tail(&q->plug_list, &blk_plug_list);
+	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+		return;
+
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		spin_unlock(&blk_plug_lock);
-	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1121,15 +1115,12 @@ EXPORT_SYMBOL(blk_plug_device);
 int blk_remove_plug(request_queue_t *q)
 {
 	WARN_ON(!irqs_disabled());
-	if (blk_queue_plugged(q)) {
-		spin_lock(&blk_plug_lock);
-		list_del_init(&q->plug_list);
-		del_timer(&q->unplug_timer);
-		spin_unlock(&blk_plug_lock);
-		return 1;
-	}
 
-	return 0;
+	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+		return 0;
+
+	del_timer(&q->unplug_timer);
+	return 1;
 }
 
 EXPORT_SYMBOL(blk_remove_plug);
@@ -1142,8 +1133,11 @@ static inline void __generic_unplug_devi
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!blk_remove_plug(q))
-		return;
+	/*
+	 * always call down, since we can race now with setting the plugged
+	 * bit outside of the queue lock
+	 */
+	blk_remove_plug(q);
 
 	/*
 	 * was plugged, fire request_fn if queue has stuff to do
@@ -1160,14 +1154,11 @@ static inline void __generic_unplug_devi
  *   Linux uses plugging to build bigger requests queues before letting
  *   the device have at them. If a queue is plugged, the I/O scheduler
  *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged (either by manually calling this function, or by
- *   calling blk_run_queues()), the request_fn defined for the
- *   queue is invoked and transfers started.
+ *   gets unplugged, the request_fn defined for the queue is invoked and
+ *   transfers started.
  **/
-void generic_unplug_device(void *data)
+void generic_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
-
 	spin_lock_irq(q->queue_lock);
 	__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
@@ -1175,9 +1166,23 @@ void generic_unplug_device(void *data)
 
 EXPORT_SYMBOL(generic_unplug_device);
 
+static inline void blk_backing_dev_unplug(struct backing_dev_info *bdi)
+{
+	request_queue_t *q = bdi->unplug_io_data;
+
+	/*
+	 * devices don't necessarily have an ->unplug_fn defined
+	 */
+	if (q->unplug_fn)
+		q->unplug_fn(q);
+}
+
+EXPORT_SYMBOL(blk_backing_dev_unplug);
+
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
+
 	q->unplug_fn(q);
 }
 
@@ -1255,42 +1260,6 @@ void blk_run_queue(struct request_queue 
 EXPORT_SYMBOL(blk_run_queue);
 
 /**
- * blk_run_queues - fire all plugged queues
- *
- * Description:
- *   Start I/O on all plugged queues known to the block layer. Queues that
- *   are currently stopped are ignored. This is equivalent to the older
- *   tq_disk task queue run.
- **/
-#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
-void blk_run_queues(void)
-{
-	LIST_HEAD(local_plug_list);
-
-	spin_lock_irq(&blk_plug_lock);
-
-	/*
-	 * this will happen fairly often
-	 */
-	if (list_empty(&blk_plug_list))
-		goto out;
-
-	list_splice_init(&blk_plug_list, &local_plug_list);
-	
-	while (!list_empty(&local_plug_list)) {
-		request_queue_t *q = blk_plug_entry(local_plug_list.next);
-
-		spin_unlock_irq(&blk_plug_lock);
-		q->unplug_fn(q);
-		spin_lock_irq(&blk_plug_lock);
-	}
-out:
-	spin_unlock_irq(&blk_plug_lock);
-}
-
-EXPORT_SYMBOL(blk_run_queues);
-
-/**
  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
  * @q:    the request queue to be released
  *
@@ -1395,6 +1364,10 @@ request_queue_t *blk_alloc_queue(int gfp
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
 	atomic_set(&q->refcnt, 1);
+
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+
 	return q;
 }
 
@@ -2055,7 +2028,6 @@ long blk_congestion_wait(int rw, long ti
 	DEFINE_WAIT(wait);
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
 
-	blk_run_queues();
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
@@ -2320,7 +2292,7 @@ out:
 	if (blk_queue_plugged(q)) {
 		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
 
-		if (nr_queued == q->unplug_thresh)
+		if (nr_queued == q->unplug_thresh || bio_sync(bio))
 			__generic_unplug_device(q);
 	}
 	spin_unlock_irq(q->queue_lock);
diff -puN drivers/block/loop.c~per-backing_dev-unplugging drivers/block/loop.c
--- 25/drivers/block/loop.c~per-backing_dev-unplugging	2004-03-26 12:35:28.194934728 -0800
+++ 25-akpm/drivers/block/loop.c	2004-03-26 12:35:28.255925456 -0800
@@ -434,6 +434,17 @@ inactive:
 	goto out;
 }
 
+/*
+ * kick off io on the underlying address space
+ */
+static void loop_unplug(request_queue_t *q)
+{
+	struct loop_device *lo = q->queuedata;
+
+	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+	blk_run_address_space(lo->lo_backing_file->f_mapping);
+}
+
 struct switch_request {
 	struct file *file;
 	struct completion wait;
@@ -614,7 +625,6 @@ static int loop_set_fd(struct loop_devic
 {
 	struct file	*file;
 	struct inode	*inode;
-	struct block_device *lo_device = NULL;
 	struct address_space *mapping;
 	unsigned lo_blocksize;
 	int		lo_flags = 0;
@@ -671,7 +681,7 @@ static int loop_set_fd(struct loop_devic
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
 	lo->lo_blocksize = lo_blocksize;
-	lo->lo_device = lo_device;
+	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
 	lo->lo_backing_file = file;
 	lo->transfer = NULL;
@@ -689,6 +699,7 @@ static int loop_set_fd(struct loop_devic
 	 */
 	blk_queue_make_request(lo->lo_queue, loop_make_request);
 	lo->lo_queue->queuedata = lo;
+	lo->lo_queue->unplug_fn = loop_unplug;
 
 	set_capacity(disks[lo->lo_number], size);
 
diff -puN drivers/block/umem.c~per-backing_dev-unplugging drivers/block/umem.c
--- 25/drivers/block/umem.c~per-backing_dev-unplugging	2004-03-26 12:35:28.195934576 -0800
+++ 25-akpm/drivers/block/umem.c	2004-03-26 12:35:28.256925304 -0800
@@ -368,9 +368,8 @@ static inline void reset_page(struct mm_
 	page->biotail = & page->bio;
 }
 
-static void mm_unplug_device(void *data)
+static void mm_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	struct cardinfo *card = q->queuedata;
 	unsigned long flags;
 
diff -puN drivers/md/dm.c~per-backing_dev-unplugging drivers/md/dm.c
--- 25/drivers/md/dm.c~per-backing_dev-unplugging	2004-03-26 12:35:28.197934272 -0800
+++ 25-akpm/drivers/md/dm.c	2004-03-26 12:35:28.257925152 -0800
@@ -575,6 +575,17 @@ static int dm_request(request_queue_t *q
 	return 0;
 }
 
+static void dm_unplug_all(request_queue_t *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+}
+
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r;
@@ -672,6 +683,7 @@ static struct mapped_device *alloc_dev(u
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
+	md->queue->unplug_fn = dm_unplug_all;
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
@@ -896,11 +908,17 @@ int dm_suspend(struct mapped_device *md)
 	add_wait_queue(&md->wait, &wait);
 	up_write(&md->lock);
 
+	/* unplug */
+	map = dm_get_table(md);
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+
 	/*
 	 * Then we wait for the already mapped ios to
 	 * complete.
 	 */
-	blk_run_queues();
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -945,10 +963,9 @@ int dm_resume(struct mapped_device *md)
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
+	dm_table_unplug_all(md->map);
 	dm_table_put(map);
 
-	blk_run_queues();
-
 	return 0;
 }
 
diff -puN drivers/md/dm-crypt.c~per-backing_dev-unplugging drivers/md/dm-crypt.c
--- 25/drivers/md/dm-crypt.c~per-backing_dev-unplugging	2004-03-26 12:35:28.198934120 -0800
+++ 25-akpm/drivers/md/dm-crypt.c	2004-03-26 12:35:28.258925000 -0800
@@ -668,7 +668,7 @@ static int crypt_map(struct dm_target *t
 
 		/* out of memory -> run queues */
 		if (remaining)
-			blk_run_queues();
+			blk_congestion_wait(bio_data_dir(clone), HZ/100);
 	}
 
 	/* drop reference, clones could have returned before we reach this */
diff -puN drivers/md/dm.h~per-backing_dev-unplugging drivers/md/dm.h
--- 25/drivers/md/dm.h~per-backing_dev-unplugging	2004-03-26 12:35:28.199933968 -0800
+++ 25-akpm/drivers/md/dm.h	2004-03-26 12:35:28.258925000 -0800
@@ -116,6 +116,7 @@ int dm_table_get_mode(struct dm_table *t
 void dm_table_suspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+void dm_table_unplug_all(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff -puN drivers/md/dm-table.c~per-backing_dev-unplugging drivers/md/dm-table.c
--- 25/drivers/md/dm-table.c~per-backing_dev-unplugging	2004-03-26 12:35:28.201933664 -0800
+++ 25-akpm/drivers/md/dm-table.c	2004-03-26 12:35:28.259924848 -0800
@@ -885,8 +885,24 @@ int dm_table_any_congested(struct dm_tab
 	return r;
 }
 
+void dm_table_unplug_all(struct dm_table *t)
+{
+	struct list_head *d, *devices = dm_table_get_devices(t);
+
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+
+		if (q->unplug_fn)
+			q->unplug_fn(q);
+	}
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
 EXPORT_SYMBOL(dm_table_event);
 EXPORT_SYMBOL(dm_table_get_mode);
+EXPORT_SYMBOL(dm_table_put);
+EXPORT_SYMBOL(dm_table_get);
+EXPORT_SYMBOL(dm_table_unplug_all);
diff -puN drivers/md/md.c~per-backing_dev-unplugging drivers/md/md.c
--- 25/drivers/md/md.c~per-backing_dev-unplugging	2004-03-26 12:35:28.202933512 -0800
+++ 25-akpm/drivers/md/md.c	2004-03-26 12:35:28.266923784 -0800
@@ -160,6 +160,30 @@ static int md_fail_request (request_queu
 	return 0;
 }
 
+void md_unplug_mddev(mddev_t *mddev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	/*
+	 * this list iteration is done without any locking in md?!
+	 */
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+EXPORT_SYMBOL(md_unplug_mddev);
+
+static void md_unplug_all(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+
+	md_unplug_mddev(mddev);
+}
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -335,6 +359,8 @@ static int sync_page_io(struct block_dev
 	struct bio_vec vec;
 	struct completion event;
 
+	rw |= (1 << BIO_RW_SYNC);
+
 	bio_init(&bio);
 	bio.bi_io_vec = &vec;
 	vec.bv_page = page;
@@ -349,7 +375,6 @@ static int sync_page_io(struct block_dev
 	bio.bi_private = &event;
 	bio.bi_end_io = bi_complete;
 	submit_bio(rw, &bio);
-	blk_run_queues();
 	wait_for_completion(&event);
 
 	return test_bit(BIO_UPTODATE, &bio.bi_flags);
@@ -1644,6 +1669,7 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
+	mddev->queue->unplug_fn = md_unplug_all;
 
 	mddev->changed = 1;
 	return 0;
@@ -2718,7 +2744,7 @@ int md_thread(void * arg)
 		run = thread->run;
 		if (run) {
 			run(thread->mddev);
-			blk_run_queues();
+			md_unplug_mddev(thread->mddev);
 		}
 		if (signal_pending(current))
 			flush_signals(current);
@@ -3287,7 +3313,7 @@ static void md_do_sync(mddev_t *mddev)
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		blk_run_queues();
+		md_unplug_mddev(mddev);
 
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
diff -puN drivers/md/raid1.c~per-backing_dev-unplugging drivers/md/raid1.c
--- 25/drivers/md/raid1.c~per-backing_dev-unplugging	2004-03-26 12:35:28.203933360 -0800
+++ 25-akpm/drivers/md/raid1.c	2004-03-26 12:35:28.268923480 -0800
@@ -451,6 +451,7 @@ rb_out:
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
 	
@@ -478,6 +479,7 @@ static int make_request(request_queue_t 
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	conf->nr_pending++;
@@ -644,6 +646,7 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	spin_unlock_irq(&conf->resync_lock);
diff -puN drivers/md/raid5.c~per-backing_dev-unplugging drivers/md/raid5.c
--- 25/drivers/md/raid5.c~per-backing_dev-unplugging	2004-03-26 12:35:28.205933056 -0800
+++ 25-akpm/drivers/md/raid5.c	2004-03-26 12:35:28.269923328 -0800
@@ -249,6 +249,7 @@ static struct stripe_head *get_active_st
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
+				md_unplug_mddev(conf->mddev);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
@@ -1292,9 +1293,8 @@ static inline void raid5_activate_delaye
 		}
 	}
 }
-static void raid5_unplug_device(void *data)
+static void raid5_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff -puN drivers/md/raid6main.c~per-backing_dev-unplugging drivers/md/raid6main.c
--- 25/drivers/md/raid6main.c~per-backing_dev-unplugging	2004-03-26 12:35:28.206932904 -0800
+++ 25-akpm/drivers/md/raid6main.c	2004-03-26 12:35:28.270923176 -0800
@@ -1454,9 +1454,8 @@ static inline void raid6_activate_delaye
 		}
 	}
 }
-static void raid6_unplug_device(void *data)
+static void raid6_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid6_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff -puN drivers/mtd/devices/blkmtd.c~per-backing_dev-unplugging drivers/mtd/devices/blkmtd.c
--- 25/drivers/mtd/devices/blkmtd.c~per-backing_dev-unplugging	2004-03-26 12:35:28.208932600 -0800
+++ 25-akpm/drivers/mtd/devices/blkmtd.c	2004-03-26 12:35:28.271923024 -0800
@@ -147,8 +147,7 @@ static int blkmtd_readpage(struct blkmtd
 		bio->bi_private = &event;
 		bio->bi_end_io = bi_read_complete;
 		if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
-			submit_bio(READ, bio);
-			blk_run_queues();
+			submit_bio(READ_SYNC, bio);
 			wait_for_completion(&event);
 			err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
 			bio_put(bio);
@@ -179,8 +178,7 @@ static int blkmtd_write_out(struct bio *
 	init_completion(&event);
 	bio->bi_private = &event;
 	bio->bi_end_io = bi_write_complete;
-	submit_bio(WRITE, bio);
-	blk_run_queues();
+	submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&event);
 	DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt);
 	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
diff -puN fs/buffer.c~per-backing_dev-unplugging fs/buffer.c
--- 25/fs/buffer.c~per-backing_dev-unplugging	2004-03-26 12:35:28.209932448 -0800
+++ 25-akpm/fs/buffer.c	2004-03-26 12:35:28.277922112 -0800
@@ -132,7 +132,11 @@ void __wait_on_buffer(struct buffer_head
 	do {
 		prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 		if (buffer_locked(bh)) {
-			blk_run_queues();
+			struct block_device *bd;
+			smp_mb();
+			bd = bh->b_bdev;
+			if (bd)
+				blk_run_address_space(bd->bd_inode->i_mapping);
 			io_schedule();
 		}
 	} while (buffer_locked(bh));
@@ -491,7 +495,6 @@ static void free_more_memory(void)
 	pg_data_t *pgdat;
 
 	wakeup_bdflush(1024);
-	blk_run_queues();
 	yield();
 
 	for_each_pgdat(pgdat) {
@@ -2925,7 +2928,10 @@ EXPORT_SYMBOL(try_to_free_buffers);
 
 int block_sync_page(struct page *page)
 {
-	blk_run_queues();
+	struct address_space *mapping;
+	smp_mb();
+	mapping = page->mapping;
+	blk_run_address_space(mapping);
 	return 0;
 }
 
diff -puN fs/direct-io.c~per-backing_dev-unplugging fs/direct-io.c
--- 25/fs/direct-io.c~per-backing_dev-unplugging	2004-03-26 12:35:28.211932144 -0800
+++ 25-akpm/fs/direct-io.c	2004-03-26 12:35:28.278921960 -0800
@@ -329,7 +329,7 @@ static struct bio *dio_await_one(struct 
 		if (dio->bio_list == NULL) {
 			dio->waiter = current;
 			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
-			blk_run_queues();
+			blk_run_address_space(dio->inode->i_mapping);
 			io_schedule();
 			spin_lock_irqsave(&dio->bio_list_lock, flags);
 			dio->waiter = NULL;
@@ -960,7 +960,7 @@ direct_io_worker(int rw, struct kiocb *i
 		if (ret == 0)
 			ret = dio->result;	/* Bytes written */
 		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 	} else {
 		finished_one_bio(dio);
 		ret2 = dio_await_completion(dio);
diff -puN fs/jfs/jfs_logmgr.c~per-backing_dev-unplugging fs/jfs/jfs_logmgr.c
--- 25/fs/jfs/jfs_logmgr.c~per-backing_dev-unplugging	2004-03-26 12:35:28.213931840 -0800
+++ 25-akpm/fs/jfs/jfs_logmgr.c	2004-03-26 12:35:28.284921048 -0800
@@ -1976,8 +1976,7 @@ static int lbmRead(struct jfs_log * log,
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	submit_bio(READ, bio);
-	blk_run_queues();
+	submit_bio(READ_SYNC, bio);
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
 
@@ -2121,9 +2120,8 @@ static void lbmStartIO(struct lbuf * bp)
 
 	/* check if journaling to disk has been disabled */
 	if (!log->no_integrity) {
-		submit_bio(WRITE, bio);
+		submit_bio(WRITE_SYNC, bio);
 		INCREMENT(lmStat.submitted);
-		blk_run_queues();
 	}
 	else {
 		bio->bi_size = 0;
diff -puN fs/ntfs/compress.c~per-backing_dev-unplugging fs/ntfs/compress.c
--- 25/fs/ntfs/compress.c~per-backing_dev-unplugging	2004-03-26 12:35:28.217931232 -0800
+++ 25-akpm/fs/ntfs/compress.c	2004-03-26 12:35:28.284921048 -0800
@@ -23,6 +23,7 @@
 
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 
 #include "ntfs.h"
 
@@ -668,7 +669,7 @@ lock_retry_remap:
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_queues();
+			blk_run_address_space(mapping);
 			schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
diff -puN fs/ufs/truncate.c~per-backing_dev-unplugging fs/ufs/truncate.c
--- 25/fs/ufs/truncate.c~per-backing_dev-unplugging	2004-03-26 12:35:28.218931080 -0800
+++ 25-akpm/fs/ufs/truncate.c	2004-03-26 12:35:28.285920896 -0800
@@ -38,6 +38,7 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <linux/sched.h>
 
 #include "swab.h"
@@ -456,7 +457,7 @@ void ufs_truncate (struct inode * inode)
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		yield();
 	}
 	offset = inode->i_size & uspi->s_fshift;
diff -puN fs/xfs/linux/xfs_buf.c~per-backing_dev-unplugging fs/xfs/linux/xfs_buf.c
--- 25/fs/xfs/linux/xfs_buf.c~per-backing_dev-unplugging	2004-03-26 12:35:28.220930776 -0800
+++ 25-akpm/fs/xfs/linux/xfs_buf.c	2004-03-26 12:35:28.287920592 -0800
@@ -1013,7 +1013,7 @@ pagebuf_lock(
 {
 	PB_TRACE(pb, "lock", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_sema);
 	PB_SET_OWNER(pb);
 	PB_TRACE(pb, "locked", 0);
@@ -1109,7 +1109,7 @@ _pagebuf_wait_unpin(
 		if (atomic_read(&pb->pb_pin_count) == 0)
 			break;
 		if (atomic_read(&pb->pb_io_remaining))
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		schedule();
 	}
 	remove_wait_queue(&pb->pb_waiters, &wait);
@@ -1407,7 +1407,7 @@ submit_io:
 	if (pb->pb_flags & PBF_RUN_QUEUES) {
 		pb->pb_flags &= ~PBF_RUN_QUEUES;
 		if (atomic_read(&pb->pb_io_remaining) > 1)
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 	}
 }
 
@@ -1471,7 +1471,7 @@ pagebuf_iowait(
 {
 	PB_TRACE(pb, "iowait", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_iodonesema);
 	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
 	return pb->pb_error;
@@ -1617,7 +1617,6 @@ STATIC int
 pagebuf_daemon(
 	void			*data)
 {
-	int			count;
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 
@@ -1640,7 +1639,6 @@ pagebuf_daemon(
 
 		spin_lock(&pbd_delwrite_lock);
 
-		count = 0;
 		list_for_each_safe(curr, next, &pbd_delwrite_queue) {
 			pb = list_entry(curr, page_buf_t, pb_list);
 
@@ -1657,7 +1655,6 @@ pagebuf_daemon(
 				pb->pb_flags &= ~PBF_DELWRI;
 				pb->pb_flags |= PBF_WRITE;
 				list_move(&pb->pb_list, &tmp);
-				count++;
 			}
 		}
 
@@ -1667,12 +1664,11 @@ pagebuf_daemon(
 			list_del_init(&pb->pb_list);
 
 			pagebuf_iostrategy(pb);
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		}
 
 		if (as_list_len > 0)
 			purge_addresses();
-		if (count)
-			blk_run_queues();
 
 		force_flush = 0;
 	} while (pagebuf_daemon_active);
@@ -1689,7 +1685,6 @@ pagebuf_delwri_flush(
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 	int			pincount = 0;
-	int			flush_cnt = 0;
 
 	pagebuf_runall_queues(pagebuf_dataio_workqueue);
 	pagebuf_runall_queues(pagebuf_logio_workqueue);
@@ -1733,14 +1728,8 @@ pagebuf_delwri_flush(
 
 		pagebuf_lock(pb);
 		pagebuf_iostrategy(pb);
-		if (++flush_cnt > 32) {
-			blk_run_queues();
-			flush_cnt = 0;
-		}
 	}
 
-	blk_run_queues();
-
 	while (!list_empty(&tmp)) {
 		pb = list_entry(tmp.next, page_buf_t, pb_list);
 
@@ -1751,6 +1740,9 @@ pagebuf_delwri_flush(
 		pagebuf_rele(pb);
 	}
 
+	if (flags & PBDF_WAIT)
+		blk_run_address_space(target->pbr_mapping);
+
 	if (pinptr)
 		*pinptr = pincount;
 }
diff -puN include/linux/backing-dev.h~per-backing_dev-unplugging include/linux/backing-dev.h
--- 25/include/linux/backing-dev.h~per-backing_dev-unplugging	2004-03-26 12:35:28.221930624 -0800
+++ 25-akpm/include/linux/backing-dev.h	2004-03-26 12:35:28.287920592 -0800
@@ -28,9 +28,12 @@ struct backing_dev_info {
 	int memory_backed;	/* Cannot clean pages with writepage */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
+	void (*unplug_io_fn)(struct backing_dev_info *);
+	void *unplug_io_data;
 };
 
 extern struct backing_dev_info default_backing_dev_info;
+void default_unplug_io_fn(struct backing_dev_info *bdi);
 
 int writeback_acquire(struct backing_dev_info *bdi);
 int writeback_in_progress(struct backing_dev_info *bdi);
diff -puN include/linux/bio.h~per-backing_dev-unplugging include/linux/bio.h
--- 25/include/linux/bio.h~per-backing_dev-unplugging	2004-03-26 12:35:28.222930472 -0800
+++ 25-akpm/include/linux/bio.h	2004-03-26 12:35:28.288920440 -0800
@@ -119,11 +119,13 @@ struct bio {
  * bit 1 -- rw-ahead when set
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
+ * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
+#define BIO_RW_SYNC	4
 
 /*
  * various member access, note that bio_data should of course not be used
@@ -138,6 +140,7 @@ struct bio {
 #define bio_cur_sectors(bio)	(bio_iovec(bio)->bv_len >> 9)
 #define bio_data(bio)		(page_address(bio_page((bio))) + bio_offset((bio)))
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
+#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
 
 /*
  * will die
diff -puN include/linux/blkdev.h~per-backing_dev-unplugging include/linux/blkdev.h
--- 25/include/linux/blkdev.h~per-backing_dev-unplugging	2004-03-26 12:35:28.223930320 -0800
+++ 25-akpm/include/linux/blkdev.h	2004-03-26 12:35:28.289920288 -0800
@@ -243,7 +243,7 @@ typedef int (merge_requests_fn) (request
 typedef void (request_fn_proc) (request_queue_t *q);
 typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
 typedef int (prep_rq_fn) (request_queue_t *, struct request *);
-typedef void (unplug_fn) (void *q);
+typedef void (unplug_fn) (request_queue_t *);
 
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
@@ -315,8 +315,6 @@ struct request_queue
 	unsigned long		bounce_pfn;
 	int			bounce_gfp;
 
-	struct list_head	plug_list;
-
 	/*
 	 * various queue flags, see QUEUE_* below
 	 */
@@ -370,8 +368,9 @@ struct request_queue
 #define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
+#define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 
-#define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
+#define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 
@@ -515,7 +514,7 @@ extern int scsi_cmd_ioctl(struct gendisk
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void __blk_stop_queue(request_queue_t *q);
-extern void blk_run_queue(request_queue_t *q);
+extern void blk_run_queue(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int);
@@ -526,6 +525,18 @@ static inline request_queue_t *bdev_get_
 	return bdev->bd_disk->queue;
 }
 
+static inline void blk_run_backing_dev(struct backing_dev_info *bdi)
+{
+	if (bdi && bdi->unplug_io_fn)
+		bdi->unplug_io_fn(bdi);
+}
+
+static inline void blk_run_address_space(struct address_space *mapping)
+{
+	if (mapping)
+		blk_run_backing_dev(mapping->backing_dev_info);
+}
+
 /*
  * end_request() and friends. Must be called with the request queue spinlock
  * acquired. All functions called within end_request() _must_be_ atomic.
@@ -572,7 +583,7 @@ extern struct backing_dev_info *blk_get_
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(void *);
+extern void generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(request_queue_t *);
diff -puN include/linux/fs.h~per-backing_dev-unplugging include/linux/fs.h
--- 25/include/linux/fs.h~per-backing_dev-unplugging	2004-03-26 12:35:28.225930016 -0800
+++ 25-akpm/include/linux/fs.h	2004-03-26 12:35:28.290920136 -0800
@@ -83,6 +83,8 @@ extern int leases_enable, dir_notify_ena
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
+#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
+#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff -puN include/linux/raid/md_k.h~per-backing_dev-unplugging include/linux/raid/md_k.h
--- 25/include/linux/raid/md_k.h~per-backing_dev-unplugging	2004-03-26 12:35:28.226929864 -0800
+++ 25-akpm/include/linux/raid/md_k.h	2004-03-26 12:35:28.291919984 -0800
@@ -326,7 +326,6 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
-		blk_run_queues();					\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -341,30 +340,5 @@ do {									\
 	__wait_event_lock_irq(wq, condition, lock);			\
 } while (0)
 
-
-#define __wait_disk_event(wq, condition) 				\
-do {									\
-	wait_queue_t __wait;						\
-	init_waitqueue_entry(&__wait, current);				\
-									\
-	add_wait_queue(&wq, &__wait);					\
-	for (;;) {							\
-		set_current_state(TASK_UNINTERRUPTIBLE);		\
-		if (condition)						\
-			break;						\
-		blk_run_queues();					\
-		schedule();						\
-	}								\
-	current->state = TASK_RUNNING;					\
-	remove_wait_queue(&wq, &__wait);				\
-} while (0)
-
-#define wait_disk_event(wq, condition) 					\
-do {									\
-	if (condition)	 						\
-		break;							\
-	__wait_disk_event(wq, condition);				\
-} while (0)
-
 #endif
 
diff -puN kernel/power/disk.c~per-backing_dev-unplugging kernel/power/disk.c
--- 25/kernel/power/disk.c~per-backing_dev-unplugging	2004-03-26 12:35:28.227929712 -0800
+++ 25-akpm/kernel/power/disk.c	2004-03-26 12:35:28.291919984 -0800
@@ -84,7 +84,6 @@ static void free_some_memory(void)
 	while (shrink_all_memory(10000))
 		printk(".");
 	printk("|\n");
-	blk_run_queues();
 }
 
 
diff -puN kernel/power/pmdisk.c~per-backing_dev-unplugging kernel/power/pmdisk.c
--- 25/kernel/power/pmdisk.c~per-backing_dev-unplugging	2004-03-26 12:35:28.229929408 -0800
+++ 25-akpm/kernel/power/pmdisk.c	2004-03-26 12:35:28.292919832 -0800
@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsi
 
 static void wait_io(void)
 {
-	blk_run_queues();
 	while(atomic_read(&io_done))
 		io_schedule();
 }
@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_o
 	if (rw == WRITE)
 		bio_set_pages_dirty(bio);
 	start_io();
-	submit_bio(rw,bio);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	wait_io();
  Done:
 	bio_put(bio);
diff -puN kernel/power/swsusp.c~per-backing_dev-unplugging kernel/power/swsusp.c
--- 25/kernel/power/swsusp.c~per-backing_dev-unplugging	2004-03-26 12:35:28.230929256 -0800
+++ 25-akpm/kernel/power/swsusp.c	2004-03-26 12:35:28.293919680 -0800
@@ -707,11 +707,6 @@ int software_suspend(void)
 
 		free_some_memory();
 		
-		/* No need to invalidate any vfsmnt list -- 
-		 * they will be valid after resume, anyway.
-		 */
-		blk_run_queues();
-
 		/* Save state of all device drivers, and stop them. */		   
 		if ((res = device_suspend(4))==0)
 			/* If stopping device drivers worked, we proceed basically into
diff -puN mm/mempool.c~per-backing_dev-unplugging mm/mempool.c
--- 25/mm/mempool.c~per-backing_dev-unplugging	2004-03-26 12:35:28.231929104 -0800
+++ 25-akpm/mm/mempool.c	2004-03-26 12:35:28.293919680 -0800
@@ -234,8 +234,6 @@ repeat_alloc:
 	if (!(gfp_mask & __GFP_WAIT))
 		return NULL;
 
-	blk_run_queues();
-
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	mb();
 	if (!pool->curr_nr)
diff -puN mm/readahead.c~per-backing_dev-unplugging mm/readahead.c
--- 25/mm/readahead.c~per-backing_dev-unplugging	2004-03-26 12:35:28.233928800 -0800
+++ 25-akpm/mm/readahead.c	2004-03-26 12:35:28.294919528 -0800
@@ -15,11 +15,16 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
+void default_unplug_io_fn(struct backing_dev_info *bdi)
+{
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+
 struct backing_dev_info default_backing_dev_info = {
 	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
 	.state		= 0,
+	.unplug_io_fn	= default_unplug_io_fn,
 };
-
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 /*
@@ -32,7 +37,6 @@ file_ra_state_init(struct file_ra_state 
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
 	ra->average = ra->ra_pages / 2;
 }
-
 EXPORT_SYMBOL(file_ra_state_init);
 
 /*
diff -puN mm/filemap.c~per-backing_dev-unplugging mm/filemap.c
--- 25/mm/filemap.c~per-backing_dev-unplugging	2004-03-26 12:35:28.234928648 -0800
+++ 25-akpm/mm/filemap.c	2004-03-26 12:35:28.295919376 -0800
@@ -118,8 +118,10 @@ void remove_from_page_cache(struct page 
 
 static inline int sync_page(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
+	smp_mb();
+	mapping = page->mapping;
 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 		return mapping->a_ops->sync_page(page);
 	return 0;
diff -puN include/linux/raid/md.h~per-backing_dev-unplugging include/linux/raid/md.h
--- 25/include/linux/raid/md.h~per-backing_dev-unplugging	2004-03-26 12:35:28.236928344 -0800
+++ 25-akpm/include/linux/raid/md.h	2004-03-26 12:35:28.296919224 -0800
@@ -76,6 +76,7 @@ extern void md_handle_safemode(mddev_t *
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
+extern void md_unplug_mddev(mddev_t *mddev);
 
 extern void md_print_devices (void);
 
diff -puN include/linux/swap.h~per-backing_dev-unplugging include/linux/swap.h
--- 25/include/linux/swap.h~per-backing_dev-unplugging	2004-03-26 12:35:28.237928192 -0800
+++ 25-akpm/include/linux/swap.h	2004-03-26 12:35:28.296919224 -0800
@@ -232,6 +232,8 @@ extern sector_t map_swap_page(struct swa
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+struct backing_dev_info;
+extern void swap_unplug_io_fn(struct backing_dev_info *);
 
 extern struct swap_list_t swap_list;
 extern spinlock_t swaplock;
diff -puN mm/swapfile.c~per-backing_dev-unplugging mm/swapfile.c
--- 25/mm/swapfile.c~per-backing_dev-unplugging	2004-03-26 12:35:28.239927888 -0800
+++ 25-akpm/mm/swapfile.c	2004-03-26 12:35:28.298918920 -0800
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/rmap-locking.h>
 #include <linux/security.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -44,8 +45,64 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
+/*
+ * Array of backing blockdevs, for swap_unplug_fn.  We need this because the
+ * bdev->unplug_fn can sleep and we cannot hold swap_list_lock while calling
+ * the unplug_fn.  And swap_list_lock cannot be turned into a semaphore.
+ */
+static DECLARE_MUTEX(swap_bdevs_sem);
+static struct block_device *swap_bdevs[MAX_SWAPFILES];
+
 #define SWAPFILE_CLUSTER 256
 
+/*
+ * Caller holds swap_bdevs_sem
+ */
+static void install_swap_bdev(struct block_device *bdev)
+{
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (swap_bdevs[i] == NULL) {
+			swap_bdevs[i] = bdev;
+			return;
+		}
+	}
+	BUG();
+}
+
+static void remove_swap_bdev(struct block_device *bdev)
+{
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (swap_bdevs[i] == bdev) {
+			memcpy(&swap_bdevs[i], &swap_bdevs[i + 1],
+				(MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs));
+			swap_bdevs[MAX_SWAPFILES - 1] = NULL;
+			return;
+		}
+	}
+	BUG();
+}
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
+{
+	int i;
+
+	down(&swap_bdevs_sem);
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		struct block_device *bdev = swap_bdevs[i];
+		struct backing_dev_info *bdi;
+
+		if (bdev == NULL)
+			break;
+		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
+		(*bdi->unplug_io_fn)(bdi);
+	}
+	up(&swap_bdevs_sem);
+}
+
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
 	unsigned long offset;
@@ -1088,6 +1145,7 @@ asmlinkage long sys_swapoff(const char _
 		swap_list_unlock();
 		goto out_dput;
 	}
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	swap_file = p->swap_file;
@@ -1099,6 +1157,8 @@ asmlinkage long sys_swapoff(const char _
 	destroy_swap_extents(p);
 	swap_device_unlock(p);
 	swap_list_unlock();
+	remove_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	vfree(swap_map);
 	if (S_ISBLK(mapping->host->i_mode)) {
 		struct block_device *bdev = I_BDEV(mapping->host);
@@ -1414,6 +1474,7 @@ asmlinkage long sys_swapon(const char __
 	if (error)
 		goto bad_swap;
 
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	p->flags = SWP_ACTIVE;
@@ -1439,6 +1500,8 @@ asmlinkage long sys_swapon(const char __
 	}
 	swap_device_unlock(p);
 	swap_list_unlock();
+	install_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	error = 0;
 	goto out;
 bad_swap:
@@ -1458,7 +1521,7 @@ bad_swap_2:
 	destroy_swap_extents(p);
 	if (swap_map)
 		vfree(swap_map);
-	if (swap_file && !IS_ERR(swap_file))
+	if (swap_file)
 		filp_close(swap_file, NULL);
 out:
 	if (page && !IS_ERR(page)) {
diff -puN mm/swap_state.c~per-backing_dev-unplugging mm/swap_state.c
--- 25/mm/swap_state.c~per-backing_dev-unplugging	2004-03-26 12:35:28.240927736 -0800
+++ 25-akpm/mm/swap_state.c	2004-03-26 12:35:28.298918920 -0800
@@ -19,6 +19,7 @@
 static struct backing_dev_info swap_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
 extern struct address_space_operations swap_aops;
diff -puN drivers/block/rd.c~per-backing_dev-unplugging drivers/block/rd.c
--- 25/drivers/block/rd.c~per-backing_dev-unplugging	2004-03-26 12:35:28.241927584 -0800
+++ 25-akpm/drivers/block/rd.c	2004-03-26 12:35:28.299918768 -0800
@@ -271,6 +271,7 @@ static int rd_ioctl(struct inode *inode,
 static struct backing_dev_info rd_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 static int rd_open(struct inode *inode, struct file *filp)
diff -puN mm/shmem.c~per-backing_dev-unplugging mm/shmem.c
--- 25/mm/shmem.c~per-backing_dev-unplugging	2004-03-26 12:35:28.243927280 -0800
+++ 25-akpm/mm/shmem.c	2004-03-26 12:35:28.300918616 -0800
@@ -133,6 +133,7 @@ static struct vm_operations_struct shmem
 static struct backing_dev_info shmem_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 LIST_HEAD(shmem_inodes);

_