If pdflush hits a locked-and-clean buffer in __block_write_full_page(), it
will simply pass over that buffer.  Typically the buffer is an ext3
data=ordered buffer which is being written out by kjournald, but the same
thing can happen with blockdev buffers and ll_rw_block().
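
For reference, the buffer walk in __block_write_full_page() currently handles
a buffer it cannot lock roughly like this (a lightly annotated view of the
code removed by the first fs/buffer.c hunk below; all identifiers are from
that hunk):

	if (wbc->sync_mode != WB_SYNC_NONE) {
		lock_buffer(bh);
	} else {
		if (test_set_buffer_locked(bh)) {
			/* Locked by someone else, e.g. kjournald */
			if (buffer_dirty(bh))
				__set_page_dirty_nobuffers(page);
			/* A locked-and-clean buffer is skipped silently */
			continue;
		}
	}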

This is bad because the buffer is still under I/O, and a subsequent fsync's
fdatawait() needs to know about that I/O so that it can wait for it to
complete.

It is not practical to tag the page for writeback: only the submitter of the
I/O can do that, because only the submitter has control of the end_io handler
which eventually clears the writeback flag.

So instead, redirty the page: a subsequent fsync's fdatawrite() will then
revisit the page and, because it is a blocking writeback, will wait in
lock_buffer() for the underway I/O to complete.
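
In code, the change amounts to the following (a condensed view of the first
fs/buffer.c hunk below): only a fully non-blocking writer (pdflush, kswapd)
takes the redirty-and-skip path; any blocking writer, such as fsync's
fdatawrite(), falls into lock_buffer() and therefore waits for the in-flight
I/O.

	if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
		/* Blocking writeback: wait for the I/O which is underway */
		lock_buffer(bh);
	} else if (test_set_buffer_locked(bh)) {
		/* Non-blocking writeback: redirty the page and move on */
		__set_page_dirty_nobuffers(page);
		continue;
	}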

There is a risk that pdflush::background_writeout() will livelock, repeatedly
trying and failing to write the same page.  This is prevented by ensuring that
background_writeout() always throttles in blk_congestion_wait() when it has
made no progress.
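
Concretely, the throttling in background_writeout() ends up looking like this
(condensed from the mm/page-writeback.c hunk below):

	wbc.pages_skipped = 0;
	writeback_inodes(&wbc);
	min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
	if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
		/* Wrote less than expected: throttle in the block layer */
		blk_congestion_wait(WRITE, HZ/10);
		if (!wbc.encountered_congestion)
			break;		/* No congestion: give up for now */
	}

sync_sb_inodes() also watches wbc->pages_skipped and moves an inode whose
pages were skipped back onto sb->s_dirty, so the same locked buffers are not
immediately revisited.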



---

 25-akpm/fs/buffer.c               |   19 ++++++++++++-------
 25-akpm/fs/fs-writeback.c         |    9 +++++++++
 25-akpm/include/linux/writeback.h |    1 +
 25-akpm/mm/page-writeback.c       |    8 ++++----
 4 files changed, 26 insertions(+), 11 deletions(-)

diff -puN fs/buffer.c~block_write_full_page-redirty fs/buffer.c
--- 25/fs/buffer.c~block_write_full_page-redirty	2004-03-26 12:36:47.021951200 -0800
+++ 25-akpm/fs/buffer.c	2004-03-26 12:36:47.030949832 -0800
@@ -1807,14 +1807,18 @@ static int __block_write_full_page(struc
 		get_bh(bh);
 		if (!buffer_mapped(bh))
 			continue;
-		if (wbc->sync_mode != WB_SYNC_NONE) {
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
-		} else {
-			if (test_set_buffer_locked(bh)) {
-				if (buffer_dirty(bh))
-					__set_page_dirty_nobuffers(page);
-				continue;
-			}
+		} else if (test_set_buffer_locked(bh)) {
+			__set_page_dirty_nobuffers(page);
+			continue;
 		}
 		if (test_clear_buffer_dirty(bh)) {
 			if (!buffer_uptodate(bh))
@@ -1862,6 +1866,7 @@ done:
 		if (uptodate)
 			SetPageUptodate(page);
 		end_page_writeback(page);
+		wbc->pages_skipped++;	/* We didn't write this page */
 	}
 	return err;
 
diff -puN fs/fs-writeback.c~block_write_full_page-redirty fs/fs-writeback.c
--- 25/fs/fs-writeback.c~block_write_full_page-redirty	2004-03-26 12:36:47.023950896 -0800
+++ 25-akpm/fs/fs-writeback.c	2004-03-26 12:36:47.030949832 -0800
@@ -279,6 +279,7 @@ sync_sb_inodes(struct super_block *sb, s
 						struct inode, i_list);
 		struct address_space *mapping = inode->i_mapping;
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
+		long pages_skipped;
 
 		if (bdi->memory_backed) {
 			if (sb == blockdev_superblock) {
@@ -326,6 +327,7 @@ sync_sb_inodes(struct super_block *sb, s
 
 		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
+		pages_skipped = wbc->pages_skipped;
 		__writeback_single_inode(inode, wbc);
 		if (wbc->sync_mode == WB_SYNC_HOLD) {
 			inode->dirtied_when = jiffies;
@@ -333,6 +335,13 @@ sync_sb_inodes(struct super_block *sb, s
 		}
 		if (current_is_pdflush())
 			writeback_release(bdi);
+		if (wbc->pages_skipped != pages_skipped) {
+			/*
+			 * writeback is not making progress due to locked
+			 * buffers.  Skip this inode for now.
+			 */
+			list_move(&inode->i_list, &sb->s_dirty);
+		}
 		spin_unlock(&inode_lock);
 		iput(inode);
 		spin_lock(&inode_lock);
diff -puN include/linux/writeback.h~block_write_full_page-redirty include/linux/writeback.h
--- 25/include/linux/writeback.h~block_write_full_page-redirty	2004-03-26 12:36:47.024950744 -0800
+++ 25-akpm/include/linux/writeback.h	2004-03-26 12:36:47.031949680 -0800
@@ -39,6 +39,7 @@ struct writeback_control {
 					   older than this */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
+	long pages_skipped;		/* Pages which were not written */
 	int nonblocking;		/* Don't get stuck on request queues */
 	int encountered_congestion;	/* An output: a queue is full */
 	int for_kupdate;		/* A kupdate writeback */
diff -puN mm/page-writeback.c~block_write_full_page-redirty mm/page-writeback.c
--- 25/mm/page-writeback.c~block_write_full_page-redirty	2004-03-26 12:36:47.026950440 -0800
+++ 25-akpm/mm/page-writeback.c	2004-03-26 12:36:47.032949528 -0800
@@ -261,13 +261,13 @@ static void background_writeout(unsigned
 			break;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
 		writeback_inodes(&wbc);
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0) {
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			if (wbc.encountered_congestion)
-				blk_congestion_wait(WRITE, HZ/10);
-			else
+			blk_congestion_wait(WRITE, HZ/10);
+			if (!wbc.encountered_congestion)
 				break;
 		}
 	}

_