fs/nfs/direct.c |  455 +++++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 320 insertions(+), 135 deletions(-)

diff -puN fs/nfs/direct.c~25-odirect fs/nfs/direct.c
--- 25/fs/nfs/direct.c~25-odirect	2003-10-02 00:48:14.000000000 -0700
+++ 25-akpm/fs/nfs/direct.c	2003-10-02 00:48:14.000000000 -0700
@@ -1,7 +1,7 @@
 /*
  * linux/fs/nfs/direct.c
  *
- * Copyright (C) 2001 by Chuck Lever <cel@netapp.com>
+ * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
  *
  * High-performance uncached I/O for the Linux NFS client
  *
@@ -26,19 +26,23 @@
  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
  * an undocumented mount option.
  *
- * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
+ * help from Andrew Morton.
  *
  * 18 Dec 2001	Initial implementation for 2.4  --cel
  * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
- * 24 Sep 2002	Rewrite to use asynchronous RPCs, port to 2.5  --cel
+ * 08 Jun 2003	Port to 2.5 APIs  --cel
  *
  */
 
 #include <linux/config.h>
+#include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/smp_lock.h>
 #include <linux/file.h>
-#include <linux/errno.h>
+#include <linux/pagemap.h>
+
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/sunrpc/clnt.h>
@@ -46,35 +50,41 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
-#define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)
+#define NFSDBG_FACILITY		NFSDBG_VFS
 #define VERF_SIZE		(2 * sizeof(__u32))
+#define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)	/* 4096 pages */
 
 
 /**
- * nfs_get_user_pages - find and set up page representing user buffer
- * addr: user-space address of target buffer
- * size: total size in bytes of target buffer
- * @pages: returned array of page struct pointers underlying target buffer
- * write: whether or not buffer is target of a write operation
+ * nfs_get_user_pages - find and set up pages underlying user's buffer
+ * rw: direction (read or write)
+ * user_addr: starting address of this segment of user's buffer
+ * size: size in bytes of this segment
+ * @pages: returned page pointer array; caller kfree()s it, even on error
  */
 static inline int
-nfs_get_user_pages(unsigned long addr, size_t size,
-		struct page ***pages, int rw)
+nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
+		struct page ***pages)
 {
 	int result = -ENOMEM;
-	unsigned page_count = (unsigned) size >> PAGE_SHIFT;
-	unsigned array_size = (page_count * sizeof(struct page *)) + 2U;
+	unsigned long page_count;
+	size_t array_size;
+
+	*pages = NULL;	/* callers kfree() this even when we fail */
+	if (size > MAX_DIRECTIO_SIZE)	/* arbitrary cap: avoids overflow */
+		return -EFBIG;
 
-	*pages = (struct page **) kmalloc(array_size, GFP_KERNEL);
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+
+	array_size = (page_count * sizeof(struct page *));
+	*pages = kmalloc(array_size, GFP_KERNEL);
 	if (*pages) {
 		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, addr,
-					page_count, (rw == WRITE), 0,
+		result = get_user_pages(current, current->mm, user_addr,
+					page_count, (rw == READ), 0,
 					*pages, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (result < 0)
-			printk(KERN_ERR "%s: get_user_pages result %d\n",
-					__FUNCTION__, result);
 	}
 	return result;
 }
@@ -84,174 +94,349 @@ nfs_get_user_pages(unsigned long addr, s
  * @pages: array of page struct pointers underlying target buffer
  */
 static inline void
-nfs_free_user_pages(struct page **pages, unsigned count)
+nfs_free_user_pages(struct page **pages)
 {
-	unsigned page = 0;
+	kfree(pages); /* NOTE(review): page refs are not dropped here */
+}
 
-	while (count--)
-		page_cache_release(pages[page++]);
+/**
+ * nfs_direct_read_seg - Read in one iov segment, one RPC per "rsize" bytes.
+ * @inode: target inode
+ * @cred: user's credential
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array (not consulted by this function)
+ *
+ * Returns bytes read, or the read RPC's errno (-EISDIR becomes -EINVAL).
+ */
+static int
+nfs_direct_read_seg(struct inode *inode, struct rpc_cred *cred,
+		unsigned long user_addr, size_t count, loff_t file_offset,
+		struct page **pages, int nr_pages)
+{
+	const unsigned int rsize = NFS_SERVER(inode)->rsize;
+	int tot_bytes = 0;
+	int curpage = 0;
+	struct nfs_read_data	rdata = {
+		.flags		= 0,
+		.cred		= cred,
+		.inode		= inode,
+		.args		= {
+			.fh		= NFS_FH(inode),
+		},
+		.res		= {
+			.fattr		= &rdata.fattr,
+		},
+	};
+
+	do {
+		int request, result;
+
+		request = count;
+		if (count > rsize)
+			request = rsize;
+		rdata.args.count = request;
+		rdata.args.pgbase = user_addr & ~PAGE_MASK;
+		rdata.args.offset = file_offset;
+		rdata.args.pages = &pages[curpage];
+
+		dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+			rdata.args.count, (long long) rdata.args.offset,
+			user_addr, rdata.args.pgbase, curpage);
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->read(&rdata);
+		unlock_kernel();
+
+		if (result < 0) {
+			if (result == -EISDIR)
+				result = -EINVAL;
+			return result;
+		}
 
-	kfree(pages);
+		tot_bytes += result;
+		count -= result;
+		file_offset += result;
+		user_addr += result;
+
+		if (rdata.res.eof)
+			break;
+
+		curpage += (rdata.args.pgbase + result) >> PAGE_SHIFT;
+	} while (count);
+
+	/* XXX: should we zero the rest of the user's buffer if we hit eof? */
+
+	return tot_bytes;
 }
 
 /**
- * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_read - For each iov segment, map the user's buffer
+ *                   then generate read RPCs.
+ * @inode: target inode
+ * @cred: user's credential
  * @iov: array of vectors that define I/O buffer
- * offset: where in file to begin the read
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
- * @requests: append new page requests to this list head
+ *
+ * generic_file_direct_IO has already pushed out any non-direct
+ * writes so that this read will see them when we read from the
+ * server.
  */
 static int
-nfs_iov2pagelist(int rw, const struct inode *inode,
-		const struct rpc_cred *cred,
-		const struct iovec *iov, loff_t offset,
-		unsigned long nr_segs, struct list_head *requests)
+nfs_direct_read(struct inode *inode, struct rpc_cred *cred,
+		const struct iovec *iov, loff_t file_offset,
+		unsigned long nr_segs)
 {
-	unsigned seg;
 	int tot_bytes = 0;
-	struct page **pages;
+	unsigned long seg = 0;
 
-	/* for each iovec in the array... */
-	for (seg = 0; seg < nr_segs; seg++) {
-		const unsigned long user_addr =
-					(unsigned long) iov[seg].iov_base;
-		size_t bytes = iov[seg].iov_len;
-		unsigned int pg_offset = (user_addr & ~PAGE_MASK);
-		int page_count, page = 0;
-
-		page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
-		if (page_count < 0) {
-			return page_count;
+	while ((seg < nr_segs) && (tot_bytes >= 0)) {
+		int result, page_count;
+		struct page **pages = NULL;	/* kfree()d on all paths */
+		const struct iovec *vec = &iov[seg++];
+		unsigned long user_addr = (unsigned long) vec->iov_base;
+		size_t size = vec->iov_len;
+
+		page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
+		if (page_count < 0) {
+			nfs_free_user_pages(pages);
+			return page_count;
+		}
+
+		result = nfs_direct_read_seg(inode, cred, user_addr, size,
+				file_offset, pages, page_count);
+		if (result < 0)
+			tot_bytes = result;
+		else {
+			tot_bytes += result;
+			file_offset += result;
 		}
 
-		/* ...build as many page requests as required */
-		while (bytes > 0) {
-			struct nfs_page *new;
-			const unsigned int pg_bytes = (bytes > PAGE_SIZE) ?
-							PAGE_SIZE : bytes;
-
-			new = nfs_create_request((struct rpc_cred *) cred,
-						 (struct inode *) inode,
-						 pages[page],
-						 pg_offset, pg_bytes);
-			if (IS_ERR(new)) {
-				nfs_free_user_pages(pages, page_count);
-				nfs_release_list(requests);
-				return PTR_ERR(new);
-			}
-			new->wb_index = offset;
-			nfs_list_add_request(new, requests);
-
-			/* after the first page */
-			pg_offset = 0;
-			offset += PAGE_SIZE;
-			tot_bytes += pg_bytes;
-			bytes -= pg_bytes;
-			page++;
+		nfs_free_user_pages(pages);
+	}
+
+	return tot_bytes;
+}
+
+/**
+ * nfs_direct_write_seg - Write out one iov segment.  Generate separate
+ *                        write RPCs for each "wsize" bytes, then commit.
+ * @inode: target inode
+ * @cred: user's credential
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+static int
+nfs_direct_write_seg(struct inode *inode, struct rpc_cred *cred,
+		unsigned long user_addr, size_t count, loff_t file_offset,
+		struct page **pages, int nr_pages)
+{
+	const unsigned int wsize = NFS_SERVER(inode)->wsize;
+	loff_t save_offset = file_offset;
+	int need_commit = 0;
+	int tot_bytes = 0;
+	int curpage = 0;
+	struct nfs_writeverf first_verf;
+	struct nfs_write_data	wdata = {
+		.cred		= cred,
+		.inode		= inode,
+		.args		= {
+			.fh		= NFS_FH(inode),
+		},
+		.res		= {
+			.fattr		= &wdata.fattr,
+			.verf		= &wdata.verf,
+		},
+	};
+
+	wdata.args.stable = NFS_UNSTABLE;
+	if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
+		wdata.args.stable = NFS_FILE_SYNC;
+
+retry:
+	do {
+		int request, result;
+
+		request = count;
+		if (count > wsize)
+			request = wsize;
+		wdata.args.count = request;
+		wdata.args.pgbase = user_addr & ~PAGE_MASK;
+		wdata.args.offset = file_offset;
+		wdata.args.pages = &pages[curpage];
+
+		dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+			wdata.args.count, (long long) wdata.args.offset,
+			user_addr, wdata.args.pgbase, curpage);
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->write(&wdata);
+		unlock_kernel();
+
+		if (result < 0)
+			return result;
+
+		if (!tot_bytes)
+			memcpy(&first_verf.verifier, &wdata.verf.verifier,
+								VERF_SIZE);
+		if (wdata.verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier,
+					&wdata.verf.verifier, VERF_SIZE))
+				goto sync_retry;
 		}
 
-		/* don't release pages here -- I/O completion will do that */
-		nfs_free_user_pages(pages, 0);
+		tot_bytes += result;
+		count -= result;
+		file_offset += result;
+		user_addr += result;
+
+		curpage += (wdata.args.pgbase + result) >> PAGE_SHIFT;
+	} while (count);
+
+	/* Commit whatever the loop above wrote unstably */
+	if (need_commit) {
+		int result;
+
+		wdata.args.count = tot_bytes;
+		wdata.args.offset = save_offset;
+
+		lock_kernel();
+		result = NFS_PROTO(inode)->commit(&wdata);
+		unlock_kernel();
+
+		if (result < 0 || memcmp(&first_verf.verifier,
+					&wdata.verf.verifier, VERF_SIZE))
+			goto sync_retry;
 	}
 
 	return tot_bytes;
+
+sync_retry:
+	/* Rewind all loop state and resend the segment with stable writes */
+	wdata.args.stable = NFS_FILE_SYNC;
+	user_addr -= tot_bytes;
+	count += tot_bytes;
+	file_offset = save_offset;
+	tot_bytes = 0;
+	curpage = 0;
+	need_commit = 0;
+	goto retry;
 }
 
 /**
- * do_nfs_direct_IO - Read or write data without caching
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_write - For each iov segment, map the user's buffer
+ *                    then generate write and commit RPCs.
+ * @inode: target inode
+ * @cred: user's credential
  * @iov: array of vectors that define I/O buffer
- * offset: where in file to begin the read
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
  *
- * Break the passed-in iovec into a series of page-sized or smaller
- * requests, where each page is mapped for direct user-land I/O.
- *
- * For each of these pages, create an NFS page request and
- * append it to an automatic list of page requests.
- *
- * When all page requests have been queued, start the I/O on the
- * whole list.  The underlying routines coalesce the pages on the
- * list into a bunch of asynchronous "r/wsize" network requests.
- *
- * I/O completion automatically unmaps and releases the pages.
+ * Upon return, generic_file_direct_IO invalidates any cached pages
+ * that non-direct readers might access, so they will pick up these
+ * writes immediately.
  */
 static int
-do_nfs_direct_IO(int rw, const struct inode *inode,
-		const struct rpc_cred *cred, const struct iovec *iov,
-		loff_t offset, unsigned long nr_segs)
+nfs_direct_write(struct inode *inode, struct rpc_cred *cred,
+		const struct iovec *iov, loff_t file_offset,
+		unsigned long nr_segs)
 {
-	LIST_HEAD(requests);
-	int result, tot_bytes;
+	int tot_bytes = 0;
+	unsigned long seg = 0;
 
-	result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs,
-								&requests);
-	if (result < 0)
-		return result;
-	tot_bytes = result;
+	while ((seg < nr_segs) && (tot_bytes >= 0)) {
+		int result, page_count;
+		struct page **pages = NULL;	/* kfree()d on all paths */
+		const struct iovec *vec = &iov[seg++];
+		unsigned long user_addr = (unsigned long) vec->iov_base;
+		size_t size = vec->iov_len;
+
+		page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
+		if (page_count < 0) {
+			nfs_free_user_pages(pages);
+			return page_count;
+		}
 
-	switch (rw) {
-	case READ:
-		if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
-			result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs);
-			break;
+		result = nfs_direct_write_seg(inode, cred, user_addr, size,
+				file_offset, pages, page_count);
+		if (result < 0)
+			tot_bytes = result;
+		else {
+			tot_bytes += result;
+			file_offset += result;
 		}
-		result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
-		break;
-	case WRITE:
-		if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE))
-			result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs);
-		else
-			result = nfs_flush_list(&requests,
-					NFS_SERVER(inode)->wpages, FLUSH_WAIT);
 
-		/* invalidate cache so non-direct readers pick up changes */
-		invalidate_inode_pages((struct inode *) inode);
-		break;
-	default:
-		result = -EINVAL;
-		break;
+		nfs_free_user_pages(pages);
 	}
 
-	if (result < 0)
-		return result;
 	return tot_bytes;
 }
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * rw: direction (read or write)
- * @file: file struct of target file
+ * @iocb: target I/O control block
  * @iov: array of vectors that define I/O buffer
- * offset: offset in file to begin the operation
+ * file_offset: offset in file to begin the operation
  * nr_segs: size of iovec array
  *
+ * Usually a file system implements direct I/O by calling out to
+ * blockdev_direct_IO.  The NFS client doesn't have a backing block
+ * device, so we do everything by hand instead.
+ *
  * The inode's i_sem is no longer held by the VFS layer before it calls
  * this function to do a write.
  */
 int
 nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-		loff_t offset, unsigned long nr_segs)
+		loff_t file_offset, unsigned long nr_segs)
 {
-	/* None of this works yet, so prevent it from compiling. */
-#if 0
-	int result;
+	int result = -EINVAL;	/* kept for async iocbs and unknown rw */
+	struct file *file = iocb->ki_filp;
 	struct dentry *dentry = file->f_dentry;
-	const struct inode *inode = dentry->d_inode->i_mapping->host;
-	const struct rpc_cred *cred = nfs_file_cred(file);
-#endif
-
-	dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
-				((rw == READ) ? "READ" : "WRITE"),
-				dentry->d_parent->d_name.name,
-				dentry->d_name.name, offset, nr_segs);
+	struct inode *inode = dentry->d_inode;
+	struct rpc_cred *cred;
+
+	/*
+	 * No support for async yet
+	 */
+	if (!is_sync_kiocb(iocb))
+		goto out;
+
+	cred = get_rpccred(nfs_file_cred(file));
+	if (!cred)	/* nothing from the file: try the inode's mm_cred */
+		cred = get_rpccred(NFS_I(inode)->mm_cred);
+
+	switch (rw) {
+	case READ:
+		dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
+				dentry->d_name.name, file_offset, nr_segs);
+
+		result = nfs_direct_read(inode, cred, iov,
+						file_offset, nr_segs);
+		break;
+	case WRITE:
+		dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
+				dentry->d_name.name, file_offset, nr_segs);
 
-	result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs);
+		result = nfs_direct_write(inode, cred, iov,
+						file_offset, nr_segs);
+		break;
+	default:
+		break;
+	}
 
-	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
+	if (cred)	/* pairs with the get_rpccred() above */
+		put_rpccred(cred);
 
+out:
+	dprintk("NFS: direct_IO result=%d\n", result);
 	return result;
 }

_