kernel-2.6.18-194.11.1.el5.src.rpm

From: Peter Staubach <staubach@redhat.com>
Date: Mon, 31 Aug 2009 13:58:46 -0400
Subject: [nfs] r/w I/O perf degraded by FLUSH_STABLE page flush
Message-id: 4A9C0F56.7090401@redhat.com
O-Subject: [PATCH RHEL-5.5] BZ498433 Read/Write NFS I/O performance degraded by FLUSH_STABLE page flushing
Bugzilla: 498433
RH-Acked-by: Jeff Layton <jlayton@redhat.com>
RH-Acked-by: Steve Dickson <SteveD@redhat.com>

Hi.

Attached is a patch to address bz498433, "Read/Write NFS I/O
performance degraded by FLUSH_STABLE page flushing".  This bugzilla
describes a performance issue when reading and writing files using
small system call sizes.

The issue is that the NFS client currently implements something
akin to modify-write-read when updating a file.  When a write(2)
occurs first to a page, an empty page is allocated, the contents
of the write system call are transferred to the page, and the
dirty region is tracked.  If a read(2) is then issued for something
on the page, the contents of the page must be written out before
the page can be read in from the server.  This writing of the data
must be done synchronously and must flush the data all of the way
to stable storage on the server.  This data flushing tends to
happen in very small chunks, so it is very slow.
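
To make that concrete, here is a minimal userspace sketch of the
kind of workload involved: small write(2) calls that partially fill
a page, each followed by a read(2) from the same page.  The path
/mnt/nfs/testfile, the 512-byte transfer size, and the 4096-byte
page size are illustrative assumptions, not part of the bugzilla.

/* Illustrative only: small, interleaved write(2)/read(2) calls on a
 * file in an NFS mount.  Each iteration partially fills one page and
 * then reads another part of the same page. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char wbuf[512], rbuf[512];
	int fd, i;

	memset(wbuf, 'x', sizeof(wbuf));
	fd = open("/mnt/nfs/testfile", O_CREAT | O_RDWR, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (i = 0; i < 1024; i++) {
		/* Partial write dirties part of a freshly allocated page... */
		if (pwrite(fd, wbuf, sizeof(wbuf), (off_t)i * 4096) < 0)
			perror("pwrite");
		/* ...and, on an unpatched client, reading the rest of that
		 * page forces the dirty data to be flushed synchronously to
		 * the server before the page can be filled. */
		if (pread(fd, rbuf, sizeof(rbuf), (off_t)i * 4096 + 2048) < 0)
			perror("pread");
	}
	close(fd);
	return 0;
}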

The usual mechanism for handling situations like this is read-
modify-write.  When the write(2) occurs, the current contents of
the page are read in from the server and then the contents of the
write system call are copied to the page.  When the read(2) is
issued, the page contents are already available, with no need to
flush the modified data to the server.

This patch has been accepted upstream and tested by the Clearcase
guys.  It was tested by building RHEL-5 kernel RPMs and counting
the number of specific NFS protocol operations generated.
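
For anyone who wants to repeat that kind of measurement, one
client-side approximation is to snapshot the raw RPC operation
counters that nfsstat(8) reports, before and after running the
workload.  A minimal sketch, assuming a Linux client that exposes
the counters in /proc/net/rpc/nfs; diffing two dumps shows how many
WRITE (and READ) calls the workload generated:

/* Sketch: dump the client-side RPC op counters (the data behind
 * "nfsstat -c") so before/after snapshots can be compared by hand.
 * Assumes a Linux client exposing /proc/net/rpc/nfs. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *fp = fopen("/proc/net/rpc/nfs", "r");

	if (!fp) {
		perror("/proc/net/rpc/nfs");
		return 1;
	}
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);
	fclose(fp);
	return 0;
}

With the patch applied, the per-page synchronous WRITE calls from
the flush-before-read path should largely disappear from the delta.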

	Thanx...

		ps

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a63e4ee..35a6c76 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -296,6 +296,42 @@ nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 }
 
 /*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * than a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer.  In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+					loff_t pos, unsigned len)
+{
+	unsigned int pglen = nfs_page_length(page->mapping->host, page);
+	unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int end = offset + len;
+
+	if ((file->f_mode & FMODE_READ) &&	/* open for read? */
+	    !PageUptodate(page) &&		/* Uptodate? */
+	    !PagePrivate(page) &&		/* i/o request already? */
+	    pglen &&				/* valid bytes of file? */
+	    (end < pglen || offset))		/* replace all valid bytes? */
+		return 1;
+	return 0;
+}
+
+/*
  * This does the "real" work of the write. We must allocate and lock the
  * page to be sent back to the generic routine, which then copies the
  * data from user space.
@@ -308,10 +344,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	int ret;
-	pgoff_t index;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct page *page;
-	index = pos >> PAGE_CACHE_SHIFT;
+	int once_thru = 0;
 
+start:
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
@@ -321,6 +358,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 	if (ret) {
 		unlock_page(page);
 		page_cache_release(page);
+	} else if (!once_thru &&
+		   nfs_want_read_modify_write(file, page, pos, len)) {
+		once_thru = 1;
+		ret = nfs_readpage(file, page);
+		page_cache_release(page);
+		if (!ret)
+			goto start;
 	}
 	return ret;
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 10d477e..83f8fc2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -186,6 +186,7 @@ extern int nfs4_path_walk(struct nfs_server *server,
 
 /* read.c */
 extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *);
+unsigned int nfs_page_length(struct inode *, struct page *);
 
 /*
  * Determine the device name as a string
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8b14dfb..f34a4d5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -85,7 +85,6 @@ void nfs_readdata_release(void *data)
         nfs_readdata_free(data);
 }
 
-static
 unsigned int nfs_page_length(struct inode *inode, struct page *page)
 {
 	loff_t i_size = i_size_read(inode);
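
As a worked example of the new heuristic, the offset arithmetic in
nfs_want_read_modify_write() can be restated in userspace.  This
sketch drops the FMODE_READ, PageUptodate, and PagePrivate checks
and substitutes a hypothetical 4096-byte constant for
PAGE_CACHE_SIZE; it only illustrates which (pos, len, pglen)
combinations would trigger the extra read:

/* Userspace re-statement of the offset checks in
 * nfs_want_read_modify_write().  PAGE_SIZE_ASSUMED stands in for
 * PAGE_CACHE_SIZE; pglen is the count of valid bytes in the page. */
#include <stdio.h>

#define PAGE_SIZE_ASSUMED 4096u

static int want_read(unsigned int pglen, unsigned long long pos,
		     unsigned int len)
{
	unsigned int offset = pos & (PAGE_SIZE_ASSUMED - 1);
	unsigned int end = offset + len;

	/* Read first only if the page has valid bytes and the write
	 * does not overwrite all of them. */
	return pglen && (end < pglen || offset);
}

int main(void)
{
	/* Full-page overwrite of a full page: no read needed. */
	printf("%d\n", want_read(4096, 0, 4096));	/* prints 0 */
	/* 512-byte write at the start of a full page: read first. */
	printf("%d\n", want_read(4096, 0, 512));	/* prints 1 */
	/* Write at offset 2048 of a full page: read first. */
	printf("%d\n", want_read(4096, 2048, 1024));	/* prints 1 */
	/* Write covering every valid byte of a short page: no read. */
	printf("%d\n", want_read(100, 0, 512));		/* prints 0 */
	return 0;
}

The "end < pglen || offset" test is the "replace all valid bytes?"
check from the patch: when a write covers every valid byte of the
page, reading the old contents from the server would be wasted work.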