Sophie: kernel-2.6.18-238.el5 src

kernel-2.6.18-238.el5.src.rpm

From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 15 Sep 2008 19:29:44 -0400
Subject: [gfs2] glock deadlock in page fault path
Message-id: 1389207955.365281221521384635.JavaMail.root@zmail02.collab.prod.int.phx2.redhat.com
O-Subject: Re: [RHEL5.3 Patch] bz 458684 - GFS2: glock deadlock in page fault path
Bugzilla: 458684
RH-Acked-by: Steven Whitehouse <swhiteho@redhat.com>

Hi,

Here is a respin of my patch for GFS2 bug 458684.  The only difference
between this patch and my previous is that function upstream function
iov_iter_copy_from_user_atomic was moved into mm/filemap.c (where you'll
find it in the upstream kernel), and the iov_iter structure declaration
was moved to fs.h.

I was hoping to contain all the changes to GFS2 to minimize risk, but
I can't seem to find any other way to resolve the dependency issue Don
pointed out.  At least this minimizes the impact since the new function
is not called outside of GFS2.

I compiled this for x86_64 and i686 and re-ran the cluster "coherency"
test that gives it a good workout on x86_64.  All tests were successful.

Note that this will require a change to the whitelist.

Regards,

Bob Peterson
Red Hat Clustering & GFS
--

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 5c8bac7..db147e3 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -36,6 +36,27 @@
 #include "util.h"
 #include "glops.h"
 
+static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
+
+static void fastcall gfs2_lru_cache_add(struct page *page)
+{
+        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+
+        page_cache_get(page);
+        if (!pagevec_add(pvec, page))
+                __pagevec_lru_add(pvec);
+        put_cpu_var(lru_add_pvecs);
+}
+
+static int gfs2_add_to_page_cache_lru(struct page *page,
+				      struct address_space *mapping,
+				      pgoff_t offset, gfp_t gfp_mask)
+{
+	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0)
+		gfs2_lru_cache_add(page);
+	return ret;
+}
 
 static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
 				   unsigned int from, unsigned int to)
@@ -664,6 +685,446 @@ out_uninit:
 	return ret;
 }
 
+/*
+ * Find or create a page at the given pagecache position. Return the locked
+ * page. This function is specifically for buffered writes.
+ */
+static struct page *__grab_cache_page(struct address_space *mapping,
+			       unsigned long index)
+{
+	int status;
+	struct page *page;
+repeat:
+	page = find_lock_page(mapping, index);
+	if (likely(page))
+		return page;
+
+	page = page_cache_alloc(mapping);
+	if (!page)
+		return NULL;
+	status = gfs2_add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	if (unlikely(status)) {
+		page_cache_release(page);
+		if (status == -EEXIST)
+			goto repeat;
+		return NULL;
+	}
+	return page;
+}
+
+/**
+ * gfs2_write_begin - Begin to write to a file
+ * @file: The file to write to
+ * @mapping: The mapping in which to write
+ * @pos: The file offset at which to start writing
+ * @len: Length of the write
+ * @flags: Various flags
+ * @pagep: Pointer to return the page
+ * @fsdata: Pointer to return fs data (unused by GFS2)
+ *
+ * Returns: errno
+ */
+
+int gfs2_write_begin(struct file *file, struct address_space *mapping,
+		     loff_t pos, unsigned len, unsigned flags,
+		     struct page **pagep, void **fsdata)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+	unsigned int data_blocks, ind_blocks, rblocks;
+	int alloc_required;
+	int error = 0;
+	struct gfs2_alloc *al;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	struct page *page;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh);
+	error = gfs2_glock_nq_atime(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+
+	gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
+	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
+	if (error)
+		goto out_unlock;
+
+	if (alloc_required) {
+		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+
+		error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+		if (error)
+			goto out_alloc_put;
+
+		error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+		if (error)
+			goto out_qunlock;
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto out_qunlock;
+	}
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks ? data_blocks : 1;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+
+	error = gfs2_trans_begin(sdp, rblocks,
+				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+	if (error)
+		goto out_trans_fail;
+
+	error = -ENOMEM;
+	page = __grab_cache_page(mapping, index);
+	*pagep = page;
+	if (unlikely(!page))
+		goto out_endtrans;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = 0;
+		if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+			error = gfs2_unstuff_dinode(ip, page);
+			if (error == 0)
+				goto prepare_write;
+		} else if (!PageUptodate(page)) {
+			error = stuffed_readpage(ip, page);
+		}
+		goto out;
+	}
+
+prepare_write:
+	error = block_prepare_write(page, from, to, gfs2_block_map);
+out:
+	if (error == 0)
+		return 0;
+
+	page_cache_release(page);
+	if (pos + len > ip->i_inode.i_size)
+		vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+out_endtrans:
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	if (alloc_required) {
+		gfs2_inplace_release(ip);
+out_qunlock:
+		gfs2_quota_unlock(ip);
+out_alloc_put:
+		gfs2_alloc_put(ip);
+	}
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+/**
+ * adjust_fs_space - Adjusts the free space available due to gfs2_grow
+ * @inode: the rindex inode
+ */
+static void adjust_fs_space(struct inode *inode)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	u64 fs_total, new_free;
+
+	/* Total up the file system space, according to the latest rindex. */
+	fs_total = gfs2_ri_total(sdp);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	if (fs_total > (m_sc->sc_total + l_sc->sc_total))
+		new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
+	else
+		new_free = 0;
+	spin_unlock(&sdp->sd_statfs_spin);
+	fs_warn(sdp, "File system extended by %llu blocks.\n",
+		(unsigned long long)new_free);
+	gfs2_statfs_change(sdp, new_free, new_free, 0);
+}
+
+/**
+ * gfs2_stuffed_write_end - Write end for stuffed files
+ * @inode: The inode
+ * @dibh: The buffer_head containing the on-disk inode
+ * @pos: The file position
+ * @len: The length of the write
+ * @copied: How much was actually copied by the VFS
+ * @page: The page
+ *
+ * This copies the data from the page into the inode block after
+ * the inode data structure itself.
+ *
+ * Returns: errno
+ */
+static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+				  loff_t pos, unsigned len, unsigned copied,
+				  struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	u64 to = pos + copied;
+	void *kaddr;
+	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
+	struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+
+	BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
+	kaddr = kmap_atomic(page, KM_USER0);
+	memcpy(buf + pos, kaddr + pos, copied);
+	memset(kaddr + pos + copied, 0, len - copied);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (inode->i_size < to) {
+		i_size_write(inode, to);
+		ip->i_di.di_size = inode->i_size;
+		di->di_size = cpu_to_be64(inode->i_size);
+		mark_inode_dirty(inode);
+	}
+
+	if (inode == sdp->sd_rindex)
+		adjust_fs_space(inode);
+
+	brelse(dibh);
+	gfs2_trans_end(sdp);
+	gfs2_glock_dq(&ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+	return copied;
+}
+
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+static void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, size;
+
+					start = max(from, block_start);
+					size = min(to, block_end) - start;
+
+					zero_user(page, start, size);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
+static int __block_commit_write(struct inode *inode, struct page *page,
+		unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	int partial = 0;
+	unsigned blocksize;
+	struct buffer_head *bh, *head;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	for(bh = head = page_buffers(page), block_start = 0;
+	    bh != head || !block_start;
+	    block_start=block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+		} else {
+			set_buffer_uptodate(bh);
+			mark_buffer_dirty(bh);
+		}
+		clear_buffer_new(bh);
+	}
+
+	/*
+	 * If this is a partial write which happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' whether the page went
+	 * uptodate as a result of this (potentially partial) write.
+	 */
+	if (!partial)
+		SetPageUptodate(page);
+	return 0;
+}
+
+static int block_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (unlikely(copied < len)) {
+		/*
+		 * The buffers that were written will now be uptodate, so we
+		 * don't have to worry about a readpage reading them and
+		 * overwriting a partial write. However if we have encountered
+		 * a short write and only partially written into a buffer, it
+		 * will not be marked uptodate, so a readpage might come in and
+		 * destroy our partial write.
+		 *
+		 * Do the simplest thing, and just treat any short write to a
+		 * non uptodate page as a zero-length write, and force the
+		 * caller to redo the whole thing.
+		 */
+		if (!PageUptodate(page))
+			copied = 0;
+
+		page_zero_new_buffers(page, start+copied, start+len);
+	}
+	flush_dcache_page(page);
+
+	/* This could be a short (even 0-length) commit */
+	__block_commit_write(inode, page, start, start+copied);
+
+	return copied;
+}
+
+static int generic_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int i_size_changed = 0;
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		i_size_changed = 1;
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+
+	return copied;
+}
+
+/**
+ * gfs2_write_end
+ * @file: The file to write to
+ * @mapping: The address space to write to
+ * @pos: The file position
+ * @len: The length of the data
+ * @copied:
+ * @page: The page that has been written
+ * @fsdata: The fsdata (unused in GFS2)
+ *
+ * The main write_end function for GFS2. We have a separate one for
+ * stuffed files as they are slightly different, otherwise we just
+ * put our locking around the VFS provided functions.
+ *
+ * Returns: errno
+ */
+
+int gfs2_write_end(struct file *file, struct address_space *mapping,
+		   loff_t pos, unsigned len, unsigned copied,
+		   struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *dibh;
+	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_dinode *di;
+	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int to = from + len;
+	int ret;
+
+	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
+
+	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(ret)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto failed;
+	}
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip))
+		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
+		di = (struct gfs2_dinode *)dibh->b_data;
+		ip->i_di.di_size = inode->i_size;
+		di->di_size = cpu_to_be64(inode->i_size);
+		mark_inode_dirty(inode);
+	}
+
+	if (inode == sdp->sd_rindex)
+		adjust_fs_space(inode);
+
+	brelse(dibh);
+	gfs2_trans_end(sdp);
+failed:
+	if (al) {
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	gfs2_glock_dq(&ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+	return ret;
+}
+
 /**
  * gfs2_prepare_write - Prepare to write a page to a file
  * @file: The file to write to
@@ -719,31 +1180,6 @@ out:
 }
 
 /**
- * adjust_fs_space - Adjusts the free space available due to gfs2_grow
- * @inode: the rindex inode
- */
-static void adjust_fs_space(struct inode *inode)
-{
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-	u64 fs_total, new_free;
-
-	/* Total up the file system space, according to the latest rindex. */
-	fs_total = gfs2_ri_total(sdp);
-
-	spin_lock(&sdp->sd_statfs_spin);
-	if (fs_total > (m_sc->sc_total + l_sc->sc_total))
-		new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
-	else
-		new_free = 0;
-	spin_unlock(&sdp->sd_statfs_spin);
-	fs_warn(sdp, "File system extended by %llu blocks.\n",
-		(unsigned long long)new_free);
-	gfs2_statfs_change(sdp, new_free, new_free, 0);
-}
-
-/**
  * gfs2_commit_write - Commit write to a file
  * @file: The file to write to
  * @page: The page containing the data
@@ -943,7 +1379,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
 	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	if (offset > i_size_read(&ip->i_inode))
+	if (offset >= i_size_read(&ip->i_inode))
 		return 0;
 	return 1;
 }
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index 2452e15..2b96b0c 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -16,5 +16,11 @@
 
 extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
 extern void gfs2_set_aops(struct inode *inode);
+extern int gfs2_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata);
+extern int gfs2_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata);
 
 #endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 98d96a5..3d7b539 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -23,7 +23,9 @@
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
 #include <linux/writeback.h>
+#include <linux/uaccess.h>
 #include <asm/uaccess.h>
+#include <linux/mpage.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -44,6 +46,24 @@
 #include "eaops.h"
 #include "ops_address.h"
 
+static void iov_iter_advance(struct iov_iter *i, size_t bytes);
+static inline void iov_iter_init(struct iov_iter *i,
+                        const struct iovec *iov, unsigned long nr_segs,
+                        size_t count, size_t written)
+{
+        i->iov = iov;
+        i->nr_segs = nr_segs;
+        i->iov_offset = 0;
+        i->count = count + written;
+
+        iov_iter_advance(i, written);
+}
+
+static inline size_t iov_iter_count(struct iov_iter *i)
+{
+        return i->count;
+}
+
 /*
  * Most fields left uninitialised to catch anybody who tries to
  * use them. f_flags set to prevent file_accessed() from touching
@@ -651,14 +671,466 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 		return do_flock(file, cmd, fl);
 }
 
+/**
+ * generic_file functions backported from upstream filemap.c:
+ * In many cases I needed to change generic_* to gfs2_*
+ */
+
+static unsigned long iov_iter_single_seg_count(struct iov_iter *i)
+{
+	const struct iovec *iov = i->iov;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, iov->iov_len - i->iov_offset);
+}
+
+static void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+		i->count -= bytes;
+	} else {
+		const struct iovec *iov = i->iov;
+		size_t base = i->iov_offset;
+
+		/*
+		 * The !iov->iov_len check ensures we skip over unlikely
+		 * zero-length segments (without overruning the iovec).
+		 */
+		while (bytes || unlikely(i->count && !iov->iov_len)) {
+			int copy;
+
+			copy = min(bytes, iov->iov_len - base);
+			BUG_ON(!i->count || i->count < copy);
+			i->count -= copy;
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				iov++;
+				base = 0;
+			}
+		}
+		i->iov = iov;
+		i->iov_offset = base;
+	}
+}
+
+static inline int gfs2_fault_in_pages_readable(const char __user *uaddr,
+					       int size)
+{
+        volatile char c;
+        int ret;
+
+        if (unlikely(size == 0))
+                return 0;
+
+        ret = __get_user(c, uaddr);
+        if (ret == 0) {
+                const char __user *end = uaddr + size - 1;
+
+                if (((unsigned long)uaddr & PAGE_MASK) !=
+                                ((unsigned long)end & PAGE_MASK))
+                        ret = __get_user(c, end);
+        }
+        return ret;
+}
+static int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	char __user *buf = i->iov->iov_base + i->iov_offset;
+	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+	return gfs2_fault_in_pages_readable(buf, bytes);
+}
+
+static ssize_t gfs2_perform_write(struct file *file,
+				  struct iov_iter *i, loff_t pos)
+{
+	struct address_space *mapping = file->f_mapping;
+	long status = 0;
+	ssize_t written = 0;
+	unsigned int flags = 0;
+
+	/*
+	 * Copies from kernel address space cannot fail (NFSD is a big user).
+	 */
+	/*if (segment_eq(get_fs(), KERNEL_DS))
+	  flags |= AOP_FLAG_UNINTERRUPTIBLE;*/
+
+	do {
+		struct page *page;
+		pgoff_t index;		/* Pagecache index for current page */
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_count(i));
+
+again:
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = gfs2_write_begin(file, mapping, pos, bytes, flags,
+						&page, &fsdata);
+		if (unlikely(status))
+			break;
+
+		/* pagefault disable */
+		inc_preempt_count();
+		barrier();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		/* pagefault enable */
+		barrier();
+		dec_preempt_count();
+		barrier();
+		preempt_check_resched();
+
+		flush_dcache_page(page);
+
+		status = gfs2_write_end(file, mapping, pos, bytes, copied,
+					page, fsdata);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+
+		cond_resched();
+
+		iov_iter_advance(i, copied);
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		pos += copied;
+		written += copied;
+
+		balance_dirty_pages_ratelimited(mapping);
+
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+static ssize_t
+gfs2_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos, loff_t *ppos,
+			 size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode *inode = mapping->host;
+	ssize_t status;
+	struct iov_iter i;
+
+	iov_iter_init(&i, iov, nr_segs, count, written);
+	status = gfs2_perform_write(file, &i, pos);
+
+	if (likely(status >= 0)) {
+		written += status;
+		*ppos = pos + status;
+
+		/*
+		 * For now, when the user asks for O_SYNC, we'll actually give
+		 * O_DSYNC
+		 */
+		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			if (!a_ops->writepage || !is_sync_kiocb(iocb))
+				status = generic_osync_inode(inode, mapping,
+						OSYNC_METADATA|OSYNC_DATA);
+		}
+  	}
+	
+	/*
+	 * If we get here for O_DIRECT writes then we must have fallen through
+	 * to buffered writes (block instantiation inside i_size).  So we sync
+	 * the file data here, to try to honour O_DIRECT expectations.
+	 */
+	if (unlikely(file->f_flags & O_DIRECT) && written)
+		status = filemap_write_and_wait(mapping);
+
+	return written ? written : status;
+}
+
+/*
+ * Performs necessary checks before doing a write
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec
+ * @count:	number of bytes to write
+ * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
+ *
+ * Adjust number of segments and amount of bytes to write (nr_segs should be
+ * properly initialized first). Returns appropriate error code that caller
+ * should return or zero in case that write should be allowed.
+ */
+static int generic_segment_checks(const struct iovec *iov,
+			unsigned long *nr_segs, size_t *count, int access_flags)
+{
+	unsigned long   seg;
+	size_t cnt = 0;
+	for (seg = 0; seg < *nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		cnt += iv->iov_len;
+		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(access_flags, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		*nr_segs = seg;
+		cnt -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+	*count = cnt;
+	return 0;
+}
+
+static ssize_t
+__gfs2_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+			     unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space * mapping = file->f_mapping;
+	size_t ocount;		/* original count */
+	size_t count;		/* after file limit checks */
+	struct inode 	*inode = mapping->host;
+	loff_t		pos;
+	ssize_t		written;
+	ssize_t		err;
+
+	ocount = 0;
+	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+	if (err)
+		return err;
+
+	count = ocount;
+	pos = *ppos;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+	written = 0;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	err = remove_suid(file->f_dentry);
+	if (err)
+		goto out;
+
+	file_update_time(file);
+
+	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+	if (unlikely(file->f_flags & O_DIRECT)) {
+		loff_t endbyte;
+		ssize_t written_buffered;
+
+		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+							ppos, count, ocount);
+		if (written < 0 || written == count)
+			goto out;
+		/*
+		 * direct-io write to a hole: fall through to buffered I/O
+		 * for completing the rest of the request.
+		 */
+		pos += written;
+		count -= written;
+		written_buffered = gfs2_file_buffered_write(iocb, iov,
+							    nr_segs, pos,
+							    ppos, count,
+							    written);
+		/*
+		 * If gfs2_file_buffered_write() returned a synchronous error
+		 * then we want to return the number of bytes which were
+		 * direct-written, or the error code if that was zero.  Note
+		 * that this differs from normal direct-io semantics, which
+		 * will return -EFOO even if some bytes were written.
+		 */
+		if (written_buffered < 0) {
+			err = written_buffered;
+			goto out;
+		}
+
+		/*
+		 * We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		endbyte = pos + written_buffered - written - 1;
+		err = do_sync_file_range(file, pos, endbyte,
+					 SYNC_FILE_RANGE_WAIT_BEFORE|
+					 SYNC_FILE_RANGE_WRITE|
+					 SYNC_FILE_RANGE_WAIT_AFTER);
+		if (err == 0) {
+			written = written_buffered;
+			invalidate_mapping_pages(mapping,
+						 pos >> PAGE_CACHE_SHIFT,
+						 endbyte >> PAGE_CACHE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
+	} else {
+		written = gfs2_file_buffered_write(iocb, iov, nr_segs,
+						   pos, ppos, count, written);
+	}
+out:
+	current->backing_dev_info = NULL;
+	return written ? written : err;
+}
+
+static ssize_t
+gfs2_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+			   unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	loff_t pos = *ppos;
+
+	ret = __gfs2_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		int err;
+
+		err = sync_page_range_nolock(inode, mapping, pos, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
+static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const char __user *buf,
+				   size_t count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+					.iov_len = count };
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	mutex_lock(&inode->i_mutex);
+	ret = __gfs2_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
+
+		err = sync_page_range(inode, mapping, pos, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
+static ssize_t gfs2_file_write_nolock(struct file *file,
+				      const struct iovec *iov,
+				      unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, file);
+	ret = gfs2_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&kiocb);
+	return ret;
+}
+
+static ssize_t gfs2_file_write(struct file *file, const char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t	ret;
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+					.iov_len = count };
+
+	mutex_lock(&inode->i_mutex);
+	ret = gfs2_file_write_nolock(file, &local_iov, 1, ppos);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
+
+		err = sync_page_range(inode, mapping, *ppos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
+static ssize_t gfs2_file_writev(struct file *file, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = gfs2_file_write_nolock(file, iov, nr_segs, ppos);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		int err;
+
+		err = sync_page_range(inode, mapping, *ppos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
 const struct file_operations_ext gfs2_file_fops = {
 	.f_op_orig.llseek		= gfs2_llseek,
 	.f_op_orig.read		= generic_file_read,
 	.f_op_orig.readv		= generic_file_readv,
 	.f_op_orig.aio_read	= generic_file_aio_read,
-	.f_op_orig.write		= generic_file_write,
-	.f_op_orig.writev		= generic_file_writev,
-	.f_op_orig.aio_write	= generic_file_aio_write,
+	.f_op_orig.write		= gfs2_file_write,
+	.f_op_orig.writev		= gfs2_file_writev,
+	.f_op_orig.aio_write	= gfs2_file_aio_write,
 	.f_op_orig.unlocked_ioctl	= gfs2_ioctl,
 	.f_op_orig.mmap		= gfs2_mmap,
 	.f_op_orig.open		= gfs2_open,
@@ -687,9 +1159,9 @@ const struct file_operations_ext gfs2_file_fops_nolock = {
 	.f_op_orig.read		= generic_file_read,
 	.f_op_orig.readv		= generic_file_readv,
 	.f_op_orig.aio_read	= generic_file_aio_read,
-	.f_op_orig.write		= generic_file_write,
-	.f_op_orig.writev		= generic_file_writev,
-	.f_op_orig.aio_write	= generic_file_aio_write,
+	.f_op_orig.write		= gfs2_file_write,
+	.f_op_orig.writev		= gfs2_file_writev,
+	.f_op_orig.aio_write	= gfs2_file_aio_write,
 	.f_op_orig.unlocked_ioctl	= gfs2_ioctl,
 	.f_op_orig.mmap		= gfs2_mmap,
 	.f_op_orig.open		= gfs2_open,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b778a80..f08cec8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -404,6 +404,15 @@ struct page;
 struct address_space;
 struct writeback_control;
 
+struct iov_iter {
+        const struct iovec *iov;
+        unsigned long nr_segs;
+        size_t iov_offset;
+        size_t count;
+};
+
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+                struct iov_iter *i, unsigned long offset, size_t bytes);
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 7489d31..3eac292 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1965,6 +1965,34 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr,
 }
 
 /*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were sucessfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+				      unsigned long offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = i->iov->iov_base + i->iov_offset;
+		left = __copy_from_user_inatomic_nocache(kaddr + offset,
+							buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+
+	return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+/*
  * Performs necessary checks before doing a write
  *
  * Can adjust writing position or amount of bytes to write.