kernel-2.6.18-238.el5.src.rpm

From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 28 Aug 2008 14:22:12 -0400
Subject: [mm] keep pagefault from happening under page lock
Message-id: 20080828182212.GB21358@unused.rdu.redhat.com
O-Subject: [RHEL5.3 PATCH] keep pagefault from happening under page lock
Bugzilla: 445433
RH-Acked-by: Rik van Riel <riel@redhat.com>

Hello,

This is in reference to bz 445433 and is a backport of
08291429cfa6258c4cd95d8833beb40f828b194e.  It has been tested by Fujitsu on
RHEL4, and by me on both RHEL4 and RHEL5 with Fujitsu's reproducer.  Basically,
it is possible to pagefault while holding the page lock when doing a write,
which can result in a deadlock between the page lock and either the mmap_sem
or some other lock; in Fujitsu's case, I believe they were deadlocking with
the journal lock.  This patch keeps us from pagefaulting while holding the
page lock.
Thank you,

Josef
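
For reference, below is a minimal userspace sketch (a hypothetical
illustration, not Fujitsu's actual reproducer, which is not included here) of
the kind of access pattern involved: a write() whose source buffer is a
MAP_SHARED mapping of the same file page being written.  On an affected
kernel the copy in generic_file_buffered_write() can fault on the source
buffer while the destination page-cache page is locked; the existing
prefaulting via fault_in_pages_readable() narrows but does not close the
window, since the source page can be reclaimed between the prefault and the
copy.  The file name and page count are arbitrary.

/*
 * Hypothetical illustration only, not the actual Fujitsu test case.
 * Write one page of a file using a MAP_SHARED mapping of that same
 * page as the source buffer.  pwrite() goes through
 * generic_file_buffered_write(), which locks the destination
 * page-cache page before copying from 'map'; if that copy faults,
 * the fault handler can end up waiting on a lock this task already
 * holds (or on mmap_sem / a journal lock held elsewhere).
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	char *map;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, pagesize) < 0) {
		perror("ftruncate");
		return 1;
	}

	/* Map the first page of the file, shared and writable. */
	map = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Source buffer and destination file offset refer to the same
	 * page, so the copy inside the write path may fault on the very
	 * page the kernel has locked as the write destination.
	 */
	if (pwrite(fd, map, pagesize, 0) < 0)
		perror("pwrite");

	munmap(map, pagesize);
	close(fd);
	return 0;
}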

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e77298b..e4f9e15 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -196,6 +196,9 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
 {
 	int ret;
 
+	if (unlikely(size == 0))
+		return 0;
+
 	/*
 	 * Writing zeroes into userspace here is OK, because we know that if
 	 * the zero gets there, we'll be overwriting it.
@@ -215,19 +218,23 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
 	return ret;
 }
 
-static inline void fault_in_pages_readable(const char __user *uaddr, int size)
+static inline int fault_in_pages_readable(const char __user *uaddr, int size)
 {
 	volatile char c;
 	int ret;
 
+	if (unlikely(size == 0))
+		return 0;
+
 	ret = __get_user(c, uaddr);
 	if (ret == 0) {
 		const char __user *end = uaddr + size - 1;
 
 		if (((unsigned long)uaddr & PAGE_MASK) !=
 				((unsigned long)end & PAGE_MASK))
-		 	__get_user(c, end);
+		 	ret = __get_user(c, end);
 	}
+	return ret;
 }
 
 #endif /* _LINUX_PAGEMAP_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index f5622e5..299dfce 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2094,7 +2094,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	long		status = 0;
 	struct page	*page;
 	struct page	*cached_page = NULL;
-	size_t		bytes;
 	struct pagevec	lru_pvec;
 	const struct iovec *cur_iov = iov; /* current iovec */
 	size_t		iov_base = 0;	   /* offset in the current iovec */
@@ -2113,8 +2112,11 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	}
 
 	do {
+		struct page *src_page;
 		unsigned long index;
 		unsigned long offset;
+		unsigned long seglen;	/* Bytes remaining in current iovec */
+		unsigned long bytes;	/* bytes to write to page */
 		size_t copied;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
@@ -2124,6 +2126,11 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		/* Limit the size of the copy to the caller's write size */
 		bytes = min(bytes, count);
 
+		src_page = NULL;
+
+		seglen = cur_iov->iov_len - iov_base;
+		seglen = min(seglen, bytes);
+
 		/* We only need to worry about prefaulting when writes are from
 		 * user-space.  NFSd uses vfs_writev with several non-aligned
 		 * segments in the vector, and limiting to one segment a time is
@@ -2131,19 +2138,15 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		 */
 		if (!segment_eq(get_fs(), KERNEL_DS)) {
 			/*
-			 * Limit the size of the copy to that of the current
-			 * segment, because fault_in_pages_readable() doesn't
-			 * know how to walk segments.
-			 */
-			bytes = min(bytes, cur_iov->iov_len - iov_base);
-
-			/*
 			 * Bring in the user page that we will copy from
 			 * _first_.  Otherwise there's a nasty deadlock on
 			 * copying from the same page as we're writing to,
 			 * without it being marked up-to-date.
 			 */
-			fault_in_pages_readable(buf, bytes);
+			if (unlikely(fault_in_pages_readable(buf, seglen))) {
+				status = -EFAULT;
+				break;
+			}
 		}
 		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
 		if (!page) {
@@ -2157,6 +2160,53 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			goto zero_length_segment;
 		}
 
+		/*
+		 * non-uptodate pages cannot cope with short copies, and we
+		 * cannot take a pagefault with the destination page locked.
+		 * So pin the source page to copy it.
+		 */
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+
+			src_page = alloc_page(GFP_KERNEL);
+			if (!src_page) {
+				page_cache_release(page);
+				status = -ENOMEM;
+				break;
+			}
+
+			/*
+			 * Cannot get_user_pages with a page locked for the
+			 * same reason as we can't take a page fault with a
+			 * page locked (as explained below).
+			 */
+			copied = filemap_copy_from_user_iovec(src_page, offset,
+					cur_iov, iov_base, bytes);
+			if (unlikely(copied == 0)) {
+				status = -EFAULT;
+				page_cache_release(page);
+				page_cache_release(src_page);
+				break;
+			}
+			bytes = copied;
+
+			lock_page(page);
+			/*
+			 * Can't handle the page going uptodate here, because
+			 * that means we would use non-atomic usercopies, which
+			 * zero out the tail of the page, which can cause
+			 * zeroes to become transiently visible. We could just
+			 * use a non-zeroing copy, but the APIs aren't too
+			 * consistent.
+			 */
+			if (unlikely(!page->mapping || PageUptodate(page))) {
+				unlock_page(page);
+				page_cache_release(page);
+				page_cache_release(src_page);
+				continue;
+			}
+		}
+
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
 		if (unlikely(status)) {
 			loff_t isize = i_size_read(inode);
@@ -2164,6 +2214,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			if (status != AOP_TRUNCATED_PAGE)
 				unlock_page(page);
 			page_cache_release(page);
+			if (src_page)
+				page_cache_release(src_page);
 			if (status == AOP_TRUNCATED_PAGE)
 				continue;
 			/*
@@ -2174,16 +2226,62 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 				vmtruncate(inode, isize);
 			break;
 		}
-		if (likely(nr_segs == 1))
-			copied = filemap_copy_from_user(page, offset,
-							buf, bytes);
-		else
-			copied = filemap_copy_from_user_iovec(page, offset,
-						cur_iov, iov_base, bytes);
+
+		if (!src_page) {
+			void *kaddr;
+			/*
+			 * Must not enter the pagefault handler here, because
+			 * we hold the page lock, so we might recursively
+			 * deadlock on the same lock, or get an ABBA deadlock
+			 * against a different lock, or against the mmap_sem
+			 * (which nests outside the page lock).  So increment
+			 * preempt count, and use _atomic usercopies.
+			 *
+			 * The page is uptodate so we are OK to encounter a
+			 * short copy: if unmodified parts of the page are
+			 * marked dirty and written out to disk, it doesn't
+			 * really matter.
+			 */
+
+			/* pagefault disable */
+			inc_preempt_count();
+			barrier();
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			if (likely(nr_segs == 1)) {
+				int left;
+				char __user *buf = cur_iov->iov_base+iov_base;
+				left = __copy_from_user_inatomic(kaddr+offset,
+								 buf, bytes);
+				copied = bytes - left;
+			} else {
+				copied =
+				  __filemap_copy_from_user_iovec_inatomic(kaddr+
+						offset, cur_iov, iov_base,
+						bytes);
+			}
+			kunmap_atomic(kaddr, KM_USER0);
+
+			/* pagefault enable */
+			barrier();
+			dec_preempt_count();
+			barrier();
+			preempt_check_resched();
+		} else {
+			void *src, *dst;
+			src = kmap_atomic(src_page, KM_USER0);
+			dst = kmap_atomic(page, KM_USER1);
+			memcpy(dst + offset, src + offset, bytes);
+			kunmap_atomic(dst, KM_USER1);
+			kunmap_atomic(src, KM_USER0);
+			copied = bytes;
+		}
 		flush_dcache_page(page);
 		status = a_ops->commit_write(file, page, offset, offset+bytes);
 		if (status == AOP_TRUNCATED_PAGE) {
 			page_cache_release(page);
+			if (src_page)
+				page_cache_release(src_page);
 			continue;
 		}
 zero_length_segment:
@@ -2207,12 +2305,12 @@ zero_length_segment:
 				}
 			}
 		}
-		if (unlikely(copied != bytes))
-			if (status >= 0)
-				status = -EFAULT;
+
 		unlock_page(page);
 		mark_page_accessed(page);
 		page_cache_release(page);
+		if (src_page)
+			page_cache_release(src_page);
 		if (status < 0)
 			break;
 		balance_dirty_pages_ratelimited(mapping);