From: Don Zickus <dzickus@redhat.com> Date: Fri, 5 Dec 2008 15:17:49 -0500 Subject: Revert [mm] keep pagefault from happening under pagelock Message-id: 20081205201749.GR12451@drseuss.usersys.redhat.com O-Subject: [PATCH RHEL5.3] Revert mm: keep pagefault from happening under page lock patch Bugzilla: 473150 RH-Acked-by: Josef Bacik <jbacik@redhat.com> https://bugzilla.redhat.com/show_bug.cgi?id=473150 A couple of patches went into RHEL-5.3 to fix a particular deadlock when writing to the fs. Unfortunately further testing has shown a 50% degradation in write performance on some stress tests. Therefore I am backing out these two patches to bring us back to the original behaviour. linux-2.6-mm-filemap-fix-iov_base-data-corruption.patch linux-2.6-mm-keep-pagefault-from-happening-under-page-lock.patch or git commits: c3079412dec7edbe0f3369d0f6cea930f6677f58 84dc8173501fc07f215774d6e99f4b78ec146637 The original deadlock problem only happened under certain test conditions and has never been seen in production. I created the below patch by using 'git revert' on the two above commits. The revert went smoothly, so I don't expect any issues. For those that are familiar with the problem, please review the patch to make sure the revert didn't screw anything up. Cheers, Don diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e4f9e15..e77298b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -196,9 +196,6 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { int ret; - if (unlikely(size == 0)) - return 0; - /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. 
@@ -218,23 +215,19 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return ret; } -static inline int fault_in_pages_readable(const char __user *uaddr, int size) +static inline void fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; int ret; - if (unlikely(size == 0)) - return 0; - ret = __get_user(c, uaddr); if (ret == 0) { const char __user *end = uaddr + size - 1; if (((unsigned long)uaddr & PAGE_MASK) != ((unsigned long)end & PAGE_MASK)) - ret = __get_user(c, end); + __get_user(c, end); } - return ret; } #endif /* _LINUX_PAGEMAP_H */ diff --git a/mm/filemap.c b/mm/filemap.c index aa2bbcf..251e6f1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2137,9 +2137,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, long status = 0; struct page *page; struct page *cached_page = NULL; + size_t bytes; struct pagevec lru_pvec; const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_offset = 0; /* offset in the current iovec */ + size_t iov_base = 0; /* offset in the current iovec */ char __user *buf; pagevec_init(&lru_pvec, 0); @@ -2147,20 +2148,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* * handle partial DIO write. Adjust cur_iov if needed. 
*/ - if (likely(nr_segs == 1)) { - iov_offset += written; - buf = iov->iov_base + iov_offset; - } else { - filemap_set_next_iovec(&cur_iov, &iov_offset, written); - buf = cur_iov->iov_base + iov_offset; + if (likely(nr_segs == 1)) + buf = iov->iov_base + written; + else { + filemap_set_next_iovec(&cur_iov, &iov_base, written); + buf = cur_iov->iov_base + iov_base; } do { - struct page *src_page; unsigned long index; unsigned long offset; - unsigned long seglen; /* Bytes remaining in current iovec */ - unsigned long bytes; /* bytes to write to page */ size_t copied; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ @@ -2170,15 +2167,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* Limit the size of the copy to the caller's write size */ bytes = min(bytes, count); - /* - * a non-NULL src_page indicates that we're doing the - * copy via get_user_pages and kmap. - */ - src_page = NULL; - - seglen = cur_iov->iov_len - iov_offset; - seglen = min(seglen, bytes); - /* We only need to worry about prefaulting when writes are from * user-space. NFSd uses vfs_writev with several non-aligned * segments in the vector, and limiting to one segment a time is @@ -2186,15 +2174,19 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, */ if (!segment_eq(get_fs(), KERNEL_DS)) { /* + * Limit the size of the copy to that of the current + * segment, because fault_in_pages_readable() doesn't + * know how to walk segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); + + /* * Bring in the user page that we will copy from * _first_. Otherwise there's a nasty deadlock on * copying from the same page as we're writing to, * without it being marked up-to-date. 
*/ - if (unlikely(fault_in_pages_readable(buf, seglen))) { - status = -EFAULT; - break; - } + fault_in_pages_readable(buf, bytes); } page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { @@ -2202,51 +2194,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, break; } - /* - * non-uptodate pages cannot cope with short copies, and we - * cannot take a pagefault with the destination page locked. - * So pin the source page to copy it. - */ - if (!PageUptodate(page)) { - unlock_page(page); - - src_page = alloc_page(GFP_KERNEL); - if (!src_page) { - page_cache_release(page); - status = -ENOMEM; - break; - } - - /* - * Cannot get_user_pages with a page locked for the - * same reason as we can't take a page fault with a - * page locked (as explained below). - */ - copied = filemap_copy_from_user_iovec(src_page, offset, - cur_iov, iov_offset, bytes); - if (unlikely(copied == 0)) { - status = -EFAULT; - page_cache_release(page); - page_cache_release(src_page); - break; - } - bytes = copied; - - lock_page(page); - /* - * Can't handle the page going uptodate here, because - * that means we would use non-atomic usercopies, which - * zero out the tail of the page, which can cause - * zeroes to become transiently visible. We could just - * use a non-zeroing copy, but the APIs aren't too - * consistent. 
- */ - if (unlikely(!page->mapping || PageUptodate(page))) { - unlock_page(page); - page_cache_release(page); - page_cache_release(src_page); - continue; - } + if (unlikely(bytes == 0)) { + status = 0; + copied = 0; + goto zero_length_segment; } status = a_ops->prepare_write(file, page, offset, offset+bytes); @@ -2256,8 +2207,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (status != AOP_TRUNCATED_PAGE) unlock_page(page); page_cache_release(page); - if (src_page) - page_cache_release(src_page); if (status == AOP_TRUNCATED_PAGE) continue; /* @@ -2268,47 +2217,20 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, vmtruncate(inode, isize); break; } - - if (!src_page) { - /* - * Must not enter the pagefault handler here, because - * we hold the page lock, so we might recursively - * deadlock on the same lock, or get an ABBA deadlock - * against a different lock, or against the mmap_sem - * (which nests outside the page lock). So increment - * preempt count, and use _atomic usercopies. - * - * The page is uptodate so we are OK to encounter a - * short copy: if unmodified parts of the page are - * marked dirty and written out to disk, it doesn't - * really matter. 
- */ - - pagefault_disable(); - - copied = filemap_copy_from_user_atomic(page, offset, - cur_iov, nr_segs, iov_offset, bytes); - - pagefault_enable(); - } else { - void *src, *dst; - src = kmap_atomic(src_page, KM_USER0); - dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + offset, bytes); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); - copied = bytes; - } + if (likely(nr_segs == 1)) + copied = filemap_copy_from_user(page, offset, + buf, bytes); + else + copied = filemap_copy_from_user_iovec(page, offset, + cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); if (status == AOP_TRUNCATED_PAGE) { page_cache_release(page); - if (src_page) - page_cache_release(src_page); continue; } - - if (likely(copied > 0)) { +zero_length_segment: + if (likely(copied >= 0)) { if (!status) status = copied; @@ -2316,22 +2238,24 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, written += status; count -= status; pos += status; - if (unlikely(nr_segs > 1)) + buf += status; + if (unlikely(nr_segs > 1)) { filemap_set_next_iovec(&cur_iov, - &iov_offset, status); - else - iov_offset += status; - - if (count) - buf = cur_iov->iov_base + iov_offset; + &iov_base, status); + if (count) + buf = cur_iov->iov_base + + iov_base; + } else { + iov_base += status; + } } } - + if (unlikely(copied != bytes)) + if (status >= 0) + status = -EFAULT; unlock_page(page); mark_page_accessed(page); page_cache_release(page); - if (src_page) - page_cache_release(src_page); if (status < 0) break; balance_dirty_pages_ratelimited(mapping); diff --git a/mm/filemap.h b/mm/filemap.h index efdc5c3..c2bff04 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -53,30 +53,6 @@ filemap_copy_from_user(struct page *page, unsigned long offset, return bytes - left; } -static inline size_t -filemap_copy_from_user_atomic(struct page *page, unsigned long offset, - const struct iovec *iov, unsigned long nr_segs, - 
size_t iov_offset, size_t bytes) -{ - char *kaddr; - size_t copied; - const char __user *buf = iov->iov_base + iov_offset; - - kaddr = kmap_atomic(page, KM_USER0); - if (likely(nr_segs == 1)) { - int left; - left = __copy_from_user_inatomic_nocache(kaddr+offset, buf, - bytes); - copied = bytes - left; - } else { - copied = __filemap_copy_from_user_iovec_inatomic(kaddr+offset, - iov, iov_offset, bytes); - } - kunmap_atomic(kaddr, KM_USER0); - - return copied; -} - /* * This has the same sideeffects and return value as filemap_copy_from_user(). * The difference is that on a fault we need to memset the remainder of the @@ -111,7 +87,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) const struct iovec *iov = *iovp; size_t base = *basep; - while (bytes) { + do { int copy = min(bytes, iov->iov_len - base); bytes -= copy; @@ -120,7 +96,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) iov++; base = 0; } - } + } while (bytes); *iovp = iov; *basep = base; }