Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 2190

kernel-2.6.18-238.el5.src.rpm

From: Josef Bacik <jbacik@redhat.com>
Date: Mon, 29 Sep 2008 15:10:14 -0400
Subject: [mm] filemap: fix iov_base data corruption
Message-id: 20080929191013.GC26676@unused.rdu.redhat.com
O-Subject: Re: [RHEL5.3 PATCH] fix data corruption UPDATED
Bugzilla: 463134
RH-Acked-by: Eric Sandeen <sandeen@redhat.com>
RH-Acked-by: Jeff Moyer <jmoyer@redhat.com>
RH-Acked-by: Peter Staubach <staubach@redhat.com>

> >> Hello,
> >>
> >> This is in reference to bz 463134.  The corruption was caused by the fact that
> >> we did not update iov_base in the nr_segs == 1 && written != 0 case.  iov_base
> >> wouldn't be right, so when we went to copy from user we'd reset buf to the
> >> beginning of cur_iov->iov_base, because I had also declared a local buf to use.
> >> Fixed this by dropping the local char * buf since its not needed, and update
> >> iov_base with written, and in the case that we still have count bytes left to
> >> write.  I also changed how we track bytes back to the way it was before since
> >> the upstream change kind of screwed up how the bytes was tracked.  I've tested
> >> this thoroughly with the testcase that was causing problem and a bunch of other
> >> tests.  Not entirely sure why this made it through the first round of testing as
> >> I know the testsuite I run has DIO tests, but in any case it really works this
> >> time.  Thank you,
> >>
> >> Josef
> >>
> >>
> >
Ok this patch fixes all the different concerns.  I've

1) renamed iov_base to iov_offset to make it less confusing, this happened
upstream as well.

2) added the comment above "src_page = NULL"

3) backported 4b49643fbb3fa8bf4910f82be02d45e94e8972a4, which was a revert of a
fix for another patch that was also pulled out because of this patch.  That
addresses the zero length segment concern.

4) used the pagefault_enable/disable helpers since it seems somebody backported
those.

5) created filemap_copy_from_user_atomic helper instead of open coding all of
that stuff.

I also tested with time iozone -a -g 8192 over nfs with and without this patch and
the original deadlock fix patch.  Both ran in ~27seconds, so there was no
performance regression.  Also double-checked with the original reproducer to
make sure we aren't corrupting anything.  Thank you,

Josef

diff --git a/mm/filemap.c b/mm/filemap.c
index 3eac292..5f9971c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2125,7 +2125,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct page	*cached_page = NULL;
 	struct pagevec	lru_pvec;
 	const struct iovec *cur_iov = iov; /* current iovec */
-	size_t		iov_base = 0;	   /* offset in the current iovec */
+	size_t		iov_offset = 0;	   /* offset in the current iovec */
 	char __user	*buf;
 
 	pagevec_init(&lru_pvec, 0);
@@ -2133,11 +2133,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	/*
 	 * handle partial DIO write.  Adjust cur_iov if needed.
 	 */
-	if (likely(nr_segs == 1))
-		buf = iov->iov_base + written;
-	else {
-		filemap_set_next_iovec(&cur_iov, &iov_base, written);
-		buf = cur_iov->iov_base + iov_base;
+	if (likely(nr_segs == 1)) {
+		iov_offset += written;
+		buf = iov->iov_base + iov_offset;
+	} else {
+		filemap_set_next_iovec(&cur_iov, &iov_offset, written);
+		buf = cur_iov->iov_base + iov_offset;
 	}
 
 	do {
@@ -2155,9 +2156,13 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		/* Limit the size of the copy to the caller's write size */
 		bytes = min(bytes, count);
 
+		/*
+		 * a non-NULL src_page indicates that we're doing the
+		 * copy via get_user_pages and kmap.
+		 */
 		src_page = NULL;
 
-		seglen = cur_iov->iov_len - iov_base;
+		seglen = cur_iov->iov_len - iov_offset;
 		seglen = min(seglen, bytes);
 
 		/* We only need to worry about prefaulting when writes are from
@@ -2183,12 +2188,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			break;
 		}
 
-		if (unlikely(bytes == 0)) {
-			status = 0;
-			copied = 0;
-			goto zero_length_segment;
-		}
-
 		/*
 		 * non-uptodate pages cannot cope with short copies, and we
 		 * cannot take a pagefault with the destination page locked.
@@ -2210,7 +2209,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * page locked (as explained below).
 			 */
 			copied = filemap_copy_from_user_iovec(src_page, offset,
-					cur_iov, iov_base, bytes);
+					cur_iov, iov_offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
 				page_cache_release(page);
@@ -2257,7 +2256,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 
 		if (!src_page) {
-			void *kaddr;
 			/*
 			 * Must not enter the pagefault handler here, because
 			 * we hold the page lock, so we might recursively
@@ -2272,30 +2270,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * really matter.
 			 */
 
-			/* pagefault disable */
-			inc_preempt_count();
-			barrier();
-
-			kaddr = kmap_atomic(page, KM_USER0);
-			if (likely(nr_segs == 1)) {
-				int left;
-				char __user *buf = cur_iov->iov_base+iov_base;
-				left = __copy_from_user_inatomic(kaddr+offset,
-								 buf, bytes);
-				copied = bytes - left;
-			} else {
-				copied =
-				  __filemap_copy_from_user_iovec_inatomic(kaddr+
-						offset, cur_iov, iov_base,
-						bytes);
-			}
-			kunmap_atomic(kaddr, KM_USER0);
+			pagefault_disable();
+
+			copied = filemap_copy_from_user_atomic(page, offset,
+					cur_iov, nr_segs, iov_offset, bytes);
 
-			/* pagefault enable */
-			barrier();
-			dec_preempt_count();
-			barrier();
-			preempt_check_resched();
+			pagefault_enable();
 		} else {
 			void *src, *dst;
 			src = kmap_atomic(src_page, KM_USER0);
@@ -2313,8 +2293,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 				page_cache_release(src_page);
 			continue;
 		}
-zero_length_segment:
-		if (likely(copied >= 0)) {
+
+		if (likely(copied > 0)) {
 			if (!status)
 				status = copied;
 
@@ -2322,16 +2302,14 @@ zero_length_segment:
 				written += status;
 				count -= status;
 				pos += status;
-				buf += status;
-				if (unlikely(nr_segs > 1)) {
+				if (unlikely(nr_segs > 1))
 					filemap_set_next_iovec(&cur_iov,
-							&iov_base, status);
-					if (count)
-						buf = cur_iov->iov_base +
-							iov_base;
-				} else {
-					iov_base += status;
-				}
+							&iov_offset, status);
+				else
+					iov_offset += status;
+
+				if (count)
+					buf = cur_iov->iov_base + iov_offset;
 			}
 		}
 
diff --git a/mm/filemap.h b/mm/filemap.h
index c2bff04..efdc5c3 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -53,6 +53,30 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
 	return bytes - left;
 }
 
+static inline size_t
+filemap_copy_from_user_atomic(struct page *page, unsigned long offset,
+				const struct iovec *iov, unsigned long nr_segs,
+				size_t iov_offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+	const char __user *buf = iov->iov_base + iov_offset;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (likely(nr_segs == 1)) {
+		int left;
+		left = __copy_from_user_inatomic_nocache(kaddr+offset, buf,
+				bytes);
+		copied = bytes - left;
+	} else {
+		copied = __filemap_copy_from_user_iovec_inatomic(kaddr+offset,
+				iov, iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+
+	return copied;
+}
+
 /*
  * This has the same sideeffects and return value as filemap_copy_from_user().
  * The difference is that on a fault we need to memset the remainder of the
@@ -87,7 +111,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
 	const struct iovec *iov = *iovp;
 	size_t base = *basep;
 
-	do {
+	while (bytes) {
 		int copy = min(bytes, iov->iov_len - base);
 
 		bytes -= copy;
@@ -96,7 +120,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
 			iov++;
 			base = 0;
 		}
-	} while (bytes);
+	}
 	*iovp = iov;
 	*basep = base;
 }