From: Josef Bacik <jbacik@redhat.com> Date: Mon, 29 Sep 2008 15:10:14 -0400 Subject: [mm] filemap: fix iov_base data corruption Message-id: 20080929191013.GC26676@unused.rdu.redhat.com O-Subject: Re: [RHEL5.3 PATCH] fix data corruption UPDATED Bugzilla: 463134 RH-Acked-by: Eric Sandeen <sandeen@redhat.com> RH-Acked-by: Jeff Moyer <jmoyer@redhat.com> RH-Acked-by: Peter Staubach <staubach@redhat.com> > >> Hello, > >> > >> This is in reference to bz 463134. The corruption was caused by the fact that > >> we did not update iov_base in the nr_segs == 1 && written != 0 case. iov_base > >> wouldn't be right, so when we went to copy from user we'd reset buf to the > >> beginning of cur_iov->iov_base, because I had also declared a local buf to use. > >> Fixed this by dropping the local char * buf since its not needed, and update > >> iov_base with written, and in the case that we still have count bytes left to > >> write. I also changed how we track bytes back to the way it was before since > >> the upstream change kind of screwed up how the bytes was tracked. I've tested > >> this thoroughly with the testcase that was causing problem and a bunch of other > >> tests. Not entirely sure why this made it through the first round of testing as > >> I know the testsuite I run has DIO tests, but in any case it really works this > >> time. Thank you, > >> > >> Josef > >> > >> > > Ok this patch fixes all the different concerns. I've 1) renamed iov_base to iov_offset to make it less confusing, this happened upstream as well. 2) added the comment above "src_page = NULL" 3) backported 4b49643fbb3fa8bf4910f82be02d45e94e8972a4, which was a revert of a fix for another patch that was also pulled out because of this patch. That addresses the zero length segment concern. 4) used the pagefault_enable/disable helpers since it seems somebody backported those. 5) created filemap_copy_from_user_atomic helper instead of open coding all of that stuff. I also tested with time iozone -a -g 8192 over nfs with and without this patch and the original deadlock fix patch. Both ran in ~27seconds, so there was no performance regression. Also double-checked with the original reproducer to make sure we aren't corrupting anything. Thank you, Josef diff --git a/mm/filemap.c b/mm/filemap.c index 3eac292..5f9971c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2125,7 +2125,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, struct page *cached_page = NULL; struct pagevec lru_pvec; const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_base = 0; /* offset in the current iovec */ + size_t iov_offset = 0; /* offset in the current iovec */ char __user *buf; pagevec_init(&lru_pvec, 0); @@ -2133,11 +2133,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* * handle partial DIO write. Adjust cur_iov if needed. */ - if (likely(nr_segs == 1)) - buf = iov->iov_base + written; - else { - filemap_set_next_iovec(&cur_iov, &iov_base, written); - buf = cur_iov->iov_base + iov_base; + if (likely(nr_segs == 1)) { + iov_offset += written; + buf = iov->iov_base + iov_offset; + } else { + filemap_set_next_iovec(&cur_iov, &iov_offset, written); + buf = cur_iov->iov_base + iov_offset; } do { @@ -2155,9 +2156,13 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* Limit the size of the copy to the caller's write size */ bytes = min(bytes, count); + /* + * a non-NULL src_page indicates that we're doing the + * copy via get_user_pages and kmap. + */ src_page = NULL; - seglen = cur_iov->iov_len - iov_base; + seglen = cur_iov->iov_len - iov_offset; seglen = min(seglen, bytes); /* We only need to worry about prefaulting when writes are from @@ -2183,12 +2188,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, break; } - if (unlikely(bytes == 0)) { - status = 0; - copied = 0; - goto zero_length_segment; - } - /* * non-uptodate pages cannot cope with short copies, and we * cannot take a pagefault with the destination page locked. @@ -2210,7 +2209,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * page locked (as explained below). */ copied = filemap_copy_from_user_iovec(src_page, offset, - cur_iov, iov_base, bytes); + cur_iov, iov_offset, bytes); if (unlikely(copied == 0)) { status = -EFAULT; page_cache_release(page); @@ -2257,7 +2256,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, } if (!src_page) { - void *kaddr; /* * Must not enter the pagefault handler here, because * we hold the page lock, so we might recursively @@ -2272,30 +2270,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * really matter. */ - /* pagefault disable */ - inc_preempt_count(); - barrier(); - - kaddr = kmap_atomic(page, KM_USER0); - if (likely(nr_segs == 1)) { - int left; - char __user *buf = cur_iov->iov_base+iov_base; - left = __copy_from_user_inatomic(kaddr+offset, - buf, bytes); - copied = bytes - left; - } else { - copied = - __filemap_copy_from_user_iovec_inatomic(kaddr+ - offset, cur_iov, iov_base, - bytes); - } - kunmap_atomic(kaddr, KM_USER0); + pagefault_disable(); + + copied = filemap_copy_from_user_atomic(page, offset, + cur_iov, nr_segs, iov_offset, bytes); - /* pagefault enable */ - barrier(); - dec_preempt_count(); - barrier(); - preempt_check_resched(); + pagefault_enable(); } else { void *src, *dst; src = kmap_atomic(src_page, KM_USER0); @@ -2313,8 +2293,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, page_cache_release(src_page); continue; } -zero_length_segment: - if (likely(copied >= 0)) { + + if (likely(copied > 0)) { if (!status) status = copied; @@ -2322,16 +2302,14 @@ zero_length_segment: written += status; count -= status; pos += status; - buf += status; - if (unlikely(nr_segs > 1)) { + if (unlikely(nr_segs > 1)) filemap_set_next_iovec(&cur_iov, - &iov_base, status); - if (count) - buf = cur_iov->iov_base + - iov_base; - } else { - iov_base += status; - } + &iov_offset, status); + else + iov_offset += status; + + if (count) + buf = cur_iov->iov_base + iov_offset; } } diff --git a/mm/filemap.h b/mm/filemap.h index c2bff04..efdc5c3 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -53,6 +53,30 @@ filemap_copy_from_user(struct page *page, unsigned long offset, return bytes - left; } +static inline size_t +filemap_copy_from_user_atomic(struct page *page, unsigned long offset, + const struct iovec *iov, unsigned long nr_segs, + size_t iov_offset, size_t bytes) +{ + char *kaddr; + size_t copied; + const char __user *buf = iov->iov_base + iov_offset; + + kaddr = kmap_atomic(page, KM_USER0); + if (likely(nr_segs == 1)) { + int left; + left = __copy_from_user_inatomic_nocache(kaddr+offset, buf, + bytes); + copied = bytes - left; + } else { + copied = __filemap_copy_from_user_iovec_inatomic(kaddr+offset, + iov, iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} + /* * This has the same sideeffects and return value as filemap_copy_from_user(). * The difference is that on a fault we need to memset the remainder of the @@ -87,7 +111,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) const struct iovec *iov = *iovp; size_t base = *basep; - do { + while (bytes) { int copy = min(bytes, iov->iov_len - base); bytes -= copy; @@ -96,7 +120,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) iov++; base = 0; } - } while (bytes); + } *iovp = iov; *basep = base; }