From: Don Zickus <dzickus@redhat.com> Date: Fri, 5 Dec 2008 15:17:49 -0500 Subject: Revert [mm] keep pagefault from happening under pagelock Message-id: 20081205201749.GR12451@drseuss.usersys.redhat.com O-Subject: [PATCH RHEL5.3] Revert mm: keep pagefault from happening under page lock patch Bugzilla: 473150 RH-Acked-by: Josef Bacik <jbacik@redhat.com> https://bugzilla.redhat.com/show_bug.cgi?id=473150 A couple of patches went into RHEL-5.3 to fix a particular deadlock when writing to the fs. Unfortunately further testing has shown a 50% degradation in write performance on some stress tests. Therefore I am backing out these two patches to bring us back to the original behaviour. linux-2.6-mm-filemap-fix-iov_base-data-corruption.patch linux-2.6-mm-keep-pagefault-from-happening-under-page-lock.patch or git commits: c3079412dec7edbe0f3369d0f6cea930f6677f58 84dc8173501fc07f215774d6e99f4b78ec146637 The original deadlock problem only happened under certain test conditions and has never been seen in production. I created the below patch by using 'git revert' on the two above commits. The revert went smoothly, so I don't expect any issues. For those that are familiar with the problem, please review the patch to make sure the revert didn't screw anything up. Cheers, Don diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e4f9e15..e77298b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -196,9 +196,6 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { int ret; - if (unlikely(size == 0)) - return 0; - /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. 
@@ -218,23 +215,19 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return ret; } -static inline int fault_in_pages_readable(const char __user *uaddr, int size) +static inline void fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; int ret; - if (unlikely(size == 0)) - return 0; - ret = __get_user(c, uaddr); if (ret == 0) { const char __user *end = uaddr + size - 1; if (((unsigned long)uaddr & PAGE_MASK) != ((unsigned long)end & PAGE_MASK)) - ret = __get_user(c, end); + __get_user(c, end); } - return ret; } #endif /* _LINUX_PAGEMAP_H */ diff --git a/mm/filemap.c b/mm/filemap.c index aa2bbcf..251e6f1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2137,9 +2137,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, long status = 0; struct page *page; struct page *cached_page = NULL; + size_t bytes; struct pagevec lru_pvec; const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_offset = 0; /* offset in the current iovec */ + size_t iov_base = 0; /* offset in the current iovec */ char __user *buf; pagevec_init(&lru_pvec, 0); @@ -2147,20 +2148,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* * handle partial DIO write. Adjust cur_iov if needed. 
*/ - if (likely(nr_segs == 1)) { - iov_offset += written; - buf = iov->iov_base + iov_offset; - } else { - filemap_set_next_iovec(&cur_iov, &iov_offset, written); - buf = cur_iov->iov_base + iov_offset; + if (likely(nr_segs == 1)) + buf = iov->iov_base + written; + else { + filemap_set_next_iovec(&cur_iov, &iov_base, written); + buf = cur_iov->iov_base + iov_base; } do { - struct page *src_page; unsigned long index; unsigned long offset; - unsigned long seglen; /* Bytes remaining in current iovec */ - unsigned long bytes; /* bytes to write to page */ size_t copied; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ @@ -2170,15 +2167,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* Limit the size of the copy to the caller's write size */ bytes = min(bytes, count); - /* - * a non-NULL src_page indicates that we're doing the - * copy via get_user_pages and kmap. - */ - src_page = NULL; - - seglen = cur_iov->iov_len - iov_offset; - seglen = min(seglen, bytes); - /* We only need to worry about prefaulting when writes are from * user-space. NFSd uses vfs_writev with several non-aligned * segments in the vector, and limiting to one segment a time is @@ -2186,15 +2174,19 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, */ if (!segment_eq(get_fs(), KERNEL_DS)) { /* + * Limit the size of the copy to that of the current + * segment, because fault_in_pages_readable() doesn't + * know how to walk segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); + + /* * Bring in the user page that we will copy from * _first_. Otherwise there's a nasty deadlock on * copying from the same page as we're writing to, * without it being marked up-to-date. 
*/ - if (unlikely(fault_in_pages_readable(buf, seglen))) { - status = -EFAULT; - break; - } + fault_in_pages_readable(buf, bytes); } page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { @@ -2202,51 +2194,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, break; } - /* - * non-uptodate pages cannot cope with short copies, and we - * cannot take a pagefault with the destination page locked. - * So pin the source page to copy it. - */ - if (!PageUptodate(page)) { - unlock_page(page); - - src_page = alloc_page(GFP_KERNEL); - if (!src_page) { - page_cache_release(page); - status = -ENOMEM; - break; - } - - /* - * Cannot get_user_pages with a page locked for the - * same reason as we can't take a page fault with a - * page locked (as explained below). - */ - copied = filemap_copy_from_user_iovec(src_page, offset, - cur_iov, iov_offset, bytes); - if (unlikely(copied == 0)) { - status = -EFAULT; - page_cache_release(page); - page_cache_release(src_page); - break; - } - bytes = copied; - - lock_page(page); - /* - * Can't handle the page going uptodate here, because - * that means we would use non-atomic usercopies, which - * zero out the tail of the page, which can cause - * zeroes to become transiently visible. We could just - * use a non-zeroing copy, but the APIs aren't too - * consistent. 
- */ - if (unlikely(!page->mapping || PageUptodate(page))) { - unlock_page(page); - page_cache_release(page); - page_cache_release(src_page); - continue; - } + if (unlikely(bytes == 0)) { + status = 0; + copied = 0; + goto zero_length_segment; } status = a_ops->prepare_write(file, page, offset, offset+bytes); @@ -2256,8 +2207,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (status != AOP_TRUNCATED_PAGE) unlock_page(page); page_cache_release(page); - if (src_page) - page_cache_release(src_page); if (status == AOP_TRUNCATED_PAGE) continue; /* @@ -2268,47 +2217,20 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, vmtruncate(inode, isize); break; } - - if (!src_page) { - /* - * Must not enter the pagefault handler here, because - * we hold the page lock, so we might recursively - * deadlock on the same lock, or get an ABBA deadlock - * against a different lock, or against the mmap_sem - * (which nests outside the page lock). So increment - * preempt count, and use _atomic usercopies. - * - * The page is uptodate so we are OK to encounter a - * short copy: if unmodified parts of the page are - * marked dirty and written out to disk, it doesn't - * really matter. 
- */ - - pagefault_disable(); - - copied = filemap_copy_from_user_atomic(page, offset, - cur_iov, nr_segs, iov_offset, bytes); - - pagefault_enable(); - } else { - void *src, *dst; - src = kmap_atomic(src_page, KM_USER0); - dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + offset, bytes); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); - copied = bytes; - } + if (likely(nr_segs == 1)) + copied = filemap_copy_from_user(page, offset, + buf, bytes); + else + copied = filemap_copy_from_user_iovec(page, offset, + cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); if (status == AOP_TRUNCATED_PAGE) { page_cache_release(page); - if (src_page) - page_cache_release(src_page); continue; } - - if (likely(copied > 0)) { +zero_length_segment: + if (likely(copied >= 0)) { if (!status) status = copied; @@ -2316,22 +2238,24 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, written += status; count -= status; pos += status; - if (unlikely(nr_segs > 1)) + buf += status; + if (unlikely(nr_segs > 1)) { filemap_set_next_iovec(&cur_iov, - &iov_offset, status); - else - iov_offset += status; - - if (count) - buf = cur_iov->iov_base + iov_offset; + &iov_base, status); + if (count) + buf = cur_iov->iov_base + + iov_base; + } else { + iov_base += status; + } } } - + if (unlikely(copied != bytes)) + if (status >= 0) + status = -EFAULT; unlock_page(page); mark_page_accessed(page); page_cache_release(page); - if (src_page) - page_cache_release(src_page); if (status < 0) break; balance_dirty_pages_ratelimited(mapping); diff --git a/mm/filemap.h b/mm/filemap.h index efdc5c3..c2bff04 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -53,30 +53,6 @@ filemap_copy_from_user(struct page *page, unsigned long offset, return bytes - left; } -static inline size_t -filemap_copy_from_user_atomic(struct page *page, unsigned long offset, - const struct iovec *iov, unsigned long nr_segs, - 
size_t iov_offset, size_t bytes) -{ - char *kaddr; - size_t copied; - const char __user *buf = iov->iov_base + iov_offset; - - kaddr = kmap_atomic(page, KM_USER0); - if (likely(nr_segs == 1)) { - int left; - left = __copy_from_user_inatomic_nocache(kaddr+offset, buf, - bytes); - copied = bytes - left; - } else { - copied = __filemap_copy_from_user_iovec_inatomic(kaddr+offset, - iov, iov_offset, bytes); - } - kunmap_atomic(kaddr, KM_USER0); - - return copied; -} - /* * This has the same sideeffects and return value as filemap_copy_from_user(). * The difference is that on a fault we need to memset the remainder of the @@ -111,7 +87,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) const struct iovec *iov = *iovp; size_t base = *basep; - while (bytes) { + do { int copy = min(bytes, iov->iov_len - base); bytes -= copy; @@ -120,7 +96,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) iov++; base = 0; } - } + } while (bytes); *iovp = iov; *basep = base; }