From: Bob Peterson <rpeterso@redhat.com> Date: Mon, 15 Sep 2008 19:29:44 -0400 Subject: [gfs2] glock deadlock in page fault path Message-id: 1389207955.365281221521384635.JavaMail.root@zmail02.collab.prod.int.phx2.redhat.com O-Subject: Re: [RHEL5.3 Patch] bz 458684 - GFS2: glock deadlock in page fault path Bugzilla: 458684 RH-Acked-by: Steven Whitehouse <swhiteho@redhat.com> Hi, Here is a respin of my patch for GFS2 bug 458684. The only difference between this patch and my previous is that function upstream function iov_iter_copy_from_user_atomic was moved into mm/filemap.c (where you'll find it in the upstream kernel), and the iov_iter structure declaration was moved to fs.h. I was hoping to contain all the changes to GFS2 to minimize risk, but I can't seem to find any other way to resolve the dependency issue Don pointed out. At least this minimizes the impact since the new function is not called outside of GFS2. I compiled this for x86_64 and i686 and re-ran the cluster "coherency" test that gives it a good workout on x86_64. All tests were successful. Note that this will require a change to the whitelist. Regards, Bob Peterson Red Hat Clustering & GFS -- diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 5c8bac7..db147e3 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -36,6 +36,27 @@ #include "util.h" #include "glops.h" +static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; + +static void fastcall gfs2_lru_cache_add(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add(pvec); + put_cpu_var(lru_add_pvecs); +} + +static int gfs2_add_to_page_cache_lru(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + int ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) + gfs2_lru_cache_add(page); + return ret; +} static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, unsigned int from, unsigned int to) @@ -664,6 +685,446 @@ out_uninit: return ret; } +/* + * Find or create a page at the given pagecache position. Return the locked + * page. This function is specifically for buffered writes. + */ +static struct page *__grab_cache_page(struct address_space *mapping, + unsigned long index) +{ + int status; + struct page *page; +repeat: + page = find_lock_page(mapping, index); + if (likely(page)) + return page; + + page = page_cache_alloc(mapping); + if (!page) + return NULL; + status = gfs2_add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(status)) { + page_cache_release(page); + if (status == -EEXIST) + goto repeat; + return NULL; + } + return page; +} + +/** + * gfs2_write_begin - Begin to write to a file + * @file: The file to write to + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) + * + * Returns: errno + */ + +int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + unsigned int data_blocks, ind_blocks, rblocks; + int alloc_required; + int error = 0; + struct gfs2_alloc *al; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct page *page; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); + error = gfs2_glock_nq_atime(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); + if (error) + goto out_unlock; + + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc_put; + + error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + if (error) + goto out_qunlock; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = -ENOMEM; + page = __grab_cache_page(mapping, index); + *pagep = page; + if (unlikely(!page)) + goto out_endtrans; + + if (gfs2_is_stuffed(ip)) { + error = 0; + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) { + error = stuffed_readpage(ip, page); + } + goto out; + } + +prepare_write: + error = block_prepare_write(page, from, to, gfs2_block_map); +out: + if (error == 0) + return 0; + + page_cache_release(page); + if (pos + len > ip->i_inode.i_size) + vmtruncate(&ip->i_inode, ip->i_inode.i_size); +out_endtrans: + gfs2_trans_end(sdp); +out_trans_fail: + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +static void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + u64 fs_total, new_free; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + + spin_lock(&sdp->sd_statfs_spin); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); +} + +/** + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. + * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + + brelse(dibh); + gfs2_trans_end(sdp); + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +static void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + zero_user(page, start, size); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + clear_buffer_new(bh); + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +static int block_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start; + + start = pos & (PAGE_CACHE_SIZE - 1); + + if (unlikely(copied < len)) { + /* + * The buffers that were written will now be uptodate, so we + * don't have to worry about a readpage reading them and + * overwriting a partial write. However if we have encountered + * a short write and only partially written into a buffer, it + * will not be marked uptodate, so a readpage might come in and + * destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a + * non uptodate page as a zero-length write, and force the + * caller to redo the whole thing. + */ + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start+copied, start+len); + } + flush_dcache_page(page); + + /* This could be a short (even 0-length) commit */ + __block_commit_write(inode, page, start, start+copied); + + return copied; +} + +static int generic_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int i_size_changed = 0; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} + +/** + * gfs2_write_end + * @file: The file to write to + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. + * + * Returns: errno + */ + +int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_dinode *di; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; + + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + + if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { + di = (struct gfs2_dinode *)dibh->b_data; + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + + brelse(dibh); + gfs2_trans_end(sdp); +failed: + if (al) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return ret; +} + /** * gfs2_prepare_write - Prepare to write a page to a file * @file: The file to write to @@ -719,31 +1180,6 @@ out: } /** - * adjust_fs_space - Adjusts the free space available due to gfs2_grow - * @inode: the rindex inode - */ -static void adjust_fs_space(struct inode *inode) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - u64 fs_total, new_free; - - /* Total up the file system space, according to the latest rindex. */ - fs_total = gfs2_ri_total(sdp); - - spin_lock(&sdp->sd_statfs_spin); - if (fs_total > (m_sc->sc_total + l_sc->sc_total)) - new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); - else - new_free = 0; - spin_unlock(&sdp->sd_statfs_spin); - fs_warn(sdp, "File system extended by %llu blocks.\n", - (unsigned long long)new_free); - gfs2_statfs_change(sdp, new_free, new_free, 0); -} - -/** * gfs2_commit_write - Commit write to a file * @file: The file to write to * @page: The page containing the data @@ -943,7 +1379,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) if (gfs2_is_stuffed(ip)) return 0; - if (offset > i_size_read(&ip->i_inode)) + if (offset >= i_size_read(&ip->i_inode)) return 0; return 1; } diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h index 2452e15..2b96b0c 100644 --- a/fs/gfs2/ops_address.h +++ b/fs/gfs2/ops_address.h @@ -16,5 +16,11 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); extern void gfs2_set_aops(struct inode *inode); +extern int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); #endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 98d96a5..3d7b539 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -23,7 +23,9 @@ #include <linux/crc32.h> #include <linux/lm_interface.h> #include <linux/writeback.h> +#include <linux/uaccess.h> #include <asm/uaccess.h> +#include <linux/mpage.h> #include "gfs2.h" #include "incore.h" @@ -44,6 +46,24 @@ #include "eaops.h" #include "ops_address.h" +static void iov_iter_advance(struct iov_iter *i, size_t bytes); +static inline void iov_iter_init(struct iov_iter *i, + const struct iovec *iov, unsigned long nr_segs, + size_t count, size_t written) +{ + i->iov = iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count + written; + + iov_iter_advance(i, written); +} + +static inline size_t iov_iter_count(struct iov_iter *i) +{ + return i->count; +} + /* * Most fields left uninitialised to catch anybody who tries to * use them. f_flags set to prevent file_accessed() from touching @@ -651,14 +671,466 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) return do_flock(file, cmd, fl); } +/** + * generic_file functions backported from upstream filemap.c: + * In many cases I needed to change generic_* to gfs2_* + */ + +static unsigned long iov_iter_single_seg_count(struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} + +static void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + } +} + +static inline int gfs2_fault_in_pages_readable(const char __user *uaddr, + int size) +{ + volatile char c; + int ret; + + if (unlikely(size == 0)) + return 0; + + ret = __get_user(c, uaddr); + if (ret == 0) { + const char __user *end = uaddr + size - 1; + + if (((unsigned long)uaddr & PAGE_MASK) != + ((unsigned long)end & PAGE_MASK)) + ret = __get_user(c, end); + } + return ret; +} +static int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return gfs2_fault_in_pages_readable(buf, bytes); +} + +static ssize_t gfs2_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; + + /* + * Copies from kernel address space cannot fail (NFSD is a big user). + */ + /*if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE;*/ + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = gfs2_write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + /* pagefault disable */ + inc_preempt_count(); + barrier(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + /* pagefault enable */ + barrier(); + dec_preempt_count(); + barrier(); + preempt_check_resched(); + + flush_dcache_page(page); + + status = gfs2_write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +static ssize_t +gfs2_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + status = gfs2_perform_write(file, &i, pos); + + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (!a_ops->writepage || !is_sync_kiocb(iocb)) + status = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + } + } + + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait(mapping); + + return written ? written : status; +} + +/* + * Performs necessary checks before doing a write + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @count: number of bytes to write + * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE + * + * Adjust number of segments and amount of bytes to write (nr_segs should be + * properly initialized first). Returns appropriate error code that caller + * should return or zero in case that write should be allowed. + */ +static int generic_segment_checks(const struct iovec *iov, + unsigned long *nr_segs, size_t *count, int access_flags) +{ + unsigned long seg; + size_t cnt = 0; + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(access_flags, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; + } + *count = cnt; + return 0; +} + +static ssize_t +__gfs2_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + struct inode *inode = mapping->host; + loff_t pos; + ssize_t written; + ssize_t err; + + ocount = 0; + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + return err; + + count = ocount; + pos = *ppos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = remove_suid(file->f_dentry); + if (err) + goto out; + + file_update_time(file); + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (unlikely(file->f_flags & O_DIRECT)) { + loff_t endbyte; + ssize_t written_buffered; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, + ppos, count, ocount); + if (written < 0 || written == count) + goto out; + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + pos += written; + count -= written; + written_buffered = gfs2_file_buffered_write(iocb, iov, + nr_segs, pos, + ppos, count, + written); + /* + * If gfs2_file_buffered_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = pos + written_buffered - written - 1; + err = do_sync_file_range(file, pos, endbyte, + SYNC_FILE_RANGE_WAIT_BEFORE| + SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + if (err == 0) { + written = written_buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = gfs2_file_buffered_write(iocb, iov, nr_segs, + pos, ppos, count, written); + } +out: + current->backing_dev_info = NULL; + return written ? written : err; +} + +static ssize_t +gfs2_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + loff_t pos = *ppos; + + ret = __gfs2_file_aio_write_nolock(iocb, iov, nr_segs, ppos); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + err = sync_page_range_nolock(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} + +static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const char __user *buf, + size_t count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + + BUG_ON(iocb->ki_pos != pos); + + mutex_lock(&inode->i_mutex); + ret = __gfs2_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} + +static ssize_t gfs2_file_write_nolock(struct file *file, + const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + ret = gfs2_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + +static ssize_t gfs2_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + + mutex_lock(&inode->i_mutex); + ret = gfs2_file_write_nolock(file, &local_iov, 1, ppos); + mutex_unlock(&inode->i_mutex); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (err < 0) + ret = err; + } + return ret; +} + +static ssize_t gfs2_file_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + mutex_lock(&inode->i_mutex); + ret = gfs2_file_write_nolock(file, iov, nr_segs, ppos); + mutex_unlock(&inode->i_mutex); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (err < 0) + ret = err; + } + return ret; +} + const struct file_operations_ext gfs2_file_fops = { .f_op_orig.llseek = gfs2_llseek, .f_op_orig.read = generic_file_read, .f_op_orig.readv = generic_file_readv, .f_op_orig.aio_read = generic_file_aio_read, - .f_op_orig.write = generic_file_write, - .f_op_orig.writev = generic_file_writev, - .f_op_orig.aio_write = generic_file_aio_write, + .f_op_orig.write = gfs2_file_write, + .f_op_orig.writev = gfs2_file_writev, + .f_op_orig.aio_write = gfs2_file_aio_write, .f_op_orig.unlocked_ioctl = gfs2_ioctl, .f_op_orig.mmap = gfs2_mmap, .f_op_orig.open = gfs2_open, @@ -687,9 +1159,9 @@ const struct file_operations_ext gfs2_file_fops_nolock = { .f_op_orig.read = generic_file_read, .f_op_orig.readv = generic_file_readv, .f_op_orig.aio_read = generic_file_aio_read, - .f_op_orig.write = generic_file_write, - .f_op_orig.writev = generic_file_writev, - .f_op_orig.aio_write = generic_file_aio_write, + .f_op_orig.write = gfs2_file_write, + .f_op_orig.writev = gfs2_file_writev, + .f_op_orig.aio_write = gfs2_file_aio_write, .f_op_orig.unlocked_ioctl = gfs2_ioctl, .f_op_orig.mmap = gfs2_mmap, .f_op_orig.open = gfs2_open, diff --git a/include/linux/fs.h b/include/linux/fs.h index b778a80..f08cec8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -404,6 +404,15 @@ struct page; struct address_space; struct writeback_control; +struct iov_iter { + const struct iovec *iov; + unsigned long nr_segs; + size_t iov_offset; + size_t count; +}; + +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes); struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); diff --git a/mm/filemap.c b/mm/filemap.c index 7489d31..3eac292 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1965,6 +1965,34 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, } /* + * Copy as much as we can into the page and return the number of bytes which + * were sucessfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, + buf, bytes); + copied = bytes - left; + } else { + copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write.