From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 26 Aug 2008 14:22:15 -0500
Subject: [fs] ext4/vfs/mm: core delalloc support
Message-id: 48B457E7.9070906@redhat.com
O-Subject: [PATCH RHEL5.3] ext4/vfs/mm: core delalloc support.
Bugzilla: 455452
RH-Acked-by: Peter Staubach <staubach@redhat.com>

[Bug 455452] RFE: delalloc helpers for ext4

I'd like to be able to provide test ext4 modules with delalloc for
customers to run on a stock RHEL5.3 kernel, and for that I'll need
some core infrastructure changes.

It's hard to say that this is well-tested, since delalloc on RHEL5 is
not working at this point.  On the other hand, it is minimally
invasive:

* It adds buffer_delay() tests to __block_write_full_page(); no other
  fs will have BH_Delay set in this path (xfs does not go via
  __block_write_full_page).  A sketch of the filesystem side that sets
  BH_Delay follows the diffstat below.
* It exports a few symbols that were previously static.
* It adds a couple of new functions (and exports them).

Anyway, patch follows.

Various upstream changes for ext4 delalloc support, upstream as of
11 Jul 2008:

29a814d2ee0e43c2980f33f91c1311ec06c0aa35 vfs: add hooks for ext4's delayed allocation support
f4c0a0fdfae708f7aa438c27a380ed4071294e11 vfs: export filemap_fdatawrite_range()

__mpage_writepage_mpd is a helper/wrapper around __mpage_writepage; it
takes an mpage_data struct.

write_cache_pages is a backport of:

0ea971801625184a91a6d80ea85e53875caa0bf5 consolidate generic_writepages and mpage_writepages

It's basically a copy of mpage_writepages, changed to take a writepage
callback plus an opaque data pointer, and to take the range_cont flag
as an explicit parameter (upstream it lives in struct
writeback_control, which we cannot grow without breaking KABI).  I did
it this way to avoid perturbing the existing core paths, but it does
result in some duplication.  A sketch of how a delalloc ->writepages
would drive these helpers follows the fs/mpage.c hunks.

 fs/buffer.c           |    7 +-
 fs/mpage.c            |  135 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h    |    2 
 include/linux/mpage.h |   19 +++++++
 mm/filemap.c          |    3 -
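For context before the hunks (this is NOT part of the patch): here is
roughly what the filesystem side looks like at write() time, modeled
loosely on upstream ext4's ext4_da_get_block_prep().  The sketch_*
names and the reservation helper are made up; the point is only the
buffer state that the __block_write_full_page() change below has to
cope with.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* hypothetical in-memory space reservation, stands in for the fs's own */
static int sketch_reserve_blocks(struct inode *inode, int nblocks);

static int sketch_da_get_block_prep(struct inode *inode, sector_t iblock,
                                    struct buffer_head *bh, int create)
{
        /*
         * At write() time no disk block is allocated; only reserve
         * space so writeback can't fail with ENOSPC later.
         */
        if (sketch_reserve_blocks(inode, 1))
                return -ENOSPC;

        /*
         * Map to a dummy block so the page can be dirtied, and set
         * BH_Delay to record that the real allocation hasn't happened
         * yet.  The buffer ends up dirty, mapped, and delayed - which
         * is why the hunk below must call get_block() for
         * mapped-but-delayed buffers, not just unmapped ones, and
         * clear BH_Delay once the block is really allocated.
         */
        map_bh(bh, inode->i_sb, 0);
        set_buffer_new(bh);
        set_buffer_delay(bh);
        return 0;
}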
diff --git a/fs/buffer.c b/fs/buffer.c
index 091319e..d61807e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1814,11 +1814,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
-               } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+               } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+                          buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
+                       clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
@@ -1907,7 +1909,8 @@ recover:
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
-               if (buffer_mapped(bh) && buffer_dirty(bh)) {
+               if (buffer_mapped(bh) && buffer_dirty(bh) &&
+                   !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write(bh);
                } else {
diff --git a/fs/mpage.c b/fs/mpage.c
index 1e45982..18df41a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -90,7 +90,7 @@ static int mpage_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
        return 0;
 }
 
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
        bio->bi_end_io = mpage_end_io_read;
        if (rw == WRITE)
@@ -98,6 +98,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
        submit_bio(rw, bio);
        return NULL;
 }
+EXPORT_SYMBOL(mpage_bio_submit);
 
 static struct bio *
 mpage_alloc(struct block_device *bdev,
@@ -456,7 +457,7 @@ EXPORT_SYMBOL(mpage_readpage);
  * written, so it can intelligently allocate a suitably-sized BIO.  For now,
  * just allocate full-size (16-page) BIOs.
  */
-static struct bio *
+struct bio *
 __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
        sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc,
        writepage_t writepage_fn)
@@ -672,6 +673,152 @@ confused:
 out:
        return bio;
 }
+EXPORT_SYMBOL(__mpage_writepage);
+
+int __mpage_writepage_mpd(struct page *page, struct writeback_control *wbc,
+                         struct mpage_data *mpd)
+{
+       int ret;
+       struct bio *bio;
+       int (*writepage)(struct page *page, struct writeback_control *wbc);
+
+       if (mpd->use_writepage)
+               writepage = page->mapping->a_ops->writepage;
+       else
+               writepage = NULL;
+
+       bio = __mpage_writepage(mpd->bio, page, mpd->get_block,
+                       &mpd->last_block_in_bio, &ret, wbc, writepage);
+
+       if (bio)
+               mpd->bio = bio;
+
+       return ret;
+}
+EXPORT_SYMBOL(__mpage_writepage_mpd);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @range_cont: same as @wbc->range_cont upstream ...
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+
+/* range_cont is part of wbc upstream, but we cannot easily add to that - KABI */
+int
+write_cache_pages(struct address_space *mapping, int range_cont,
+               struct writeback_control *wbc, writepage_data_t writepage,
+               void *data)
+{
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       int ret = 0;
+       int done = 0;
+       struct pagevec pvec;
+       int nr_pages;
+       pgoff_t index;
+       pgoff_t end;            /* Inclusive */
+       int scanned = 0;
+       int range_whole = 0;
+
+       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+               wbc->encountered_congestion = 1;
+               return 0;
+       }
+
+       pagevec_init(&pvec, 0);
+       if (wbc->range_cyclic) {
+               index = mapping->writeback_index; /* Start from prev offset */
+               end = -1;
+       } else {
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                       range_whole = 1;
+               scanned = 1;
+       }
+retry:
+       while (!done && (index <= end) &&
+              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                       PAGECACHE_TAG_DIRTY,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+               unsigned i;
+
+               scanned = 1;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       /*
+                        * At this point we hold neither mapping->tree_lock nor
+                        * lock on the page itself: the page may be truncated or
+                        * invalidated (changing page->mapping to NULL), or even
+                        * swizzled back from swapper_space to tmpfs file
+                        * mapping
+                        */
+                       lock_page(page);
+
+                       if (unlikely(page->mapping != mapping)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (!wbc->range_cyclic && page->index > end) {
+                               done = 1;
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (wbc->sync_mode != WB_SYNC_NONE)
+                               wait_on_page_writeback(page);
+
+                       if (PageWriteback(page) ||
+                           !clear_page_dirty_for_io(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       ret = (*writepage)(page, wbc, data);
+
+                       if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+                               unlock_page(page);
+                               ret = 0;
+                       }
+                       if (ret || (--(wbc->nr_to_write) <= 0))
+                               done = 1;
+                       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                               wbc->encountered_congestion = 1;
+                               done = 1;
+                       }
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+       if (!scanned && !done) {
+               /*
+                * We hit the last page and there is more work to be done: wrap
+                * back to the start of the file
+                */
+               scanned = 1;
+               index = 0;
+               goto retry;
+       }
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               mapping->writeback_index = index;
+
+       if (range_cont)
+               wbc->range_start = index << PAGE_CACHE_SHIFT;
+       return ret;
+}
+EXPORT_SYMBOL(write_cache_pages);
 
 /**
  * mpage_writepages - walk the list of dirty pages of the given
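To show how the fs/mpage.c exports are meant to fit together (again,
NOT part of the patch): a hypothetical delalloc ->writepages, shaped
like what mpage_writepages() became upstream after 0ea97180.  The
sketch_* names are illustrative; sketch_get_block stands in for a real
allocating get_block.

#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/writeback.h>

/* hypothetical: allocates blocks for real at writeback time */
static int sketch_get_block(struct inode *inode, sector_t iblock,
                            struct buffer_head *bh, int create);

/* writepage_data_t callback: unwrap the opaque pointer and forward */
static int sketch_writepage(struct page *page,
                            struct writeback_control *wbc, void *data)
{
        return __mpage_writepage_mpd(page, wbc, data);
}

static int sketch_da_writepages(struct address_space *mapping,
                                struct writeback_control *wbc)
{
        struct mpage_data mpd = {
                .bio = NULL,
                .last_block_in_bio = 0,
                .get_block = sketch_get_block,
                .use_writepage = 1,     /* fall back to ->writepage if confused */
        };
        int ret;

        /* range_cont == 0: plain writeback, no continued-range bookkeeping */
        ret = write_cache_pages(mapping, 0, wbc, sketch_writepage, &mpd);

        /* __mpage_writepage batches pages into a bio; flush the last one */
        if (mpd.bio)
                mpage_bio_submit(WRITE, mpd.bio);
        return ret;
}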
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dafa46f..b778a80 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1668,6 +1668,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
                                pgoff_t start, pgoff_t end);
 extern int __filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end, int sync_mode);
+extern int filemap_fdatawrite_range(struct address_space *mapping,
+                               loff_t start, loff_t end);
 extern long do_fsync(struct file *file, int datasync);
 extern void sync_supers(void);
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 3ca8804..0e57838 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -10,12 +10,26 @@
  * nested includes.  Get it right in the .c file).
  */
 
+struct mpage_data {
+       struct bio *bio;
+       sector_t last_block_in_bio;
+       get_block_t *get_block;
+       unsigned use_writepage;
+};
+
 struct writeback_control;
 typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
+typedef int (writepage_data_t)(struct page *page, struct writeback_control *wbc, void *data);
 
+struct bio *mpage_bio_submit(int rw, struct bio *bio);
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
                                unsigned nr_pages, get_block_t get_block);
 int mpage_readpage(struct page *page, get_block_t get_block);
+struct bio *__mpage_writepage(struct bio *bio, struct page *page,
+       get_block_t get_block, sector_t *last_block_in_bio, int *ret,
+       struct writeback_control *wbc, writepage_t writepage_fn);
+int __mpage_writepage_mpd(struct page *page, struct writeback_control *wbc,
+                         struct mpage_data *mpd);
 int mpage_writepages(struct address_space *mapping,
                struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
@@ -26,3 +40,8 @@ generic_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
        return mpage_writepages(mapping, wbc, NULL);
 }
+
+int
+write_cache_pages(struct address_space *mapping, int range_cont,
+               struct writeback_control *wbc, writepage_data_t writepage,
+               void *data);
diff --git a/mm/filemap.c b/mm/filemap.c
index 299dfce..7489d31 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -214,11 +214,12 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
        loff_t end)
 {
        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
+EXPORT_SYMBOL(filemap_fdatawrite_range);
 
 /**
  * filemap_flush - mostly a non-blocking flush
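And one last illustrative (made-up) example of what the mm/filemap.c
export enables: with filemap_fdatawrite_range() visible to modules, a
filesystem can start WB_SYNC_ALL writeback on just a byte range, which
with delalloc forces block allocation for exactly that range.

#include <linux/fs.h>

/* starts I/O on dirty pages in [pos, pos + len); does not wait for it */
static int sketch_flush_range(struct inode *inode, loff_t pos, loff_t len)
{
        /* the end argument is an inclusive byte offset */
        return filemap_fdatawrite_range(inode->i_mapping, pos,
                                        pos + len - 1);
}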