Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 826

kernel-2.6.18-194.11.1.el5.src.rpm

From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 26 Aug 2008 14:22:15 -0500
Subject: [fs] ext4/vfs/mm: core delalloc support
Message-id: 48B457E7.9070906@redhat.com
O-Subject: [PATCH RHEL5.3] ext4/vfs/mm: core delalloc support.
Bugzilla: 455452
RH-Acked-by: Peter Staubach <staubach@redhat.com>

[Bug 455452] RFE: delalloc helpers for ext4

However, I'd like to be able to provide test ext4 modules with delalloc, for
customers to run on a stock RHEL5.3 kernel, and for that I'll need some core
infrastructure changes.

So it's hard to say that this is well-tested, since delalloc on rhel5 is not working at
this point.

On the other hand, it is minimally invasive, really.

* It adds buffer_delay() tests, which no other fs will set in this path
(xfs does not go via __block_write_full_page).

* It exports a few new symbols

* It adds a couple of new functions (and exports them)

Anyway, patch follows:

Various upstream changes for ext4 delalloc support.

11 Jul 2008
29a814d2ee0e43c2980f33f91c1311ec06c0aa35 vfs: add hooks for ext4's delayed allocation support
f4c0a0fdfae708f7aa438c27a380ed4071294e11 vfs: export filemap_fdatawrite_range()

__mpage_writepage_mpd is a helper/wrapper around __mpage_writepage that takes an
mpage_data struct instead of separate bio/get_block/last_block arguments.

write_cache_pages is a backport of:
0ea971801625184a91a6d80ea85e53875caa0bf5 consolidate generic_writepages and mpage_writepages

It's basically a copy of mpage_writepages, changed to call a caller-supplied writepage callback (with an opaque data pointer) and to take a range_cont argument; I did it this way to
avoid perturbing the existing core paths, but it does result in some duplication.

 fs/buffer.c           |    7 +-
 fs/mpage.c            |  135 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h    |    2
 include/linux/mpage.h |   19 +++++++
 mm/filemap.c          |    3 -

diff --git a/fs/buffer.c b/fs/buffer.c
index 091319e..d61807e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1814,11 +1814,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 			 */
 			clear_buffer_dirty(bh);
 			set_buffer_uptodate(bh);
-		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+			   buffer_dirty(bh)) {
 			WARN_ON(bh->b_size != blocksize);
 			err = get_block(inode, block, bh, 1);
 			if (err)
 				goto recover;
+			clear_buffer_delay(bh);
 			if (buffer_new(bh)) {
 				/* blockdev mappings never come here */
 				clear_buffer_new(bh);
@@ -1907,7 +1909,8 @@ recover:
 	bh = head;
 	/* Recovery: lock and submit the mapped buffers */
 	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
+		if (buffer_mapped(bh) && buffer_dirty(bh) &&
+		    !buffer_delay(bh)) {
 			lock_buffer(bh);
 			mark_buffer_async_write(bh);
 		} else {
diff --git a/fs/mpage.c b/fs/mpage.c
index 1e45982..18df41a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -90,7 +90,7 @@ static int mpage_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
 	return 0;
 }
 
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
 	bio->bi_end_io = mpage_end_io_read;
 	if (rw == WRITE)
@@ -98,6 +98,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 	submit_bio(rw, bio);
 	return NULL;
 }
+EXPORT_SYMBOL(mpage_bio_submit);
 
 static struct bio *
 mpage_alloc(struct block_device *bdev,
@@ -456,7 +457,7 @@ EXPORT_SYMBOL(mpage_readpage);
  * written, so it can intelligently allocate a suitably-sized BIO.  For now,
  * just allocate full-size (16-page) BIOs.
  */
-static struct bio *
+struct bio *
 __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
 	sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc,
 	writepage_t writepage_fn)
@@ -672,6 +673,152 @@ confused:
 out:
 	return bio;
 }
+EXPORT_SYMBOL(__mpage_writepage);
+
+/* Write @page through __mpage_writepage() using the bio state carried in @mpd. */
+int __mpage_writepage_mpd(struct page *page, struct writeback_control *wbc,
+		      struct mpage_data *mpd)
+{
+	int ret = 0;	/* callee may leave *ret unset on success -- TODO confirm */
+	struct bio *bio;
+	int (*writepage)(struct page *page, struct writeback_control *wbc) = NULL;
+
+	/* Offer ->writepage as the fallback only when the caller asked for it. */
+	if (mpd->use_writepage)
+		writepage = page->mapping->a_ops->writepage;
+
+	bio = __mpage_writepage(mpd->bio, page, mpd->get_block,
+				&mpd->last_block_in_bio, &ret, wbc, writepage);
+
+	/* Store even NULL: a stale mpd->bio is presumably a bio already submitted. */
+	mpd->bio = bio;
+
+	return ret;
+}
+EXPORT_SYMBOL(__mpage_writepage_mpd);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @range_cont: if non-zero, save the index to resume from in @wbc->range_start
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * Returns the result of the last @writepage call (AOP_WRITEPAGE_ACTIVATE is reported as 0), or 0 if @writepage was never called.
+ */
+/* range_cont is part of wbc upstream, but we cannot easily add to that - KABI */
+int write_cache_pages(struct address_space *mapping, int range_cont,
+		struct writeback_control *wbc, writepage_data_t writepage,
+		void *data)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int range_whole = 0;
+
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		scanned = 1;
+	}
+retry:
+	while (!done && (index <= end) &&
+			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			PAGECACHE_TAG_DIRTY,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+			lock_page(page);
+
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				unlock_page(page);
+				continue;
+			}
+
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+
+			if (PageWriteback(page) ||
+					!clear_page_dirty_for_io(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			ret = (*writepage)(page, wbc, data);
+
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+				unlock_page(page);
+				ret = 0;
+			}
+			if (ret || (--(wbc->nr_to_write) <= 0))
+				done = 1;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+
+	if (range_cont)	/* widen before shifting: pgoff_t may be only 32 bits */
+		wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT;
+	return ret;
+}
+EXPORT_SYMBOL(write_cache_pages);
 
 /**
  * mpage_writepages - walk the list of dirty pages of the given
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dafa46f..b778a80 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1668,6 +1668,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
 				pgoff_t start, pgoff_t end);
 extern int __filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end, int sync_mode);
+extern int filemap_fdatawrite_range(struct address_space *mapping,
+				loff_t start, loff_t end);
 
 extern long do_fsync(struct file *file, int datasync);
 extern void sync_supers(void);
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 3ca8804..0e57838 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -10,12 +10,26 @@
  * nested includes.  Get it right in the .c file).
  */
 
+struct mpage_data {
+	struct bio *bio;		/* bio handed to, and updated from, __mpage_writepage() */
+	sector_t last_block_in_bio;	/* passed by address as __mpage_writepage()'s last_block_in_bio */
+	get_block_t *get_block;		/* block-mapping callback forwarded to __mpage_writepage() */
+	unsigned use_writepage;		/* non-zero: forward a_ops->writepage as writepage_fn, else NULL */
+};
+
 struct writeback_control;
 typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
+typedef int (writepage_data_t)(struct page *page, struct writeback_control *wbc, void *data);
 
+struct bio *mpage_bio_submit(int rw, struct bio *bio);
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block);
 int mpage_readpage(struct page *page, get_block_t get_block);
+struct bio *__mpage_writepage(struct bio *bio, struct page *page,
+	get_block_t get_block, sector_t *last_block_in_bio, int *ret,
+	struct writeback_control *wbc, writepage_t writepage_fn);
+int __mpage_writepage_mpd(struct page *page, struct writeback_control *wbc,
+                      struct mpage_data *mpd);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
@@ -26,3 +40,8 @@ generic_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	return mpage_writepages(mapping, wbc, NULL);
 }
+
+int
+write_cache_pages(struct address_space *mapping, int range_cont,
+                struct writeback_control *wbc, writepage_data_t writepage,
+                void *data);
diff --git a/mm/filemap.c b/mm/filemap.c
index 299dfce..7489d31 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -214,11 +214,12 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 				loff_t end)
 {
 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
+EXPORT_SYMBOL(filemap_fdatawrite_range);
 
 /**
  * filemap_flush - mostly a non-blocking flush