From: Eric Sandeen <sandeen@redhat.com> Date: Thu, 21 Aug 2008 15:42:33 -0500 Subject: [fs] ext4: revert delalloc upstream mods Message-id: 48ADD339.2080909@redhat.com O-Subject: [RHEL5.3 Patch 3/6] ext4: revert delalloc upstream mods Bugzilla: 458718 [Bug 458718] FEAT: RHEL 5.3 ext4 tech preview Revert several upstream ext4/jbd2 patches from 2.6.27-rc3. Some are simply for backporting, but more interestingly & importantly, remove (for now) patches related to delayed allocation and associated rewrites. Delayed allocation is still somewhat in flux upstream, and it also depends on a series of upstream changes that ultimately depend on the rewritten aops (write_begin/write_end) and a backport of all this is not yet ready. For now, we will run without delalloc. Reverts: 3295f0ef9ff048a4619ede597ad9ec9cab725654 lockdep: rename map_[acquire|release]() => lock_map_[acquire|release]() 4f3e7524b2e703d9f8b02ac338153a53dd7ede66 lockdep: map_acquire 77e69dac3cefacee939cb107ae9cd520a62338e0 [PATCH] fix races and leaks in vfs_quota_on() users 8ab22b9abb5c55413802e4adc9aa6223324547c3 vfs: pagecache usage optimization for pagesize!=blocksize e6305c43eda10ebfd2ad9e35d6e172ccc7bb3695 [PATCH] sanitize ->permission() prototype 51cc50685a4275c6a02653670af9f108a64e01cf SL*B: drop kmem cache argument from constructor 12219aea6b944e36795267be31d43f9c484841be ext4: Cleanup the block reservation code path d5a0d4f732af3438e592efab4cb80076d1dd81b5 ext4: fix ext4_da_write_begin error path dd919b9822c5fd9fd72f95a602440130297c3857 ext4: Enable delalloc by default. 
3e3398a08d6e516675d5af853d625dc7dd90eab1 ext4: delayed allocation i_blocks fix for stat 632eaeab1feb5d78c1e2bfb1d2dfc0ebb8ac187f ext4: fix delalloc i_disksize early update issue f0e6c98593eb8a77edb7dd0edb22bb9f9368c567 ext4: Handle page without buffers in ext4_*_writepage() cd1aac32923a9c8adcc0ae85e33c1ca0c5855838 ext4: Add ordered mode support for delalloc e9e34f4e8f42177c66754fec1edfd35e70c18f99 jbd2: don't abort if flushing file data failed 87c89c232c8f7b3820c33c3b9bc803e9358027da jbd2: Remove data=ordered mode support using jbd buffer heads c851ed540173736e60d48b53b91a16ea5c903896 jbd2: Implement data=ordered mode handling via inodes 61628a3f3a37af2bf25daf8e26fd6b76a78c4f76 ext4: Invert lock ordering of page_lock and transaction start in delalloc d2a1763791a634e315ec926b62829c1e88842c86 ext4: delayed allocation ENOSPC handling e8ced39d5e8911c662d4d69a342b9d053eaaac4e percpu_counter: new function percpu_counter_sum_and_set 64769240bd07f446f83660bb143bb609d8ab4910 ext4: Add delayed allocation support in data=writeback mode 678aaf481496b01473b778685eca231d6784098b ext4: Use new framework for data=ordered mode in JBD2 cf108bca465dde0c015f32dd453b99457d31c7c7 ext4: Invert the locking order of page_lock and transaction start Adds: cbe5f466f6995e10a10c7ae66d6dc8608f08a6b8 jbd: don't abort if flushing file data failed diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 694ed6f..a234b54 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -299,7 +299,7 @@ ext4_check_acl(struct inode *inode, int mask) } int -ext4_permission(struct inode *inode, int mask) +ext4_permission(struct inode *inode, int mask, struct nameidata *nd) { return generic_permission(inode, mask, ext4_check_acl); } diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index cd2b855..26a5c1a 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -58,7 +58,7 @@ static inline int ext4_acl_count(size_t size) #define EXT4_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext4_permission (struct inode *, int); +extern int 
ext4_permission (struct inode *, int, struct nameidata *); extern int ext4_acl_chmod (struct inode *); extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1ae5004..7185c68 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1624,7 +1624,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, #ifdef CONFIG_SMP if (free_blocks - root_blocks < FBC_BATCH) free_blocks = - percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); + percpu_counter_sum_positive(&sbi->s_freeblocks_counter); #endif if (free_blocks - root_blocks < nblocks) return free_blocks - root_blocks; @@ -1704,12 +1704,7 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, } sbi = EXT4_SB(sb); - if (!EXT4_I(inode)->i_delalloc_reserved_flag) { - /* - * With delalloc we already reserved the blocks - */ - *count = ext4_has_free_blocks(sbi, *count); - } + *count = ext4_has_free_blocks(sbi, *count); if (*count == 0) { *errp = -ENOSPC; return 0; /*return with ENOSPC error */ @@ -1910,8 +1905,7 @@ allocated: le16_add_cpu(&gdp->bg_free_blocks_count, -num); gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); spin_unlock(sb_bgl_lock(sbi, group_no)); - if (!EXT4_I(inode)->i_delalloc_reserved_flag) - percpu_counter_sub(&sbi->s_freeblocks_counter, num); + percpu_counter_sub(&sbi->s_freeblocks_counter, num); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group_no); @@ -1985,49 +1979,40 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, } /* - * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: total number of blocks need * @errp: error code * - * Return 1st allocated block numberon success, *count stores total account - * error 
stores in errp pointer + * Return allocated block number on success */ -ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp) { - ext4_fsblk_t ret; - ret = do_blk_alloc(handle, inode, 0, goal, - count, errp, EXT4_META_BLOCK); - /* - * Account for the allocated meta blocks - */ - if (!(*errp)) { - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += *count; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - } - return ret; + unsigned long count = 1; + return do_blk_alloc(handle, inode, 0, goal, + &count, errp, EXT4_META_BLOCK); } /* - * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) + * @count: total number of blocks need * @errp: error code * - * Return allocated block number on success + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer */ -ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp) { - unsigned long count = 1; - return ext4_new_meta_blocks(handle, inode, goal, &count, errp); + return do_blk_alloc(handle, inode, 0, goal, + count, errp, EXT4_META_BLOCK); } /* diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d3d23d7..5ed5108 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -129,8 +129,7 @@ static int ext4_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, - 0, 0, 0); + err = ext4_get_blocks_wrap(NULL, inode, 
blk, 1, &map_bh, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6c7924d..98760d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -74,9 +74,6 @@ #define EXT4_MB_HINT_GOAL_ONLY 256 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 512 -/* blocks already pre-reserved by delayed allocation */ -#define EXT4_MB_DELALLOC_RESERVED 1024 - struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -539,7 +536,6 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ -#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt @@ -1058,8 +1054,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode (struct inode *, int); extern int ext4_setattr (struct dentry *, struct iattr *); -extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); extern void ext4_delete_inode (struct inode *); extern int ext4_sync_inode (handle_t *, struct inode *); extern void ext4_discard_reservation (struct inode *); @@ -1072,7 +1066,7 @@ extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_block_truncate_page(handle_t *handle, +extern int ext4_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); @@ -1231,7 +1225,7 @@ 
extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize); -extern void ext4_ext_truncate(struct inode *); +extern void ext4_ext_truncate(struct inode *, struct page *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, @@ -1239,7 +1233,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, - int extend_disksize, int flag); + int extend_disksize); #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 6c166c0..75333b5 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -212,7 +212,6 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index ef7409f..abf2744 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -150,7 +150,6 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; - struct jbd2_inode jinode; unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; @@ -163,13 +162,6 @@ struct ext4_inode_info { /* mballoc */ struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; - - /* allocation reservation info for delalloc */ - unsigned long i_reserved_data_blocks; - unsigned long i_reserved_meta_blocks; - unsigned long i_allocated_meta_blocks; - unsigned 
short i_delalloc_reserved_flag; - spinlock_t i_block_reservation_lock; }; #endif /* _EXT4_I */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index eb8bc3a..d0aa9ee 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -154,6 +154,8 @@ int __ext4_journal_dirty_metadata(const char *where, #define ext4_journal_forget(handle, bh) \ __ext4_journal_forget(__func__, (handle), (bh)) +int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); @@ -190,11 +192,6 @@ static inline int ext4_journal_force_commit(journal_t *journal) return jbd2_journal_force_commit(journal); } -static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) -{ - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); -} - /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 612c3d2..352bfd7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -248,36 +248,6 @@ static int ext4_ext_space_root_idx(struct inode *inode) return size; } -/* - * Calculate the number of metadata blocks needed - * to allocate @blocks - * Worse case is one block per extent - */ -int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) -{ - int lcap, icap, rcap, leafs, idxs, num; - int newextents = blocks; - - rcap = ext4_ext_space_root_idx(inode); - lcap = ext4_ext_space_block(inode); - icap = ext4_ext_space_block_idx(inode); - - /* number of new leaf blocks needed */ - num = leafs = (newextents + lcap - 1) / lcap; - - /* - * Worse case, we need separate index block(s) - * to link all new leaf blocks - */ - idxs = (leafs + icap - 1) / icap; - do { - num += idxs; - idxs = (idxs + icap - 1) / icap; - } while (idxs > rcap); - - return num; -} - static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -2574,7 +2544,6 @@ int ext4_ext_get_blocks(handle_t 
*handle, struct inode *inode, int err = 0, depth, ret; unsigned long allocated = 0; struct ext4_allocation_request ar; - loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2765,13 +2734,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (extend_disksize) { - disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = disksize; - } + if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = inode->i_size; set_buffer_new(bh_result); @@ -2794,7 +2758,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(struct inode *inode) +void ext4_ext_truncate(struct inode * inode, struct page *page) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2807,11 +2771,18 @@ void ext4_ext_truncate(struct inode *inode) */ err = ext4_writepage_trans_blocks(inode) + 3; handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } return; + } - if (inode->i_size & (sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, mapping, inode->i_size); + if (page) + ext4_block_truncate_page(handle, page, mapping, inode->i_size); if (ext4_orphan_add(handle, inode)) goto out_stop; @@ -2955,7 +2926,7 @@ retry: } ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + EXT4_CREATE_UNINITIALIZED_EXT, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 430eb79..b9510ba 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -161,7 +161,6 @@ const struct 
file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .truncate = ext4_truncate, .setattr = ext4_setattr, - .getattr = ext4_getattr, #ifdef CONFIG_EXT4DEV_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59fbbe8..edd21bd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -32,23 +32,12 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> -#include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" - -static inline int ext4_begin_ordered_truncate(struct inode *inode, - loff_t new_size) -{ - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); -} - -static void ext4_invalidatepage(struct page *page, unsigned long offset); /* * Test whether an inode is a fast symlink. @@ -193,8 +182,6 @@ void ext4_delete_inode (struct inode * inode) handle_t *handle; int err; - if (ext4_should_order_data(inode)) - ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -875,7 +862,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; - loff_t disksize; J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); @@ -951,13 +937,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize) { - disksize = ((loff_t) iblock + count) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > ei->i_disksize) - ei->i_disksize = disksize; - } + if (!err && extend_disksize && inode->i_size > ei->i_disksize) + ei->i_disksize = inode->i_size; if (err) goto cleanup; @@ -980,67 
+961,6 @@ out: return err; } -/* - * Calculate the number of metadata blocks need to reserve - * to allocate @blocks for non extent file based file - */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) -{ - int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ind_blks, dind_blks, tind_blks; - - /* number of new indirect blocks needed */ - ind_blks = (blocks + icap - 1) / icap; - - dind_blks = (ind_blks + icap - 1) / icap; - - tind_blks = 1; - - return ind_blks + dind_blks + tind_blks; -} - -/* - * Calculate the number of metadata blocks need to reserve - * to allocate given number of blocks - */ -static int ext4_calc_metadata_amount(struct inode *inode, int blocks) -{ - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_calc_metadata_amount(inode, blocks); - - return ext4_indirect_calc_metadata_amount(inode, blocks); -} - -static void ext4_da_update_reserve_space(struct inode *inode, int used) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - used; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - - /* update fs free blocks counter for truncate case */ - percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); - - /* update per-inode reservations */ - BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= used; - - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - EXT4_I(inode)->i_reserved_meta_blocks = mdb; - EXT4_I(inode)->i_allocated_meta_blocks = 0; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} 
- /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* @@ -1077,7 +997,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) + int create, int extend_disksize) { int retval; @@ -1118,15 +1038,6 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); - - /* - * if the caller is from delayed allocation writeout path - * we have already reserved fs blocks for allocation - * let the underlying get_block() function know to - * avoid double accounting - */ - if (flag) - EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -1148,18 +1059,6 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, ~EXT4_EXT_MIGRATE; } } - - if (flag) { - EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks - * after successful block allocation - * which were deferred till now - */ - if ((retval > 0) && buffer_delay(bh)) - ext4_da_update_reserve_space(inode, retval); - } - up_write((&EXT4_I(inode)->i_data_sem)); return retval; } @@ -1185,7 +1084,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0, 0); + max_blocks, bh_result, create, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1211,7 +1110,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1, 0); + &dummy, create, 1); /* * ext4_get_blocks_handle() returns number 
of blocks * mapped. 0 in case of a HOLE. @@ -1367,20 +1266,19 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, to = from + len; retry: + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { + unlock_page(page); + page_cache_release(page); ret = PTR_ERR(handle); goto out; } - page = __grab_cache_page(mapping, index); - if (!page) { - ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; - } - *pagep = page; - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1390,8 +1288,8 @@ retry: } if (ret) { - unlock_page(page); ext4_journal_stop(handle); + unlock_page(page); page_cache_release(page); } @@ -1401,6 +1299,15 @@ out: return ret; } +int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + int err = jbd2_journal_dirty_data(handle, bh); + if (err) + ext4_journal_abort_handle(__func__, __func__, + bh, handle, err); + return err; +} + /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1411,6 +1318,29 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* + * Generic write_end handler for ordered and writeback ext4 journal modes. + * We can't use generic_write_end, because that unlocks the page and we need to + * unlock the page after ext4_journal_stop, but ext4_journal_stop must run + * after block_write_end. 
+ */ +static int ext4_generic_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file->f_mapping->host; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + mark_inode_dirty(inode); + } + + return copied; +} + +/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * @@ -1423,10 +1353,11 @@ static int ext4_ordered_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; int ret = 0, ret2; - ret = ext4_jbd2_file_inode(handle, inode); + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, ext4_journal_dirty_data); if (ret == 0) { /* @@ -1439,7 +1370,7 @@ static int ext4_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1448,6 +1379,8 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; + unlock_page(page); + page_cache_release(page); return ret ? 
ret : copied; } @@ -1458,7 +1391,7 @@ static int ext4_writeback_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; int ret = 0, ret2; loff_t new_i_size; @@ -1466,7 +1399,7 @@ static int ext4_writeback_write_end(struct file *file, if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1475,6 +1408,8 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; + unlock_page(page); + page_cache_release(page); return ret ? ret : copied; } @@ -1513,934 +1448,15 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } - unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; + unlock_page(page); page_cache_release(page); return ret ? 
ret : copied; } -static int ext4_da_reserve_space(struct inode *inode, int nrblocks) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned long md_needed, mdblocks, total = 0; - - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; - mdblocks = ext4_calc_metadata_amount(inode, total); - BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); - - md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; - total = md_needed + nrblocks; - - if (ext4_has_free_blocks(sbi, total) < total) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return -ENOSPC; - } - /* reduce fs free blocks counter */ - percpu_counter_sub(&sbi->s_freeblocks_counter, total); - - EXT4_I(inode)->i_reserved_data_blocks += nrblocks; - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; - - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return 0; /* success */ -} - -static void ext4_da_release_space(struct inode *inode, int to_free) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free, release; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - to_free; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - release = to_free + mdb_free; - - /* update fs free blocks counter for truncate case */ - percpu_counter_add(&sbi->s_freeblocks_counter, release); - - /* update per-inode reservations */ - BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= to_free; - - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 
- EXT4_I(inode)->i_reserved_meta_blocks = mdb; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} - -static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) -{ - int to_release = 0; - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - if ((offset <= curr_off) && (buffer_delay(bh))) { - to_release++; - clear_buffer_delay(bh); - } - curr_off = next_off; - } while ((bh = bh->b_this_page) != head); - ext4_da_release_space(page->mapping->host, to_release); -} - -/* - * Delayed allocation stuff - */ - -struct mpage_da_data { - struct inode *inode; - struct buffer_head lbh; /* extent of blocks */ - unsigned long first_page, next_page; /* extent of pages */ - get_block_t *get_block; - struct writeback_control *wbc; -}; - -/* - * mpage_da_submit_io - walks through extent of pages and try to write - * them with __mpage_writepage() - * - * @mpd->inode: inode - * @mpd->first_page: first page of the extent - * @mpd->next_page: page after the last page of the extent - * @mpd->get_block: the filesystem's block mapper function - * - * By the time mpage_da_submit_io() is called we expect all blocks - * to be allocated. this may be wrong if allocation failed. 
- * - * As pages are already locked by write_cache_pages(), we can't use it - */ -static int mpage_da_submit_io(struct mpage_da_data *mpd) -{ - struct address_space *mapping = mpd->inode->i_mapping; - struct mpage_data mpd_pp = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = mpd->get_block, - .use_writepage = 1, - }; - int ret = 0, err, nr_pages, i; - unsigned long index, end; - struct pagevec pvec; - - BUG_ON(mpd->next_page <= mpd->first_page); - - pagevec_init(&pvec, 0); - index = mpd->first_page; - end = mpd->next_page - 1; - - while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - index++; - - err = __mpage_writepage(page, mpd->wbc, &mpd_pp); - - /* - * In error case, we have to continue because - * remaining pages are still locked - * XXX: unlock and re-dirty them? 
- */ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - if (mpd_pp.bio) - mpage_bio_submit(WRITE, mpd_pp.bio); - - return ret; -} - -/* - * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers - * - * @mpd->inode - inode to walk through - * @exbh->b_blocknr - first block on a disk - * @exbh->b_size - amount of space in bytes - * @logical - first logical block to start assignment with - * - * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay - */ -static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, - struct buffer_head *exbh) -{ - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - int blocks = exbh->b_size >> inode->i_blkbits; - sector_t pblock = exbh->b_blocknr, cur_logical; - struct buffer_head *head, *bh; - unsigned long index, end; - struct pagevec pvec; - int nr_pages, i; - - index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - - while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(!page_has_buffers(page)); - - bh = page_buffers(page); - head = bh; - - /* skip blocks out of the range */ - do { - if (cur_logical >= logical) - break; - cur_logical++; - } while ((bh = bh->b_this_page) != head); - - do { - if (cur_logical >= logical + blocks) - break; - if (buffer_delay(bh)) { - bh->b_blocknr = pblock; - clear_buffer_delay(bh); - } else if (buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - - cur_logical++; - pblock++; - } while ((bh = 
bh->b_this_page) != head); - } - pagevec_release(&pvec); - } -} - - -/* - * __unmap_underlying_blocks - just a helper function to unmap - * set of blocks described by @bh - */ -static inline void __unmap_underlying_blocks(struct inode *inode, - struct buffer_head *bh) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - int blocks, i; - - blocks = bh->b_size >> inode->i_blkbits; - for (i = 0; i < blocks; i++) - unmap_underlying_metadata(bdev, bh->b_blocknr + i); -} - -/* - * mpage_da_map_blocks - go through given space - * - * @mpd->lbh - bh describing space - * @mpd->get_block - the filesystem's block mapper function - * - * The function skips space we know is already mapped to disk blocks. - * - * The function ignores errors ->get_block() returns, thus real - * error handling is postponed to __mpage_writepage() - */ -static void mpage_da_map_blocks(struct mpage_da_data *mpd) -{ - struct buffer_head *lbh = &mpd->lbh; - int err = 0, remain = lbh->b_size; - sector_t next = lbh->b_blocknr; - struct buffer_head new; - - /* - * We consider only non-mapped and non-allocated blocks - */ - if (buffer_mapped(lbh) && !buffer_delay(lbh)) - return; - - while (remain) { - new.b_state = lbh->b_state; - new.b_blocknr = 0; - new.b_size = remain; - err = mpd->get_block(mpd->inode, next, &new, 1); - if (err) { - /* - * Rather than implement own error handling - * here, we just leave remaining blocks - * unallocated and try again with ->writepage() - */ - break; - } - BUG_ON(new.b_size == 0); - - if (buffer_new(&new)) - __unmap_underlying_blocks(mpd->inode, &new); - - /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit - */ - if (buffer_delay(lbh)) - mpage_put_bnr_to_bhs(mpd, next, &new); - - /* go for the remaining blocks */ - next += new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; - } -} - -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) - -/* - * mpage_add_bh_to_extent - try to add one more block 
to extent of blocks - * - * @mpd->lbh - extent of blocks - * @logical - logical number of the block in the file - * @bh - bh of the block (used to access block's state) - * - * the function is used to collect contig. blocks in same state - */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, struct buffer_head *bh) -{ - struct buffer_head *lbh = &mpd->lbh; - sector_t next; - - next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); - - /* - * First block in the extent - */ - if (lbh->b_size == 0) { - lbh->b_blocknr = logical; - lbh->b_size = bh->b_size; - lbh->b_state = bh->b_state & BH_FLAGS; - return; - } - - /* - * Can we merge the block to our big extent? - */ - if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += bh->b_size; - return; - } - - /* - * We couldn't merge the block to our extent, so we - * need to flush current extent and start new one - */ - mpage_da_map_blocks(mpd); - - /* - * Now start a new extent - */ - lbh->b_size = bh->b_size; - lbh->b_state = bh->b_state & BH_FLAGS; - lbh->b_blocknr = logical; -} - -/* - * __mpage_da_writepage - finds extent of pages and blocks - * - * @page: page to consider - * @wbc: not used, we just follow rules - * @data: context - * - * The function finds extents of pages and scan them for all blocks. - */ -static int __mpage_da_writepage(struct page *page, - struct writeback_control *wbc, void *data) -{ - struct mpage_da_data *mpd = data; - struct inode *inode = mpd->inode; - struct buffer_head *bh, *head, fake; - sector_t logical; - - /* - * Can we merge this page to current extent? - */ - if (mpd->next_page != page->index) { - /* - * Nope, we can't. So, we map non-allocated blocks - * and start IO on them using __mpage_writepage() - */ - if (mpd->next_page != mpd->first_page) { - mpage_da_map_blocks(mpd); - mpage_da_submit_io(mpd); - } - - /* - * Start next extent of pages ... - */ - mpd->first_page = page->index; - - /* - * ... 
and blocks - */ - mpd->lbh.b_size = 0; - mpd->lbh.b_state = 0; - mpd->lbh.b_blocknr = 0; - } - - mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - if (!page_has_buffers(page)) { - /* - * There is no attached buffer heads yet (mmap?) - * we treat the page asfull of dirty blocks - */ - bh = &fake; - bh->b_size = PAGE_CACHE_SIZE; - bh->b_state = 0; - set_buffer_dirty(bh); - set_buffer_uptodate(bh); - mpage_add_bh_to_extent(mpd, logical, bh); - } else { - /* - * Page with regular buffer heads, just add all dirty ones - */ - head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - if (buffer_dirty(bh)) - mpage_add_bh_to_extent(mpd, logical, bh); - logical++; - } while ((bh = bh->b_this_page) != head); - } - - return 0; -} - -/* - * mpage_da_writepages - walk the list of dirty pages of the given - * address space, allocates non-allocated blocks, maps newly-allocated - * blocks to existing bhs and issue IO them - * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @get_block: the filesystem's block mapper function. - * - * This is a library function, which implements the writepages() - * address_space_operation. - * - * In order to avoid duplication of logic that deals with partial pages, - * multiple bio per page, etc, we find non-allocated blocks, allocate - * them with minimal calls to ->get_block() and re-use __mpage_writepage() - * - * It's important that we call __mpage_writepage() only once for each - * involved page, otherwise we'd have to implement more complicated logic - * to deal with pages w/o PG_lock or w/ PG_writeback and so on. 
- * - * See comments to mpage_writepages() - */ -static int mpage_da_writepages(struct address_space *mapping, - struct writeback_control *wbc, - get_block_t get_block) -{ - struct mpage_da_data mpd; - int ret; - - if (!get_block) - return generic_writepages(mapping, wbc); - - mpd.wbc = wbc; - mpd.inode = mapping->host; - mpd.lbh.b_size = 0; - mpd.lbh.b_state = 0; - mpd.lbh.b_blocknr = 0; - mpd.first_page = 0; - mpd.next_page = 0; - mpd.get_block = get_block; - - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); - - /* - * Handle last extent of pages - */ - if (mpd.next_page != mpd.first_page) { - mpage_da_map_blocks(&mpd); - mpage_da_submit_io(&mpd); - } - - return ret; -} - -/* - * this is a special callback for ->write_begin() only - * it's intention is to return mapped block or reserve space - */ -static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret = 0; - - BUG_ON(create == 0); - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); - - /* - * first, we need to know whether the block is allocated already - * preallocated blocks are unmapped but should treated - * the same as allocated blocks. - */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); - if ((ret == 0) && !buffer_delay(bh_result)) { - /* the block isn't (pre)allocated yet, let's reserve space */ - /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? 
- */ - ret = ext4_da_reserve_space(inode, 1); - if (ret) - /* not enough space to reserve */ - return ret; - - map_bh(bh_result, inode->i_sb, 0); - set_buffer_new(bh_result); - set_buffer_delay(bh_result); - } else if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - - return ret; -} -#define EXT4_DELALLOC_RSVED 1 -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - if (!handle) { - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, 0, 0, 0); - BUG_ON(!ret); - } else { - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); - } - - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - - /* - * Update on-disk size along with block allocation - * we don't use 'extend_disksize' as size may change - * within already allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - /* - * XXX: replace with spinlock if seen contended -bzzz - */ - down_write(&EXT4_I(inode)->i_data_sem); - if (disksize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = disksize; - up_write(&EXT4_I(inode)->i_data_sem); - - if (EXT4_I(inode)->i_disksize == disksize) { - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - } - ret = 0; - } - return ret; -} - -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - /* - * unmapped buffer is possible for holes. 
- * delay buffer is possible with delayed allocation - */ - return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); -} - -static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - - /* - * we don't want to do block allocation in writepage - * so call get_block_wrap with create = 0 - */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - return ret; -} - -/* - * get called vi ext4_da_writepages after taking page lock (have journal handle) - * get called via journal_submit_inode_data_buffers (no journal handle) - * get called via shrink_page_list via pdflush (no journal handle) - * or grab_page_cache when doing write_begin (have journal handle) - */ -static int ext4_da_writepage(struct page *page, - struct writeback_control *wbc) -{ - int ret = 0; - loff_t size; - unsigned long len; - struct buffer_head *page_bufs; - struct inode *inode = page->mapping->host; - - size = i_size_read(inode); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_unmapped_or_delay)) { - /* - * We don't want to do block allocation - * So redirty the page and return - * We may reach here when we do a journal commit - * via journal_submit_inode_data_buffers. - * If we don't have mapping block we just ignore - * them. We can also reach here via shrink_page_list - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - } else { - /* - * The test for page_has_buffers() is subtle: - * We know the page is dirty but it lost buffers. 
That means - * that at some moment in time after write_begin()/write_end() - * has been called all buffers have been clean and thus they - * must have been written at least once. So they are all - * mapped and we can happily proceed with mapping them - * and writing the page. - * - * Try to initialize the buffer_heads and check whether - * all are mapped and non delay. We don't want to - * do block allocation here. - */ - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); - if (!ret) { - page_bufs = page_buffers(page); - /* check whether all are mapped and non delay */ - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_unmapped_or_delay)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - } else { - /* - * We can't do block allocation here - * so just redity the page and unlock - * and return - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - } - - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); - else - ret = block_write_full_page(page, - ext4_normal_get_block_write, - wbc); - - return ret; -} - -/* - * For now just follow the DIO way to estimate the max credits - * needed to write out EXT4_MAX_WRITEBACK_PAGES. - * todo: need to calculate the max credits need for - * extent based files, currently the DIO credits is based on - * indirect-blocks mapping way. - * - * Probably should have a generic way to calculate credits - * for DIO, writepages, and truncate - */ -#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS - -static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct inode *inode = mapping->host; - handle_t *handle = NULL; - int needed_blocks; - int ret = 0; - long to_write; - loff_t range_start = 0; - - /* - * No pages to write? 
This is mainly a kludge to avoid starting - * a transaction for special inodes like journal inode on last iput() - * because that could violate lock ordering on umount - */ - if (!mapping->nrpages) - return 0; - - /* - * Estimate the worse case needed credits to write out - * EXT4_MAX_BUF_BLOCKS pages - */ - needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; - - to_write = wbc->nr_to_write; - if (!wbc->range_cyclic) { - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - range_start = wbc->range_start; - } - - while (!ret && to_write) { - /* start a new transaction*/ - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_writepages; - } - if (ext4_should_order_data(inode)) { - /* - * With ordered mode we need to add - * the inode to the journal handle - * when we do block allocation. - */ - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out_writepages; - } - - } - /* - * set the max dirty pages could be write at a time - * to fit into the reserved transaction credits - */ - if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) - wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; - - to_write -= wbc->nr_to_write; - ret = mpage_da_writepages(mapping, wbc, - ext4_da_get_block_write); - ext4_journal_stop(handle); - if (wbc->nr_to_write) { - /* - * There is no more writeout needed - * or we requested for a noblocking writeout - * and we found the device congested - */ - to_write += wbc->nr_to_write; - break; - } - wbc->nr_to_write = to_write; - } - -out_writepages: - wbc->nr_to_write = to_write; - if (range_start) - wbc->range_start = range_start; - return ret; -} - -static int ext4_da_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - int ret, retries = 0; - struct page *page; - pgoff_t index; - unsigned from, to; - struct 
inode *inode = mapping->host; - handle_t *handle; - - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - -retry: - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journalling the i_disksize update if writes to the end - * of file which has an already mapped buffer. - */ - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - page = __grab_cache_page(mapping, index); - if (!page) { - ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; - } - *pagep = page; - - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_da_get_block_prep); - if (ret < 0) { - unlock_page(page); - ext4_journal_stop(handle); - page_cache_release(page); - } - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: - return ret; -} - -/* - * Check if we should update i_disksize - * when write to the end of file but not require block allocation - */ -static int ext4_da_should_update_i_disksize(struct page *page, - unsigned long offset) -{ - struct buffer_head *bh; - struct inode *inode = page->mapping->host; - unsigned int idx; - int i; - - bh = page_buffers(page); - idx = offset >> inode->i_blkbits; - - for (i=0; i < idx; i++) - bh = bh->b_this_page; - - if (!buffer_mapped(bh) || (buffer_delay(bh))) - return 0; - return 1; -} - -static int ext4_da_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - int ret = 0, ret2; - handle_t *handle = ext4_journal_current_handle(); - loff_t new_i_size; - unsigned long start, end; - - start = pos & (PAGE_CACHE_SIZE - 1); - end = start + copied -1; - - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. 
So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - - new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_da_should_update_i_disksize(page, end)) { - down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) { - /* - * Updating i_disksize when extending file - * without needing block allocation - */ - if (ext4_should_order_data(inode)) - ret = ext4_jbd2_file_inode(handle, - inode); - - EXT4_I(inode)->i_disksize = new_i_size; - } - up_write(&EXT4_I(inode)->i_data_sem); - } - } - ret2 = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (ret2 < 0) - ret = ret2; - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - return ret ? ret : copied; -} - -static void ext4_da_invalidatepage(struct page *page, unsigned long offset) -{ - /* - * Drop reserved blocks - */ - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - ext4_da_page_release_reservation(page, offset); - -out: - ext4_invalidatepage(page, offset); - - return; -} - - /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. 
@@ -2461,16 +1477,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - test_opt(inode->i_sb, DELALLOC)) { - /* - * With delalloc we want to sync the file - * so that we can make sure we allocate - * blocks for file - */ - filemap_write_and_wait(mapping); - } - if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of @@ -2515,17 +1521,21 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } +static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + if (buffer_mapped(bh)) + return ext4_journal_dirty_data(handle, bh); + return 0; +} + /* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), noone guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. + * Note that we always start a transaction even if we're not journalling + * data. This is to preserve ordering: any hole instantiation within + * __block_write_full_page -> ext4_get_block() should be journalled + * along with the data so we don't crash and then get metadata which + * refers to old data. * - * In all journaling modes block_write_full_page() will start the I/O. + * In all journalling modes block_write_full_page() will start the I/O. * * Problem: * @@ -2567,103 +1577,105 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) * disastrous. Any write() or metadata operation will sync the fs for * us. 
* + * AKPM2: if all the page's buffers are mapped to disk and !data=journal, + * we don't need to open a transaction here. */ -static int __ext4_normal_writepage(struct page *page, +static int ext4_ordered_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; - if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, - ext4_normal_get_block_write, wbc); - else - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); -} + J_ASSERT(PageLocked(page)); -static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (ext4_journal_current_handle()) + goto out_fail; - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. 
- */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; } - if (!ext4_journal_current_handle()) - return __ext4_normal_writepage(page, wbc); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bget_one); + + ret = block_write_full_page(page, ext4_get_block, wbc); + /* + * The page can become unlocked at any point now, and + * truncate can then come in and change things. So we + * can't touch *page from now on. But *page_bufs is + * safe due to elevated refcount. + */ + + /* + * And attach them to the current transaction. But only if + * block_write_full_page() succeeded. Otherwise they are unmapped, + * and generally junk. + */ + if (ret == 0) { + err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, jbd2_journal_dirty_data_fn); + if (!ret) + ret = err; + } + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); - return 0; + return ret; } -static int __ext4_journalled_writepage(struct page *page, +static int ext4_writeback_writepage(struct page *page, struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; handle_t *handle = NULL; int ret = 0; int err; - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); - if (ret != 0) - goto out_unlock; - - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, - bget_one); - /* As soon as we unlock the page, it can go away, but we have - * 
references to buffers so we are safe */ - unlock_page(page); + if (ext4_journal_current_handle()) + goto out_fail; handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out; + goto out_fail; } - ret = walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_get_block, wbc); + else + ret = block_write_full_page(page, ext4_get_block, wbc); - err = walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; err = ext4_journal_stop(handle); if (!ret) ret = err; + return ret; - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - goto out; - -out_unlock: +out_fail: + redirty_page_for_writepage(wbc, page); unlock_page(page); -out: return ret; } @@ -2671,53 +1683,59 @@ static int ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. 
- */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); - } + handle_t *handle = NULL; + int ret = 0; + int err; if (ext4_journal_current_handle()) goto no_write; - if (PageChecked(page)) { + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } + + if (!page_has_buffers(page) || PageChecked(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc); + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_get_block); + if (ret != 0) { + ext4_journal_stop(handle); + goto out_unlock; + } + ret = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + unlock_page(page); } else { /* * It may be a page full of checkpoint-mode buffers. We don't * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. 
*/ - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + ret = block_write_full_page(page, ext4_get_block, wbc); } + err = ext4_journal_stop(handle); + if (!ret) + ret = err; +out: + return ret; + no_write: redirty_page_for_writepage(wbc, page); +out_unlock: unlock_page(page); - return 0; + goto out; } static int ext4_readpage(struct file *file, struct page *page) @@ -2858,75 +1876,50 @@ static int ext4_journalled_set_page_dirty(struct page *page) } static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_ordered_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, }; static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writeback_writepage, + .sync_page = block_sync_page, + .write_begin = 
ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, }; static const struct address_space_operations ext4_journalled_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_journalled_write_end, - .set_page_dirty = ext4_journalled_set_page_dirty, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .is_partially_uptodate = block_is_partially_uptodate, -}; - -static const struct address_space_operations ext4_da_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_da_writepage, - .writepages = ext4_da_writepages, - .sync_page = block_sync_page, - .write_begin = ext4_da_write_begin, - .write_end = ext4_da_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_journalled_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, }; void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = 
&ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else @@ -2939,7 +1932,7 @@ void ext4_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, +int ext4_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -2948,13 +1941,8 @@ int ext4_block_truncate_page(handle_t *handle, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; - struct page *page; int err = 0; - page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); - if (!page) - return -EINVAL; - blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -3027,7 +2015,7 @@ int ext4_block_truncate_page(handle_t *handle, err = ext4_journal_dirty_metadata(handle, bh); } else { if (ext4_should_order_data(inode)) - err = ext4_jbd2_file_inode(handle, inode); + err = ext4_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } @@ -3445,25 +2433,46 @@ void ext4_truncate(struct inode *inode) int n; ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; + struct page *page; if (!ext4_can_truncate(inode)) return; + /* + * We have to lock the EOF page here, because lock_page() nests + * outside jbd2_journal_start(). + */ + if ((inode->i_size & (blocksize - 1)) == 0) { + /* Block boundary? 
Nothing to do */ + page = NULL; + } else { + page = grab_cache_page(mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + } + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode); + ext4_ext_truncate(inode, page); return; } handle = start_transaction(inode); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } return; /* AKPM: return what? */ + } last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) - goto out_stop; + if (page) + ext4_block_truncate_page(handle, page, mapping, inode->i_size); n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) @@ -3482,11 +2491,6 @@ void ext4_truncate(struct inode *inode) goto out_stop; /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the @@ -3495,6 +2499,12 @@ void ext4_truncate(struct inode *inode) */ ei->i_disksize = inode->i_size; + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -4188,14 +3198,7 @@ int ext4_write_inode(struct inode *inode, int wait) * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) 
* - * Another thing we have to assure is that if we are in ordered mode - * and inode is still attached to the committing transaction, we must - * we start writeout of all the dirty pages which are being truncated. - * This way we are sure that all the data written in the previous - * transaction are already on disk (truncate waits for pages under - * writeback). - * - * Called with inode->i_mutex down. + * Called with inode->sem down. */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { @@ -4261,22 +3264,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!error) error = rc; ext4_journal_stop(handle); - - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, - attr->ia_size); - if (error) { - /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, 3); - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - goto err_out; - } - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - goto err_out; - } - } } rc = inode_setattr(inode, attr); @@ -4297,32 +3284,6 @@ err_out: return error; } -int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode; - unsigned long delalloc_blocks; - - inode = dentry->d_inode; - generic_fillattr(inode, stat); - - /* - * We can't update i_blocks if the block allocation is delayed - * otherwise in the case of system crash before the real block - * allocation is done, we will have i_blocks inconsistent with - * on-disk file blocks. - * We always keep i_blocks updated together with real - * allocation. But to not confuse with user, stat - * will return the blocks that include the delayed allocation - * blocks for this file. 
- */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; - return 0; -} /* * How many blocks doth make a writepage()? diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 865e9dd..8d36d49 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2968,15 +2968,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - - /* - * free blocks account has already be reduced/reserved - * at write_begin() time for delayed allocation - * do not double accounting - */ - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) - percpu_counter_sub(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -4345,12 +4337,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, &(ar->len), errp); return block; } - if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { - /* - * With delalloc we already reserved the blocks - */ - ar->len = ext4_has_free_blocks(sbi, ar->len); - } + ar->len = ext4_has_free_blocks(sbi, ar->len); if (ar->len == 0) { *errp = -ENOSPC; @@ -4367,9 +4354,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } inquota = ar->len; - if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) - ar->flags |= EXT4_MB_DELALLOC_RESERVED; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d5d7795..b574c46 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -571,12 +571,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, 
sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); - ei->i_reserved_data_blocks = 0; - ei->i_reserved_meta_blocks = 0; - ei->i_allocated_meta_blocks = 0; - ei->i_delalloc_reserved_flag = 0; - spin_lock_init(&(ei->i_block_reservation_lock)); return &ei->vfs_inode; } @@ -593,7 +587,7 @@ static void ext4_destroy_inode(struct inode *inode) kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } -static void init_once(void *foo) +static void init_once(struct kmem_cache *cachep, void *foo) { struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; @@ -641,8 +635,6 @@ static void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode); } static inline void ext4_show_quota_options(struct seq_file *seq, @@ -755,9 +747,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nomballoc"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC)) - seq_puts(seq, ",nodelalloc"); - if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); @@ -905,7 +894,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_mballoc, Opt_nomballoc, Opt_stripe, }; static match_table_t tokens = { @@ -964,8 +953,6 @@ static match_table_t tokens = { {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, - {Opt_delalloc, "delalloc"}, - {Opt_nodelalloc, "nodelalloc"}, {Opt_err, NULL}, }; @@ -1353,9 +1340,6 @@ set_qf_format: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; break; - case Opt_nodelalloc: - 
clear_opt(sbi->s_mount_opt, DELALLOC); - break; case Opt_mballoc: set_opt(sbi->s_mount_opt, MBALLOC); break; @@ -1369,9 +1353,6 @@ set_qf_format: return 0; sbi->s_stripe = option; break; - case Opt_delalloc: - set_opt(sbi->s_mount_opt, DELALLOC); - break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -2017,13 +1998,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ set_opt(sbi->s_mount_opt, MBALLOC); - /* - * enable delayed allocation by default - * Use -o nodelalloc to turn it off - */ - set_opt(sbi->s_mount_opt, DELALLOC); - - if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, NULL, 0)) goto failed_mount; @@ -2462,13 +2436,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); - clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); - ext4_ext_init(sb); ext4_mb_init(sb, needs_recovery); @@ -3370,9 +3337,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } - err = vfs_quota_on_path(sb, type, format_id, &nd.path); path_put(&nd.path); - return err; + return vfs_quota_on(sb, type, format_id, path, remount); } /* Read data from quotafile - avoid pagecache and such because we cannot afford @@ -3458,7 +3424,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_journal_dirty_metadata(handle, bh); else { /* Always do at least ordered writes for quotas */ - err = ext4_jbd2_file_inode(handle, inode); + err = ext4_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } brelse(bh); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 
91389c8..6914598 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -688,6 +688,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f2ad061..b706b7d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -22,8 +22,6 @@ #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/crc32.h> -#include <linux/writeback.h> -#include <linux/backing-dev.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -39,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) } /* - * When an ext4 file is truncated, it is possible that some pages are not - * successfully freed, because they are attached to a committing transaction. + * When an ext3-ordered file is truncated, it is possible that many pages are + * not sucessfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -82,6 +80,21 @@ nope: } /* + * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is + * held. For ranking reasons we must trylock. If we lose, schedule away and + * return 0. j_list_lock is dropped in this case. + */ +static int inverted_lock(journal_t *journal, struct buffer_head *bh) +{ + if (!jbd_trylock_bh_state(bh)) { + spin_unlock(&journal->j_list_lock); + schedule(); + return 0; + } + return 1; +} + +/* * Done it all: now submit the commit record. 
We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write @@ -187,114 +200,175 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) } /* - * write the filemap data using writepage() address_space_operations. - * We don't do block allocation here even for delalloc. We don't - * use writepages() because with dealyed allocation we may be doing - * block allocation in writepages(). + * Wait for all submitted IO to complete. */ -static int journal_submit_inode_data_buffers(struct address_space *mapping) +static int journal_wait_on_locked_list(journal_t *journal, + transaction_t *commit_transaction) { - int ret; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = 0, - .range_end = i_size_read(mapping->host), - .for_writepages = 1, - }; - - ret = generic_writepages(mapping, &wbc); + int ret = 0; + struct journal_head *jh; + + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + spin_lock(&journal->j_list_lock); + } + if (unlikely(!buffer_uptodate(bh))) { + if (TestSetPageLocked(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + ret = -EIO; + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + __jbd2_journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + jbd2_journal_remove_journal_head(bh); + put_bh(bh); + } else { + jbd_unlock_bh_state(bh); + } + put_bh(bh); + cond_resched_lock(&journal->j_list_lock); + } return ret; -} + 
} -/* - * Submit all the data buffers of inode associated with the transaction to - * disk. - * - * We are in a committing transaction. Therefore no new inode can be added to - * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently - * operate on from being released while we write out pages. - */ -static int journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) { - struct jbd2_inode *jinode; - int err, ret = 0; - struct address_space *mapping; + int i; - spin_lock(&journal->j_list_lock); - list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { - mapping = jinode->i_vfs_inode->i_mapping; - jinode->i_flags |= JI_COMMIT_RUNNING; - spin_unlock(&journal->j_list_lock); - /* - * submit the inode data buffers. We use writepage - * instead of writepages. Because writepages can do - * block allocation with delalloc. We need to write - * only allocated blocks here. - */ - err = journal_submit_inode_data_buffers(mapping); - if (!ret) - ret = err; - spin_lock(&journal->j_list_lock); - J_ASSERT(jinode->i_transaction == commit_transaction); - jinode->i_flags &= ~JI_COMMIT_RUNNING; - wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + for (i = 0; i < bufs; i++) { + wbuf[i]->b_end_io = end_buffer_write_sync; + /* We use-up our safety reference in submit_bh() */ + submit_bh(WRITE, wbuf[i]); } - spin_unlock(&journal->j_list_lock); - return ret; } /* - * Wait for data submitted for writeout, refile inodes to proper - * transaction if needed. 
- * + * Submit all the data buffers to disk */ -static int journal_finish_inode_data_buffers(journal_t *journal, - transaction_t *commit_transaction) +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) { - struct jbd2_inode *jinode, *next_i; - int err, ret = 0; + struct journal_head *jh; + struct buffer_head *bh; + int locked; + int bufs = 0; + struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; - /* For locking, see the comment in journal_submit_data_buffers() */ + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + * + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); spin_lock(&journal->j_list_lock); - list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { - jinode->i_flags |= JI_COMMIT_RUNNING; - spin_unlock(&journal->j_list_lock); - err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); - if (err) { - /* - * Because AS_EIO is cleared by - * wait_on_page_writeback_range(), set it again so - * that user process can get -EIO from fsync(). 
- */ - set_bit(AS_EIO, - &jinode->i_vfs_inode->i_mapping->flags); - - if (!ret) - ret = err; - } - spin_lock(&journal->j_list_lock); - jinode->i_flags &= ~JI_COMMIT_RUNNING; - wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); - } - /* Now refile inode to proper lists */ - list_for_each_entry_safe(jinode, next_i, - &commit_transaction->t_inode_list, i_list) { - list_del(&jinode->i_list); - if (jinode->i_next_transaction) { - jinode->i_transaction = jinode->i_next_transaction; - jinode->i_next_transaction = NULL; - list_add(&jinode->i_list, - &jinode->i_transaction->t_inode_list); + while (commit_transaction->t_sync_datalist) { + jh = commit_transaction->t_sync_datalist; + bh = jh2bh(jh); + locked = 0; + + /* Get reference just to make sure buffer does not disappear + * when we are forced to drop various locks */ + get_bh(bh); + /* If the buffer is dirty, we need to submit IO and hence + * we need the buffer lock. We try to lock the buffer without + * blocking. If we fail, we need to drop j_list_lock and do + * blocking lock_buffer(). + */ + if (buffer_dirty(bh)) { + if (test_set_buffer_locked(bh)) { + BUFFER_TRACE(bh, "needs blocking lock"); + spin_unlock(&journal->j_list_lock); + /* Write out all data to prevent deadlocks */ + journal_do_submit_data(wbuf, bufs); + bufs = 0; + lock_buffer(bh); + spin_lock(&journal->j_list_lock); + } + locked = 1; + } + /* We have to get bh_state lock. Again out of order, sigh. */ + if (!inverted_lock(journal, bh)) { + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + } + /* Someone already cleaned up the buffer? 
*/ + if (!buffer_jbd(bh) + || jh->b_transaction != commit_transaction + || jh->b_jlist != BJ_SyncData) { + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + BUFFER_TRACE(bh, "already cleaned up"); + put_bh(bh); + continue; + } + if (locked && test_clear_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "needs writeout, adding to array"); + wbuf[bufs++] = bh; + __jbd2_journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (bufs == journal->j_wbufsize) { + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs); + bufs = 0; + goto write_out_data; + } + } else if (!locked && buffer_locked(bh)) { + __jbd2_journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + put_bh(bh); } else { - jinode->i_transaction = NULL; + BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + __jbd2_journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + jbd2_journal_remove_journal_head(bh); + /* Once for our safety reference, once for + * jbd2_journal_remove_journal_head() */ + put_bh(bh); + put_bh(bh); + } + + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; } } spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs); - return ret; + return err; } static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) @@ -470,14 +544,43 @@ void jbd2_journal_commit_transaction(journal_t *journal) * on the transaction lists. Data blocks go first. */ err = journal_submit_data_buffers(journal, commit_transaction); - if (err) - jbd2_journal_abort(journal, err); + + /* + * Wait for all previously submitted IO to complete if commit + * record is to be written synchronously. 
+ */ + spin_lock(&journal->j_list_lock); + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + err = journal_wait_on_locked_list(journal, + commit_transaction); + + spin_unlock(&journal->j_list_lock); + + if (err) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD2: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + err = 0; + } jbd2_journal_write_revoke_records(journal, commit_transaction); jbd_debug(3, "JBD: commit phase 2\n"); /* + * If we found any dirty or locked buffers, then we should have + * looped back up to the write_out_data label. If there weren't + * any then journal_clean_data_list should have wiped the list + * clean by now, so check that it is in fact empty. + */ + J_ASSERT (commit_transaction->t_sync_datalist == NULL); + + jbd_debug (3, "JBD: commit phase 3\n"); + + /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: @@ -495,7 +598,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); - err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -670,23 +772,13 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - } - /* - * This is the right place to wait for data buffers both for ASYNC - * and !ASYNC commit. If commit is ASYNC, we need to wait only after - * the commit block went to disk (which happens above). If commit is - * SYNC, we need to wait for data buffers before we start writing - * commit block, which happens below in such setting. 
- */ - err = journal_finish_inode_data_buffers(journal, commit_transaction); - if (err) { - char b[BDEVNAME_SIZE]; - - printk(KERN_WARNING - "JBD2: Detected IO errors while flushing file data " - "on %s\n", bdevname(journal->j_fs_dev, b)); - err = 0; + spin_lock(&journal->j_list_lock); + err = journal_wait_on_locked_list(journal, + commit_transaction); + spin_unlock(&journal->j_list_lock); + if (err) + __jbd2_journal_abort_hard(journal); } /* Lo and behold: we have just managed to send a transaction to @@ -700,7 +792,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 3\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -759,7 +851,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 5\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -786,7 +878,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 6\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -806,9 +898,9 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 7\n"); - J_ASSERT(list_empty(&commit_transaction->t_inode_list)); + J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); @@ -929,7 +1021,7 @@ restart_loop: /* Done with this transaction! 
*/ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 8\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8207a01..e52a41d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -50,6 +50,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); +EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); @@ -80,10 +81,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_file_inode); -EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); -EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); -EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2197,54 +2194,6 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) } /* - * Initialize jbd inode head - */ -void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) -{ - jinode->i_transaction = NULL; - jinode->i_next_transaction = NULL; - jinode->i_vfs_inode = inode; - jinode->i_flags = 0; - INIT_LIST_HEAD(&jinode->i_list); -} - -/* - * Function to be called before we start removing inode from memory (i.e., - * clear_inode() is a fine place to be called from). It removes inode from - * transaction's lists. 
- */ -void jbd2_journal_release_jbd_inode(journal_t *journal, - struct jbd2_inode *jinode) -{ - int writeout = 0; - - if (!journal) - return; -restart: - spin_lock(&journal->j_list_lock); - /* Is commit writing out inode - we have to wait */ - if (jinode->i_flags & JI_COMMIT_RUNNING) { - wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); - wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_list_lock); - schedule(); - finish_wait(wq, &wait.wait); - goto restart; - } - - /* Do we need to wait for data writeback? */ - if (journal->j_committing_transaction == jinode->i_transaction) - writeout = 1; - if (jinode->i_transaction) { - list_del(&jinode->i_list); - jinode->i_transaction = NULL; - } - spin_unlock(&journal->j_list_lock); -} - -/* * debugfs tunables */ #ifdef CONFIG_JBD2_DEBUG diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d5405..ba620c4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -51,7 +51,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); - INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -301,7 +300,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) goto out; } - lock_map_acquire(&handle->h_lockdep_map); + lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); out: return handle; } @@ -943,6 +942,183 @@ out: } /** + * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which + * needs to be flushed before we can commit the + * current transaction. 
+ * @handle: transaction + * @bh: bufferhead to mark + * + * The buffer is placed on the transaction's data list and is marked as + * belonging to the transaction. + * + * Returns error number or 0 on success. + * + * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage + * by kswapd. + */ +int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + journal_t *journal = handle->h_transaction->t_journal; + int need_brelse = 0; + struct journal_head *jh; + + if (is_handle_aborted(handle)) + return 0; + + jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + + /* + * The buffer could *already* be dirty. Writeout can start + * at any time. + */ + jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); + + /* + * What if the buffer is already part of a running transaction? + * + * There are two cases: + * 1) It is part of the current running transaction. Refile it, + * just in case we have allocated it as metadata, deallocated + * it, then reallocated it as data. + * 2) It is part of the previous, still-committing transaction. + * If all we want to do is to guarantee that the buffer will be + * written to disk before this new transaction commits, then + * being sure that the *previous* transaction has this same + * property is sufficient for us! Just leave it on its old + * transaction. + * + * In case (2), the buffer must not already exist as metadata + * --- that would violate write ordering (a transaction is free + * to write its data at any point, even before the previous + * committing transaction has committed). The caller must + * never, ever allow this to happen: there's nothing we can do + * about it in this layer. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + /* Now that we have bh_state locked, are we really still mapped? 
*/ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); + goto no_journal; + } + + if (jh->b_transaction) { + JBUFFER_TRACE(jh, "has transaction"); + if (jh->b_transaction != handle->h_transaction) { + JBUFFER_TRACE(jh, "belongs to older transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* @@@ IS THIS TRUE ? */ + /* + * Not any more. Scenario: someone does a write() + * in data=journal mode. The buffer's transaction has + * moved into commit. Then someone does another + * write() to the file. We do the frozen data copyout + * and set b_next_transaction to point to j_running_t. + * And while we're in that state, someone does a + * writepage() in an attempt to pageout the same area + * of the file via a shared mapping. At present that + * calls jbd2_journal_dirty_data(), and we get right here. + * It may be too late to journal the data. Simply + * falling through to the next test will suffice: the + * data will be dirty and will be checkpointed. The + * ordering comments in the next comment block still + * apply. + */ + //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + + /* + * If we're journalling data, and this buffer was + * subject to a write(), it could be metadata, forget + * or shadow against the committing transaction. Now, + * someone has dirtied the same darn page via a mapping + * and it is being writepage()'d. + * We *could* just steal the page from commit, with some + * fancy locking there. Instead, we just skip it - + * don't tie the page's buffers to the new transaction + * at all. + * Implication: if we crash before the writepage() data + * is written into the filesystem, recovery will replay + * the write() data. + */ + if (jh->b_jlist != BJ_None && + jh->b_jlist != BJ_SyncData && + jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "Not stealing"); + goto no_journal; + } + + /* + * This buffer may be undergoing writeout in commit.
We + * can't return from here and let the caller dirty it + * again because that can cause the write-out loop in + * commit to never terminate. + */ + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + need_brelse = 1; + sync_dirty_buffer(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + /* Since we dropped the lock... */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "buffer got unmapped"); + goto no_journal; + } + /* The buffer may become locked again at any + time if it is redirtied */ + } + + /* journal_clean_data_list() may have got there first */ + if (jh->b_transaction != NULL) { + JBUFFER_TRACE(jh, "unfile from commit"); + __jbd2_journal_temp_unlink_buffer(jh); + /* It still points to the committing + * transaction; move it to this one so + * that the refile assert checks are + * happy. */ + jh->b_transaction = handle->h_transaction; + } + /* The buffer will be refiled below */ + + } + /* + * Special case --- the buffer might actually have been + * allocated and then immediately deallocated in the previous, + * committing transaction, so might still be left on that + * transaction's metadata lists. 
+ */ + if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "not on correct data list: unfile"); + J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); + __jbd2_journal_temp_unlink_buffer(jh); + jh->b_transaction = handle->h_transaction; + JBUFFER_TRACE(jh, "file as data"); + __jbd2_journal_file_buffer(jh, handle->h_transaction, + BJ_SyncData); + } + } else { + JBUFFER_TRACE(jh, "not on a transaction"); + __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); + } +no_journal: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + if (need_brelse) { + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + JBUFFER_TRACE(jh, "exit"); + jbd2_journal_put_journal_head(jh); + return 0; +} + +/** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark @@ -1279,7 +1455,7 @@ int jbd2_journal_stop(handle_t *handle) spin_unlock(&journal->j_state_lock); } - lock_map_release(&handle->h_lockdep_map); + lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); jbd2_free_handle(handle); return err; @@ -1364,10 +1540,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, - * t_log_list or t_reserved_list. If the caller is holding onto a copy of one - * of these pointers, it could go bad. Generally the caller needs to re-read - * the pointer from the transaction_t. + * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, + * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller + * is holding onto a copy of one of these pointers, it could go bad. + * Generally the caller needs to re-read the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked.
*/ @@ -1389,6 +1565,9 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) switch (jh->b_jlist) { case BJ_None: return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); @@ -1409,6 +1588,9 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Reserved: list = &transaction->t_reserved_list; break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; } __blist_del_buffer(list, jh); @@ -1451,7 +1633,15 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { + if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { + /* A written-back ordered data buffer */ + JBUFFER_TRACE(jh, "release data"); + __jbd2_journal_unfile_buffer(jh); + jbd2_journal_remove_journal_head(bh); + __brelse(bh); + } + } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1687,7 +1877,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!buffer_jbd(bh)) goto zap_buffer_unlocked; - /* OK, we have data buffer in journaled mode */ spin_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1751,6 +1940,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); + if (jh->b_jlist == BJ_Locked) { + /* + * The buffer is on the committing transaction's locked + * list. We have the buffer locked, so I/O has + * completed. So we can nail the buffer now. 
+ */ + may_free = __dispose_buffer(jh, transaction); + goto zap_buffer; + } /* * If it is committing, we simply cannot touch it. We * can remove it's next_transaction pointer from the @@ -1883,6 +2081,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; @@ -1902,6 +2103,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Reserved: list = &transaction->t_reserved_list; break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; } __blist_add_buffer(list, jh); @@ -1991,88 +2195,3 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) spin_unlock(&journal->j_list_lock); __brelse(bh); } - -/* - * File inode in the inode list of the handle's transaction - */ -int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - - if (is_handle_aborted(handle)) - return -EIO; - - jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, - transaction->t_tid); - - /* - * First check whether inode isn't already on the transaction's - * lists without taking the lock. Note that this check is safe - * without the lock as we cannot race with somebody removing inode - * from the transaction. The reason is that we remove inode from the - * transaction only in journal_release_jbd_inode() and when we commit - * the transaction. We are guarded from the first case by holding - * a reference to the inode. 
We are safe against the second case - * because if jinode->i_transaction == transaction, commit code - * cannot touch the transaction because we hold reference to it, - * and if jinode->i_next_transaction == transaction, commit code - * will only file the inode where we want it. - */ - if (jinode->i_transaction == transaction || - jinode->i_next_transaction == transaction) - return 0; - - spin_lock(&journal->j_list_lock); - - if (jinode->i_transaction == transaction || - jinode->i_next_transaction == transaction) - goto done; - - /* On some different transaction's list - should be - * the committing one */ - if (jinode->i_transaction) { - J_ASSERT(jinode->i_next_transaction == NULL); - J_ASSERT(jinode->i_transaction == - journal->j_committing_transaction); - jinode->i_next_transaction = transaction; - goto done; - } - /* Not on any transaction list... */ - J_ASSERT(!jinode->i_next_transaction); - jinode->i_transaction = transaction; - list_add(&jinode->i_list, &transaction->t_inode_list); -done: - spin_unlock(&journal->j_list_lock); - - return 0; -} - -/* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. 
- */ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, - loff_t new_size) -{ - journal_t *journal; - transaction_t *commit_trans; - int ret = 0; - - if (!inode->i_transaction && !inode->i_next_transaction) - goto out; - journal = inode->i_transaction->t_journal; - spin_lock(&journal->j_state_lock); - commit_trans = journal->j_committing_transaction; - spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, - new_size, LLONG_MAX); - if (ret) - jbd2_journal_abort(journal, ret); - } -out: - return ret; -} diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 3dd2090..ec9cadf 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -381,38 +381,6 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) bit_spin_unlock(BH_JournalHead, &bh->b_state); } -/* Flags in jbd_inode->i_flags */ -#define __JI_COMMIT_RUNNING 0 -/* Commit of the inode data in progress. We use this flag to protect us from - * concurrent deletion of inode. We cannot use reference to inode for this - * since we cannot afford doing last iput() on behalf of kjournald - */ -#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) - -/** - * struct jbd_inode is the structure linking inodes in ordered mode - * present in a transaction so that we can sync them during commit. - */ -struct jbd2_inode { - /* Which transaction does this inode belong to? Either the running - * transaction or the committing one. [j_list_lock] */ - transaction_t *i_transaction; - - /* Pointer to the running transaction modifying inode's data in case - * there is already a committing transaction touching it. 
[j_list_lock] */ - transaction_t *i_next_transaction; - - /* List of inodes in the i_transaction [j_list_lock] */ - struct list_head i_list; - - /* VFS inode this inode belongs to [constant during the lifetime - * of the structure] */ - struct inode *i_vfs_inode; - - /* Flags of inode [j_list_lock] */ - unsigned int i_flags; -}; - struct jbd2_revoke_table_s; /** @@ -543,12 +511,24 @@ struct transaction_s struct journal_head *t_reserved_list; /* + * Doubly-linked circular list of all buffers under writeout during + * commit [j_list_lock] + */ + struct journal_head *t_locked_list; + + /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock] */ struct journal_head *t_buffers; /* + * Doubly-linked circular list of all data buffers still to be + * flushed before this transaction can be committed [j_list_lock] + */ + struct journal_head *t_sync_datalist; + + /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] @@ -587,12 +567,6 @@ struct transaction_s struct journal_head *t_log_list; /* - * List of inodes whose data we've modified in data=ordered mode. 
- * [j_list_lock] - */ - struct list_head t_inode_list; - - /* * Protects info related to handles */ spinlock_t t_handle_lock; @@ -1032,6 +1006,7 @@ extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); +extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); @@ -1071,10 +1046,6 @@ extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); -extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); -extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); -extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management @@ -1210,13 +1181,15 @@ static inline int jbd_space_needed(journal_t *journal) /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ -#define BJ_Metadata 1 /* Normal journaled metadata */ -#define BJ_Forget 2 /* Buffer superseded by this transaction */ -#define BJ_IO 3 /* Buffer is for temporary IO use */ -#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ -#define BJ_LogCtl 5 /* Buffer contains log descriptors */ -#define BJ_Reserved 6 /* Buffer is reserved for access by journal */ -#define BJ_Types 7 +#define BJ_SyncData 1 /* Normal data: flush before commit */ +#define BJ_Metadata 2 /* Normal 
journaled metadata */ +#define BJ_Forget 3 /* Buffer superseded by this transaction */ +#define BJ_IO 4 /* Buffer is for temporary IO use */ +#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ +#define BJ_LogCtl 6 /* Buffer contains log descriptors */ +#define BJ_Reserved 7 /* Buffer is reserved for access by journal */ +#define BJ_Locked 8 /* Locked for I/O during commit */ +#define BJ_Types 9 extern int jbd_blocks_per_page(struct inode *inode);