Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 1065

kernel-2.6.18-238.el5.src.rpm

From: Robert S Peterson <rpeterso@redhat.com>
Date: Tue, 6 Jul 2010 22:51:51 -0400
Subject: [fs] gfs2: fix stuck in inode wait, no glocks stuck
Message-id: <713432751.26941278456711997.JavaMail.root@zmail06.collab.prod.int.phx2.redhat.com>
Patchwork-id: 26722
O-Subject: [PATCH RHEL5.6] bz595397  - GFS2: stuck in inode wait, no glocks stuck
Bugzilla: 595397
RH-Acked-by: Steven Whitehouse <swhiteho@redhat.com>

Hi,

This patch fixes three bugs having to do with the reclaiming
of unlinked dinodes.  The three were discovered during RHEL6
testing and reported as these bugzilla numbers:

bug #570182 - GFS2: glock held by ended process
bug #583737 - GFS2: stuck in inode wait, no glocks stuck
bug #604244 - GFS2: kernel NULL pointer dereference from dlm_astd

This patch is a combined version of three patches that already
appear upstream in the -nmw git repository and in the RHEL6 kernel.

Due to the serious nature of the problems, the patches were
back-ported to RHEL5 for inclusion in the RHEL5.6 kernel under
the bugzilla bug #595397.

The first bug #570182 was caused by GFS2 not following its documented
lock ordering rules while reclaiming unlinked dinodes.  The fix
was to split the gfs2_inode_lookup function into a separate function
just for reclaim: gfs2_process_unlinked_inode.  The new function
follows the lock ordering rules properly by doing two things:
First, by using a "try" lock during the lookup.  Second, by
returning the dinode block number to a higher-level function that
can deal with the inode outside the locks that were causing deadlock,
(rather than to try to do the locking while the other locks were held).

The second bug #583737 was caused by a problem with the patch
for the first bug #570182.  This is the combined patch.  Basically,
the timing window was nearly closed in the first patch, but there
was still a case where it could hang.

The third bug #604244 was due to a problem with the second patch.
Basically, if the "try" lock failed due to lock contention, the
glock was improperly released twice, which caused dlm_astd to
reference a pointer to freed storage.  Now we set the freed pointer
to NULL so that it's not unlocked a second time.

This patch was tested and found to be correct on the roth-0{1,2,3}
cluster using the brawl test.

Regards,

Bob Peterson
Red Hat GFS

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
--
 fs/gfs2/dir.c        |    2 +-
 fs/gfs2/glock.c      |    2 +-
 fs/gfs2/inode.c      |  118 +++++++++++++++++++++++++++++++++++++++++++------
 fs/gfs2/inode.h      |    4 +-
 fs/gfs2/ops_export.c |    2 +-
 fs/gfs2/ops_file.c   |    2 -
 fs/gfs2/ops_fstype.c |    2 +-
 fs/gfs2/rgrp.c       |   81 +++++++++++++++++++---------------
 8 files changed, 155 insertions(+), 58 deletions(-)

Signed-off-by: Jarod Wilson <jarod@redhat.com>

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8f664d5..f9cb6a9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1500,7 +1500,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
 		inode = gfs2_inode_lookup(dir->i_sb,
 				be16_to_cpu(dent->de_type),
 				be64_to_cpu(dent->de_inum.no_addr),
-				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+				be64_to_cpu(dent->de_inum.no_formal_ino));
 		brelse(bh);
 		return inode;
 	}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 5370112..888d648 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -290,7 +290,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 		}
 		spin_unlock(&lru_lock);
 		write_unlock(gl_lock_addr(gl->gl_hash));
-		GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		glock_free(gl);
@@ -913,6 +912,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 	gh->gh_flags = flags;
 	gh->gh_iflags = 0;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	gh->gh_owner_pid = current->pid;
 }
 
 /**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2de4eea..212fae8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -162,7 +162,6 @@ void gfs2_set_iop(struct inode *inode)
  * @sb: The super block
  * @no_addr: The inode number
  * @type: The type of the inode
- * @skip_freeing: set this not return an inode if it is currently being freed.
  *
  * Returns: A VFS inode, or an error
  */
@@ -170,17 +169,14 @@ void gfs2_set_iop(struct inode *inode)
 struct inode *gfs2_inode_lookup(struct super_block *sb,
 				unsigned int type,
 				u64 no_addr,
-				u64 no_formal_ino, int skip_freeing)
+				u64 no_formal_ino)
 {
 	struct inode *inode;
 	struct gfs2_inode *ip;
-	struct gfs2_glock *io_gl;
+	struct gfs2_glock *io_gl = NULL;
 	int error;
 
-	if (skip_freeing)
-		inode = gfs2_iget_skip(sb, no_addr);
-	else
-		inode = gfs2_iget(sb, no_addr);
+	inode = gfs2_iget(sb, no_addr);
 	ip = GFS2_I(inode);
 
 	if (!inode)
@@ -207,6 +203,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
 		ip->i_iopen_gh.gh_gl->gl_object = ip;
 
 		gfs2_glock_put(io_gl);
+		io_gl = NULL;
 
 		if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
 			goto gfs2_nfsbypass;
@@ -237,13 +234,107 @@ gfs2_nfsbypass:
 fail_glock:
 	gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
+	if (io_gl)
+		gfs2_glock_put(io_gl);
+fail_put:
+	if (inode->i_state & I_NEW)
+		ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+fail:
+	if (inode->i_state & I_NEW)
+		iget_failed(inode);
+	else
+		iput(inode);
+	return ERR_PTR(error);
+}
+
+/**
+ * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+ *                               and try to reclaim it by doing iput.
+ *
+ * This function assumes no rgrp locks are currently held.
+ *
+ * @sb: The super block
+ * no_addr: The inode number
+ *
+ */
+
+void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
+{
+	struct gfs2_sbd *sdp;
+	struct gfs2_inode *ip;
+	struct gfs2_glock *io_gl = NULL;
+	int error;
+	struct gfs2_holder gh;
+	struct inode *inode;
+
+	inode = gfs2_iget_skip(sb, no_addr);
+
+	if (!inode)
+		return;
+
+	/* If it's not a new inode, someone's using it, so leave it alone. */
+	if (!(inode->i_state & I_NEW)) {
+		iput(inode);
+		return;
+	}
+
+	ip = GFS2_I(inode);
+	sdp = GFS2_SB(inode);
+	ip->i_no_formal_ino = -1;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+	if (unlikely(error))
+		goto fail;
+	ip->i_gl->gl_object = ip;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+	if (unlikely(error))
+		goto fail_put;
+
+	set_bit(GIF_INVALID, &ip->i_flags);
+	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
+				   &ip->i_iopen_gh);
+	if (unlikely(error))
+		goto fail_iopen;
+
+	ip->i_iopen_gh.gh_gl->gl_object = ip;
 	gfs2_glock_put(io_gl);
+	io_gl = NULL;
+
+	inode->i_mode = DT2IF(DT_UNKNOWN);
+
+	/*
+	 * We must read the inode in order to work out its type in
+	 * this case. Note that this doesn't happen often as we normally
+	 * know the type beforehand. This code path only occurs during
+	 * unlinked inode recovery (where it is safe to do this glock,
+	 * which is not true in the general case).
+	 */
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
+				   &gh);
+	if (unlikely(error))
+		goto fail_glock;
+
+	/* Inode is now uptodate */
+	gfs2_glock_dq_uninit(&gh);
+	gfs2_set_iop(inode);
+
+	/* The iput will cause it to be deleted. */
+	iput(inode);
+	return;
+
+fail_glock:
+	gfs2_glock_dq(&ip->i_iopen_gh);
+fail_iopen:
+	if (io_gl)
+		gfs2_glock_put(io_gl);
 fail_put:
 	ip->i_gl->gl_object = NULL;
 	gfs2_glock_put(ip->i_gl);
 fail:
-	iput(inode);
-	return ERR_PTR(error);
+	iget_failed(inode);
+	return;
 }
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -1010,9 +1101,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock2;
 
-	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
-					inum.no_addr,
-					inum.no_formal_ino, 0);
+	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
+				  inum.no_formal_ino);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
@@ -1034,13 +1124,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 
 	if (bh)
 		brelse(bh);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
 	return inode;
 
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
-	if (inode)
+	if (inode && !IS_ERR(inode))
 		iput(inode);
 fail_gunlock:
 	gfs2_glock_dq(ghs);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 59960b7..4dda02c 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -76,8 +76,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 void gfs2_inode_attr_in(struct gfs2_inode *ip);
 void gfs2_set_iop(struct inode *inode);
 struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
-				u64 no_addr, u64 no_formal_ino,
-				int skip_freeing);
+				u64 no_addr, u64 no_formal_ino);
+void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
 struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 42b98b1..b6a139d 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -212,7 +212,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
 	if (error)
 		goto fail;
 
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
 	if (!inode)
 		goto fail;
 	if (IS_ERR(inode)) {
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 46a484f..56dce7e 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -796,8 +796,6 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 
 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 	if (fl->fl_type & LOCK_MAND)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 75dbe03..d4b7de4 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -499,7 +499,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
 	struct dentry *dentry;
 	struct inode *inode;
 
-	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
 	if (IS_ERR(inode)) {
 		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
 		return PTR_ERR(inode);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9940ca3..ca2800e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -888,13 +888,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
  * @rgd: The rgrp
  *
- * Returns: The inode, if one has been found
+ * Returns: 0 if no error
+ *          The inode, if one has been found, in inode.
  */
 
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
-				     u64 skip)
+static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+			   u64 skip)
 {
-	struct inode *inode;
 	u32 goal = 0, block;
 	u64 no_addr;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -917,14 +917,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
 		if (no_addr == skip)
 			continue;
 		*last_unlinked = no_addr;
-		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
-					  no_addr, -1, 1);
-		if (!IS_ERR(inode))
-			return inode;
+		return no_addr;
 	}
 
 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1082,11 +1079,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
  * Try to acquire rgrp in way which avoids contending with others.
  *
  * Returns: errno
+ *          unlinked: the block address of an unlinked block to be reclaimed
  */
 
-static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+			  u64 *last_unlinked)
 {
-	struct inode *inode = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1096,7 +1094,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	int error, rg_locked;
 
 	/* Try recently successful rgrps */
-
+	*unlinked = 0;
 	rgd = recent_rgrp_first(sdp, ip->i_goal);
 
 	while (rgd) {
@@ -1113,21 +1111,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			/* If the rg came in already locked, there's no
+			   way we can recover from a failed try_rgrp_unlink
+			   because that would require an iput which can only
+			   happen after the rgrp is unlocked. */
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							   ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
-			rgd = recent_rgrp_next(rgd, 1);
-			break;
-
+			if (*unlinked)
+				return -EAGAIN;
+			/* fall through */
 		case GLR_TRYFAILED:
 			rgd = recent_rgrp_next(rgd, 0);
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 	}
 
@@ -1149,12 +1150,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							    ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
+			if (*unlinked)
+				return -EAGAIN;
 			break;
 
 		case GLR_TRYFAILED:
@@ -1162,7 +1164,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 
 		rgd = gfs2_rgrpd_get_next(rgd);
@@ -1171,7 +1173,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 
 		if (rgd == begin) {
 			if (++loops >= 3)
-				return ERR_PTR(-ENOSPC);
+				return -ENOSPC;
 			if (!skipped)
 				loops++;
 			flags = 0;
@@ -1189,7 +1191,7 @@ out:
 		forward_rgrp_set(sdp, rgd);
 	}
 
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1203,9 +1205,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct inode *inode;
 	int error = 0;
-	u64 last_unlinked = NO_BLOCK;
+	u64 last_unlinked = NO_BLOCK, unlinked;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
@@ -1221,17 +1222,27 @@ try_again:
 	if (error)
 		return error;
 
-	inode = get_local_rgrp(ip, &last_unlinked);
-	if (inode) {
+	/* Find an rgrp suitable for allocation.  If it encounters any unlinked
+	   dinodes along the way, error will equal -EAGAIN and unlinked will
+	   contains it block address. We then need to look up that inode and
+	   try to free it, and try the allocation again. */
+	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
+	if (error) {
 		if (ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		if (IS_ERR(inode))
-			return PTR_ERR(inode);
-		iput(inode);
+		if (error != -EAGAIN)
+			return error;
+
+		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
+		/* regardless of whether or not gfs2_process_unlinked_inode
+		   was successful, we don't want to repeat it again. */
+		last_unlinked = unlinked;
 		gfs2_log_flush(sdp, NULL);
+		error = 0;
+
 		goto try_again;
 	}
-
+	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;