Sophie: cman-2.0.115-68.el5

cman-2.0.115-68.el5_6.1.src.rpm

commit d360c0537aa734205e49939de92c763696ef477b
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 9 16:53:33 2009 -0600

    groupd: clean up leaving failed node
    
    rhbz#521817
    
    Due to shutdown+failure scenarios that aren't fully understood,
    a node that fails while shutting down can cause the other nodes
    to get stuck trying to restart the clvmd group (whether other
    groups could be affected is unknown.)
    
    The other nodes will all show something like this from group_tool -v:
    
    dlm              1     clvmd    00010002 LEAVE_STOP_WAIT 1 100020002 1
    
    and group_tool dump will show things like:
    
    1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
    1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
    
    This fix more or less watches for this very specific situation and,
    when it occurs, forcibly marks the failed node as stopped so that
    the other nodes aren't stuck.
    
    Signed-off-by: David Teigland <teigland@redhat.com>

diff --git a/group/daemon/app.c b/group/daemon/app.c
index df17896..03952df 100644
--- a/group/daemon/app.c
+++ b/group/daemon/app.c
@@ -228,6 +228,8 @@ struct recovery_set *get_recovery_set(int nodeid)
    and goes away, and then we get the add_recovery_set_cpg() matching
    the _cman() variant that we ignored? */
 
+static void clean_up_dead_node(int nodeid);
+
 void add_recovery_set_cman(int nodeid)
 {
 	struct recovery_set *rs;
@@ -245,6 +247,9 @@ void add_recovery_set_cman(int nodeid)
 		log_debug("free recovery set %d not running groupd", nodeid);
 		list_del(&rs->list);
 		free(rs);
+
+		clean_up_dead_node(nodeid);
+
 		return;
 	}
 
@@ -1845,3 +1850,39 @@ void groupd_down(int nodeid)
 	}
 }
 
+/* More hacks to try to work around similar kinds of problems that don't
+   make much sense, bz 521817.  I believe the following produces effectively
+   the same problem as in the bz, on one node:
+   service cman start (with groupd -s0, not sure if this could happen otherwise)
+   service clvmd start
+   killall -9 dlm_controld
+   killall -9 groupd
+   killall -9 aisexec
+
+   At this point, the clvmd group in groupd on the other nodes is stuck in
+   LEAVE_STOP_WAIT, waiting for a stopped message from the killed node.
+   The groupd cpg confchg would ordinarily clean things up, but that probably
+   doesn't do anything because the event type is LEAVE instead of a failure.
+   Another way to deal with this would possibly be to do it when we see
+   the nodeid leave the groupd cpg. */
+
+static void clean_up_dead_node(int nodeid)
+{
+	group_t *g;
+	event_t *ev;
+	/* Scan all groups for a current event that belongs to nodeid. */
+	list_for_each_entry(g, &gd_groups, list) {
+		if (g->app && g->app->current_event &&
+		    g->app->current_event->nodeid == nodeid) {
+			ev = g->app->current_event;
+			/* Log the event state the group was found in. */
+			log_group(g, "clean_up_dead_node %d ev %d", nodeid,
+				  ev->state);
+			/* Call mark_node_stopped() only for LEAVE_STOP_WAIT. */
+			if (ev->state == EST_LEAVE_STOP_WAIT) {
+				mark_node_stopped(g->app, nodeid);
+			}
+		}
+	}
+}
+