commit d360c0537aa734205e49939de92c763696ef477b Author: David Teigland <teigland@redhat.com> Date: Wed Dec 9 16:53:33 2009 -0600 groupd: clean up leaving failed node rhbz#521817 Due to shutdown+failure scenarios that aren't fully understood, a node that fails while shutting down can cause the other nodes to get stuck trying to restart the clvmd group (whether other groups could be affected is unknown.) The other nodes will all show something like this from group_tool -v: dlm 1 clvmd 00010002 LEAVE_STOP_WAIT 1 100020002 1 and group_tool dump will show things like: 1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1 1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1 This fix is to more or less watch out for this very specific situation where things get messed up and forcibly clean things up so the other nodes aren't stuck. Signed-off-by: David Teigland <teigland@redhat.com> diff --git a/group/daemon/app.c b/group/daemon/app.c index df17896..03952df 100644 --- a/group/daemon/app.c +++ b/group/daemon/app.c @@ -228,6 +228,8 @@ struct recovery_set *get_recovery_set(int nodeid) and goes away, and then we get the add_recovery_set_cpg() matching the _cman() variant that we ignored? */ +static void clean_up_dead_node(int nodeid); + void add_recovery_set_cman(int nodeid) { struct recovery_set *rs; @@ -245,6 +247,9 @@ void add_recovery_set_cman(int nodeid) log_debug("free recovery set %d not running groupd", nodeid); list_del(&rs->list); free(rs); + + clean_up_dead_node(nodeid); + return; } @@ -1845,3 +1850,39 @@ void groupd_down(int nodeid) } } +/* More hacks to try to work around similar kinds of problems that don't + make much sense, bz 521817. 
I believe the following produces effectively
+   the same problem as in the bz, on one node:
+   service cman start (with groupd -s0, not sure if this could happen otherwise)
+   service clvmd start
+   killall -9 dlm_controld
+   killall -9 groupd
+   killall -9 aisexec
+
+   At this point, the clvmd group in groupd on the other nodes is stuck in
+   LEAVE_ALL_STOPPED waiting for a stopped message from the killed node.
+   The groupd cpg confchg would ordinarily clean things up, but that probably
+   doesn't do anything because the event type is LEAVE instead of a failure.
+   Another way to deal with this would possibly be to do it when we see
+   the nodeid leave the groupd cpg. */
+/* clean_up_dead_node() - unstick groups whose in-progress event is for a
+   node that has gone away.
+
+   nodeid: id of the node to clean up after; add_recovery_set_cman() passes
+   the node whose recovery set it has just freed.
+
+   Walks every group on gd_groups.  A group is examined only if it has an
+   app, that app has a current_event, and the nodeid recorded in that event
+   equals the nodeid argument.  Each examined group gets one log_group()
+   line carrying the nodeid and the event state.  Only when the state is
+   EST_LEAVE_STOP_WAIT is mark_node_stopped() called for that app and node.
+
+   Returns nothing.
+
+   NOTE(review): mark_node_stopped() is defined outside this hunk;
+   presumably it stands in for the stopped message that the dead node will
+   never send, letting the leave event proceed - confirm against its
+   definition. */
+static void clean_up_dead_node(int nodeid)
+{
+	group_t *g;
+	event_t *ev;
+
+	list_for_each_entry(g, &gd_groups, list) {
+		if (g->app && g->app->current_event &&
+		    g->app->current_event->nodeid == nodeid) {	/* in-progress event is for nodeid */
+			ev = g->app->current_event;
+
+			log_group(g, "clean_up_dead_node %d ev %d", nodeid,
+				  ev->state);
+
+			if (ev->state == EST_LEAVE_STOP_WAIT) {	/* any other state is only logged */
+				mark_node_stopped(g->app, nodeid);
+			}
+		}
+	}
+}
+