From 0fc5da5fdc190ed50e94640136fea177a900ea57 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 5 Aug 2010 17:05:26 -0500
Subject: [PATCH] fenced: use post_join_delay after cluster join

When the cluster has lost quorum due to a node failure,
the next event is generally a cluster node join which
gives the cluster quorum again.  With quorum, fenced
begins fencing any failed nodes, applying post_fail_delay
since the last cpg event was a node failure.  In this case,
however, post_join_delay is more appropriate since the
chances are good that nodes being fenced will be joining.

Detect this case where a node joins the cluster giving it
quorum, and use post_join_delay.

bz 575952

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fence/fenced/member_cman.c |   70 ++++++++++++++++++++++++++++++++++++++++++--
 fence/fenced/recover.c     |    6 +++-
 2 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
index 9e22ece..6ef74c3 100644
--- a/fence/fenced/member_cman.c
+++ b/fence/fenced/member_cman.c
@@ -17,9 +17,14 @@
 #define BUFLEN		128
 
 static cman_handle_t	ch;
-static int		cman_quorate;
+static cman_node_t	old_nodes[MAX_NODES];
+static int		old_node_count;
+static int		old_quorate;
 static cman_node_t	cman_nodes[MAX_NODES];
 static int		cman_node_count;
+static int		cman_quorate;
+int			cman_quorate_from_last_change;
+
 static char		name_buf[CMAN_MAX_NODENAME_LEN+1];
 
 extern struct list_head domains;
@@ -27,6 +32,26 @@ extern struct list_head domains;
 char			*our_name;
 int			our_nodeid;
 
+static int _is_member(cman_node_t *node_list, int count, int nodeid)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		if (node_list[i].cn_nodeid == nodeid)
+			return node_list[i].cn_member;
+	}
+	return 0;
+}
+
+static int is_old_member(int nodeid)
+{
+	return _is_member(old_nodes, old_node_count, nodeid);
+}
+
+static int is_cman_member(int nodeid)
+{
+	return _is_member(cman_nodes, cman_node_count, nodeid);
+}
 
 static int name_equal(char *name1, char *name2)
 {
@@ -89,15 +114,54 @@ static cman_node_t *find_cluster_node_name(char *name)
 
 static void statechange(void)
 {
-	int rv;
+	int i, rv;
+	int removed = 0, added = 0;
+
+	old_quorate = cman_quorate;
+	old_node_count = cman_node_count;
+	memcpy(&old_nodes, &cman_nodes, sizeof(old_nodes));
 
 	cman_quorate = cman_is_quorate(ch);
 	cman_node_count = 0;
 	memset(&cman_nodes, 0, sizeof(cman_nodes));
-
 	rv = cman_get_nodes(ch, MAX_NODES, &cman_node_count, cman_nodes);
 	if (rv < 0)
 		log_error("cman_get_nodes error %d %d", rv, errno);
+
+	/* Never allow node ID 0 to be considered a member #315711 */
+	for (i = 0; i < cman_node_count; i++) {
+		if (cman_nodes[i].cn_nodeid == 0) {
+			cman_nodes[i].cn_member = 0;
+			break;
+		}
+	}
+
+	for (i = 0; i < old_node_count; i++) {
+		if (old_nodes[i].cn_member &&
+		    !is_cman_member(old_nodes[i].cn_nodeid)) {
+			removed++;
+			log_debug("cman: node %d removed",
+				   old_nodes[i].cn_nodeid);
+		}
+	}
+
+	for (i = 0; i < cman_node_count; i++) {
+		if (cman_nodes[i].cn_member &&
+		    !is_old_member(cman_nodes[i].cn_nodeid)) {
+			added++;
+			log_debug("cman: node %d added",
+				  cman_nodes[i].cn_nodeid);
+		}
+	}
+
+	if (removed) {
+		cman_quorate_from_last_change = 0;
+	} else if (added) {
+		if (!old_quorate && cman_quorate)
+			cman_quorate_from_last_change = 1;
+		else
+			cman_quorate_from_last_change = 0;
+	}
 }
 
 static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c
index 7f8aace..732ec81 100644
--- a/fence/fenced/recover.c
+++ b/fence/fenced/recover.c
@@ -18,6 +18,7 @@
 
 extern int our_nodeid;
 extern commandline_t comline;
+extern int cman_quorate_from_last_change;
 
 /* Fencing recovery algorithm
 
@@ -302,7 +303,7 @@ static void delay_fencing(fd_t *fd, int start_type)
 	fd_node_t *node;
 	char *delay_type;
 
-	if (start_type == GROUP_NODE_JOIN) {
+	if ((start_type == GROUP_NODE_JOIN) || cman_quorate_from_last_change) {
 		delay = comline.post_join_delay;
 		delay_type = "post_join_delay";
 	} else {
@@ -310,6 +311,9 @@ static void delay_fencing(fd_t *fd, int start_type)
 		delay_type = "post_fail_delay";
 	}
 
+	log_debug("delay_fencing %s %d quorate_from_last_change %d",
+		  delay_type, delay, cman_quorate_from_last_change);
+
 	if (delay == 0)
 		goto out;
 
-- 
1.7.1.1