From fec58c0da35b360c9454bacd28371a22fd7fb146 Mon Sep 17 00:00:00 2001 From: Lon Hohberger <lhh@redhat.com> Date: Wed, 16 Dec 2009 12:56:43 -0500 Subject: [PATCH] rgmanager: Make VF timeout scale with token timeout Rgmanager was not waiting long enough to account for failures mid-state transition, allowing the possibility for services to enter the 'failed' state erroneously. Resolves: rhbz#548133 Signed-off-by: Lon Hohberger <lhh@redhat.com> --- rgmanager/include/vf.h | 2 +- rgmanager/src/clulib/vft.c | 10 +++++++--- rgmanager/src/daemons/main.c | 22 +++++++++++++++++----- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/rgmanager/include/vf.h b/rgmanager/include/vf.h index abcca1b..3be7e15 100644 --- a/rgmanager/include/vf.h +++ b/rgmanager/include/vf.h @@ -170,7 +170,7 @@ typedef struct _key_node { /* * VF Stuff. VF only talks to peers. */ -int vf_init(int, uint16_t, vf_vote_cb_t, vf_commit_cb_t); +int vf_init(int, uint16_t, vf_vote_cb_t, vf_commit_cb_t, int); int vf_invalidate(void); int vf_shutdown(void); diff --git a/rgmanager/src/clulib/vft.c b/rgmanager/src/clulib/vft.c index aed1d30..26191b5 100644 --- a/rgmanager/src/clulib/vft.c +++ b/rgmanager/src/clulib/vft.c @@ -45,6 +45,7 @@ static key_node_t *key_list = NULL; /** List of key nodes. */ static int _node_id = (int)-1;/** Our node ID, set with vf_init. */ static uint16_t _port = 0; /** Our daemon ID, set with vf_init. */ +static int _vf_timeout = 10; /* * TODO: We could make it thread safe, but this might be unnecessary work @@ -104,7 +105,8 @@ static int tv_cmp(struct timeval *left, struct timeval *right); static uint32_t vf_try_commit(key_node_t *key_node); int vf_init(int my_node_id, uint16_t my_port, - vf_vote_cb_t vote_cb, vf_commit_cb_t commit_cb); + vf_vote_cb_t vote_cb, vf_commit_cb_t commit_cb, + int cluster_timeout); int vf_key_init(char *keyid, int timeout, vf_vote_cb_t vote_cb, vf_commit_cb_t commit_cb); static int vf_key_init_nt(char *keyid, int timeout, vf_vote_cb_t vote_cb, @@ -910,7 +912,7 @@ vf_server(void *arg) */ int vf_init(int my_node_id, uint16_t my_port, vf_vote_cb_t vcb, - vf_commit_cb_t ccb) + vf_commit_cb_t ccb, int cluster_timeout) { struct vf_args *args; msgctx_t *ctx; @@ -937,6 +939,8 @@ vf_init(int my_node_id, uint16_t my_port, vf_vote_cb_t vcb, pthread_mutex_lock(&vf_mutex); _port = my_port; _node_id = my_node_id; + if (cluster_timeout) + _vf_timeout = cluster_timeout; default_vote_cb = vcb; default_commit_cb = ccb; pthread_mutex_unlock(&vf_mutex); @@ -1248,7 +1252,7 @@ vf_write(cluster_member_list_t *membership, uint32_t flags, char *keyid, * See if we have a consensus =) */ if ((rv = (vf_unanimous(&everyone, trans, remain, - 5))) == VFR_OK) { + _vf_timeout))) == VFR_OK) { vf_send_commit(&everyone, trans); #ifdef DEBUG printf("VF: Consensus reached!\n"); diff --git a/rgmanager/src/daemons/main.c b/rgmanager/src/daemons/main.c index 7f12f08..601e7d0 100644 --- a/rgmanager/src/daemons/main.c +++ b/rgmanager/src/daemons/main.c @@ -45,7 +45,7 @@ #ifdef WRAP_THREADS void dump_thread_states(FILE *); #endif -int configure_rgmanager(int ccsfd, int debug); +int configure_rgmanager(int ccsfd, int debug, int *cluster_timeout); void node_event(int, int, int, int); void node_event_q(int, int, int, int); @@ -792,7 +792,7 @@ event_loop(msgctx_t *localctx, msgctx_t *clusterctx) if (need_reconfigure || check_config_update(&oldver, &newver)) { need_reconfigure = 0; - configure_rgmanager(-1, 0); + configure_rgmanager(-1, 0, NULL); config_event_q(oldver, newver); return 0; } @@ -848,11 +848,12 @@ statedump(int __attribute__ ((unused)) sig) * Configure logging based on data in cluster.conf */ int -configure_rgmanager(int ccsfd, int dbg) +configure_rgmanager(int ccsfd, int dbg, int *token_secs) { char *v; char internal = 0; int status_child_max = 0; + int tmp; if (ccsfd == -1) { internal = 1; @@ -861,6 +862,16 @@ configure_rgmanager(int ccsfd, int dbg) return -1; } + if (token_secs && ccs_get(ccsfd, "/cluster/totem/@token", &v) == 0) { + tmp = atoi(v); + if (tmp >= 1000) { + *token_secs = tmp / 1000; + if (tmp % 1000) + ++(*token_secs); + } + free(v); + } + if (ccs_get(ccsfd, "/cluster/rm/@log_facility", &v) == 0) { clu_set_facility(v); free(v); @@ -1011,6 +1022,7 @@ main(int argc, char **argv) msgctx_t *local_ctx; pthread_t th; cman_handle_t clu = NULL; + int cluster_timeout = 10; while ((rv = getopt(argc, argv, "wfdN")) != EOF) { switch (rv) { @@ -1089,7 +1101,7 @@ main(int argc, char **argv) We know we're quorate. At this point, we need to read the resource group trees from ccsd. */ - configure_rgmanager(-1, debug); + configure_rgmanager(-1, debug, &cluster_timeout); clulog(LOG_NOTICE, "Resource Group Manager Starting\n"); if (init_resource_groups(0, do_init, 0) != 0) { @@ -1132,7 +1144,7 @@ main(int argc, char **argv) ds_key_init("rg_lockdown", 32, 10); #else - if (vf_init(me.cn_nodeid, port, NULL, NULL) != 0) { + if (vf_init(me.cn_nodeid, port, NULL, NULL, cluster_timeout) != 0) { clulog(LOG_CRIT, "#11: Couldn't set up VF listen socket\n"); return -1; } -- 1.6.2.5