From c1de3c2fc07a756a6c96dbe6ebe17257b1e08d52 Mon Sep 17 00:00:00 2001 From: Lon Hohberger <lhh@redhat.com> Date: Wed, 1 Sep 2010 15:24:22 -0400 Subject: [PATCH] rgmanager: Add convalesce operation This option enables the administrator to repair a partially-failed service without affecting the rest of the service. Resolves: rhbz#605733 Signed-off-by: Lon Hohberger <lhh@redhat.com> --- rgmanager/include/resgroup.h | 1 + rgmanager/include/reslist.h | 1 + rgmanager/src/daemons/groups.c | 3 + rgmanager/src/daemons/restree.c | 14 ++++ rgmanager/src/daemons/rg_state.c | 119 ++++++++++++++++++++++++++++++++++++- rgmanager/src/daemons/rg_thread.c | 20 ++++++ rgmanager/src/utils/clusvcadm.c | 7 ++- 7 files changed, 161 insertions(+), 4 deletions(-) diff --git a/rgmanager/include/resgroup.h b/rgmanager/include/resgroup.h index 701d884..ded11f4 100644 --- a/rgmanager/include/resgroup.h +++ b/rgmanager/include/resgroup.h @@ -145,6 +145,7 @@ int svc_disable(char *svcName); int svc_fail(char *svcName); int svc_freeze(char *svcName); int svc_unfreeze(char *svcName); +int svc_convalesce(char *svcName); int svc_migrate(char *svcName, int target); int check_restart(char *svcName); int clear_restart(const char *rg_name); diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h index 1958c76..3f5e772 100644 --- a/rgmanager/include/reslist.h +++ b/rgmanager/include/reslist.h @@ -163,6 +163,7 @@ int res_start(resource_node_t **tree, resource_t *res, void *ret); int res_stop(resource_node_t **tree, resource_t *res, void *ret); int res_status(resource_node_t **tree, resource_t *res, void *ret); int res_status_inquiry(resource_node_t **tree, resource_t *res, void *ret); +int res_convalesce(resource_node_t **tree, resource_t *res, void *ret); int res_condstart(resource_node_t **tree, resource_t *res, void *ret); int res_condstop(resource_node_t **tree, resource_t *res, void *ret); int res_exec(resource_node_t *node, int op, const char *arg, int depth); diff --git 
a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c index 9818aa3..ecb7b85 100644 --- a/rgmanager/src/daemons/groups.c +++ b/rgmanager/src/daemons/groups.c @@ -993,6 +993,9 @@ group_op(char *groupname, int op) case RG_CONDSTART: ret = res_condstart(&_tree, res, NULL); break; + case RG_CONVALESCE: + ret = res_convalesce(&_tree, res, NULL); + break; } pthread_rwlock_unlock(&resource_lock); diff --git a/rgmanager/src/daemons/restree.c b/rgmanager/src/daemons/restree.c index 3c0ce5f..3a03f91 100644 --- a/rgmanager/src/daemons/restree.c +++ b/rgmanager/src/daemons/restree.c @@ -1641,6 +1641,20 @@ res_start(resource_node_t **tree, resource_t *res, void *ret) /** + Repair/fix/convalesce all occurrences of a resource in a tree + + @param tree Tree to search for our resource. + @param res Resource to repair + @param ret Passed through to _res_op + */ +int +res_convalesce(resource_node_t **tree, resource_t *res, void *ret) +{ + return _res_op(tree, res, NULL, ret, RS_CONVALESCE); +} + + +/** Start all occurrences of a resource in a tree @param tree Tree to search for our resource. 
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c index c1bcdc8..020b47c 100644 --- a/rgmanager/src/daemons/rg_state.c +++ b/rgmanager/src/daemons/rg_state.c @@ -865,6 +865,92 @@ svc_start(char *svcName, int req) /** + * Repair a partially-failed service without affecting the rest of it + */ +int +svc_convalesce(char *svcName) +{ + struct dlm_lksb lockp; + rg_state_t svcStatus; + int ret; + + if (rg_lock(svcName, &lockp) < 0) { + clulog(LOG_ERR, "#451: Unable to obtain cluster lock: %s\n", + strerror(errno)); + return RG_EFAIL; + } + + if (get_rg_state(svcName, &svcStatus) != 0) { + rg_unlock(&lockp); + clulog(LOG_ERR, "#461: Failed getting status for RG %s\n", + svcName); + return RG_EFAIL; + } + + switch(svcStatus.rs_state) { + case RG_STATE_STARTED: + break; + case RG_STATE_STARTING: + case RG_STATE_STOPPING: + case RG_STATE_RECOVER: + case RG_STATE_MIGRATE: + case RG_STATE_ERROR: + rg_unlock(&lockp); + return RG_EAGAIN; + default: + rg_unlock(&lockp); + return RG_EINVAL; + } + + if (svcStatus.rs_flags & RG_FLAG_FROZEN) { + rg_unlock(&lockp); + return RG_EFROZEN; + } + + if (svcStatus.rs_owner != (uint32_t)my_id()) { + rg_unlock(&lockp); + return RG_EFORWARD; + } + + if (!(svcStatus.rs_flags & RG_FLAG_PARTIAL)) { + rg_unlock(&lockp); + return RG_ERUN; + } + + rg_unlock(&lockp); + + clulog(LOG_INFO, "Repairing %s\n", svcName); + ret = group_op(svcName, RG_CONVALESCE); + + switch(ret) { + default: + clulog(LOG_WARNING, "Failed to repair %s\n", svcName); + /* Failure to restart a non-critical resource + * does not fail the service. 
*/ + return RG_EFAIL; + case 0: + clulog(LOG_INFO, "Repair of %s was successful\n", svcName); + break; + } + + /* Success - clear the partial flag in state info */ + if (rg_lock(svcName, &lockp) < 0) { + clulog(LOG_ERR, "#455: Unable to obtain cluster lock: %s\n", + strerror(errno)); + return RG_EFAIL; + } + + /* No need for a 'get' here since the service is still STARTED */ + svcStatus.rs_flags &= ~RG_FLAG_PARTIAL; + + set_rg_state(svcName, &svcStatus); + rg_unlock(&lockp); + + return 0; +} + + +/** * Migrate a service to another node. Relies on agent * operating synchronously */ @@ -1171,8 +1257,31 @@ svc_status(char *svcName) } /* For running services, if the return code is 0, we're done*/ - if (svcStatus.rs_state == RG_STATE_STARTED) - return handle_started_status(svcName, ret, &svcStatus); + if (svcStatus.rs_state == RG_STATE_STARTED) { + ret = handle_started_status(svcName, ret, &svcStatus); + + if (ret & SFL_PARTIAL) { + ret &= ~SFL_PARTIAL; + svcStatus.rs_flags |= RG_FLAG_PARTIAL; + if (rg_lock(svcName, &lockp) < 0) { + clulog(LOG_ERR, + "#481: Unable to obtain cluster lock: %s\n", + strerror(errno)); + return RG_EFAIL; + } + + if (set_rg_state(svcName, &svcStatus) != 0) { + rg_unlock(&lockp); + clulog(LOG_ERR, + "#482: Failed setting status for RG %s\n", + svcName); + return RG_EFAIL; + } + rg_unlock(&lockp); + } + + return ret; + } return handle_migrate_status(svcName, ret, &svcStatus); } @@ -1213,9 +1322,10 @@ handle_started_status(char *svcName, int ret, svcName); if (ret & SFL_PARTIAL) { clulog(LOG_NOTICE, "Note: Some non-critical " - "resources are still stopped.\n"); + "resources were stopped during recovery.\n"); clulog(LOG_NOTICE, "Run 'clusvcadm -c %s' to " "restore them to operation.\n", svcName); + return SFL_PARTIAL; } return 0; @@ -1322,6 +1432,7 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate) svcStatus.rs_last_owner = svcStatus.rs_owner; svcStatus.rs_owner = 0; svcStatus.rs_state = RG_STATE_STOPPED; + svcStatus.rs_flags = 0; if 
(set_rg_state(svcName, &svcStatus) != 0) { rg_unlock(&lockp); return RG_EFAIL; @@ -1352,6 +1463,7 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate) clulog(LOG_DEBUG, "%s is clean; skipping double-stop\n", svcName); svcStatus.rs_state = newstate; + svcStatus.rs_flags = 0; if (set_rg_state(svcName, &svcStatus) != 0) { clulog(LOG_ERR, "#52: Failed changing RG status\n"); @@ -1433,6 +1545,7 @@ _svc_stop_finish(char *svcName, int failed, uint32_t newstate) } svcStatus.rs_state = newstate; + svcStatus.rs_flags = 0; clulog(LOG_NOTICE, "Service %s is %s\n", svcName, rg_state_str(svcStatus.rs_state)); diff --git a/rgmanager/src/daemons/rg_thread.c b/rgmanager/src/daemons/rg_thread.c index d0c0b50..b89af97 100644 --- a/rgmanager/src/daemons/rg_thread.c +++ b/rgmanager/src/daemons/rg_thread.c @@ -293,6 +293,26 @@ resgroup_thread_main(void *arg) ret = RG_NONE; break; + case RG_CONVALESCE: + error = svc_convalesce(myname); + + if (error == 0) { + ret = RG_SUCCESS; + + pthread_mutex_lock(&my_queue_mutex); + purge_status_checks(&my_queue); + pthread_mutex_unlock(&my_queue_mutex); + } else if (error == RG_EFORWARD) { + ret = RG_NONE; + break; + } else { + /* + * Bad news. + */ + ret = RG_EFAIL; + } + break; + case RG_MIGRATE: error = svc_migrate(myname, req->rr_target); diff --git a/rgmanager/src/utils/clusvcadm.c b/rgmanager/src/utils/clusvcadm.c index 73f41cc..1bc5d9e 100644 --- a/rgmanager/src/utils/clusvcadm.c +++ b/rgmanager/src/utils/clusvcadm.c @@ -251,7 +251,7 @@ main(int argc, char **argv) return 1; } - while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:Z:U:FvR:s:qh?")) != EOF) { + while ((opt = getopt(argc, argv, "lSue:M:d:r:n:c:m:Z:U:FvR:s:qh?")) != EOF) { switch (opt) { case 'l': return do_lock(); @@ -328,6 +328,11 @@ main(int argc, char **argv) action = RG_UNFREEZE; svcname = optarg; break; + case 'c': + actionstr = "convalescing"; + action = RG_CONVALESCE; + svcname = optarg; + break; case 'q': close(STDOUT_FILENO); break; -- 1.7.2.2