Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > media > main-src > by-pkgid > e536fc0c6270ec1d92a0fd41bb1f8360 > files > 119

rgmanager-2.0.52-28.el5_8.2.src.rpm

From c1de3c2fc07a756a6c96dbe6ebe17257b1e08d52 Mon Sep 17 00:00:00 2001
From: Lon Hohberger <lhh@redhat.com>
Date: Wed, 1 Sep 2010 15:24:22 -0400
Subject: [PATCH] rgmanager: Add convalesce operation

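This operation, invoked with 'clusvcadm -c <service>', enables the
administrator to repair a partially-failed service (one whose
non-critical resources were stopped during recovery) without
affecting the rest of the service.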

Resolves: rhbz#605733

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 rgmanager/include/resgroup.h      |    1 +
 rgmanager/include/reslist.h       |    1 +
 rgmanager/src/daemons/groups.c    |    3 +
 rgmanager/src/daemons/restree.c   |   14 ++++
 rgmanager/src/daemons/rg_state.c  |  119 ++++++++++++++++++++++++++++++++++++-
 rgmanager/src/daemons/rg_thread.c |   20 ++++++
 rgmanager/src/utils/clusvcadm.c   |    7 ++-
 7 files changed, 161 insertions(+), 4 deletions(-)

diff --git a/rgmanager/include/resgroup.h b/rgmanager/include/resgroup.h
index 701d884..ded11f4 100644
--- a/rgmanager/include/resgroup.h
+++ b/rgmanager/include/resgroup.h
@@ -145,6 +145,7 @@ int svc_disable(char *svcName);
 int svc_fail(char *svcName);
 int svc_freeze(char *svcName);
 int svc_unfreeze(char *svcName);
+int svc_convalesce(char *svcName);
 int svc_migrate(char *svcName, int target);
 int check_restart(char *svcName);
 int clear_restart(const char *rg_name);
diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h
index 1958c76..3f5e772 100644
--- a/rgmanager/include/reslist.h
+++ b/rgmanager/include/reslist.h
@@ -163,6 +163,7 @@ int res_start(resource_node_t **tree, resource_t *res, void *ret);
 int res_stop(resource_node_t **tree, resource_t *res, void *ret);
 int res_status(resource_node_t **tree, resource_t *res, void *ret);
 int res_status_inquiry(resource_node_t **tree, resource_t *res, void *ret);
+int res_convalesce(resource_node_t **tree, resource_t *res, void *ret);
 int res_condstart(resource_node_t **tree, resource_t *res, void *ret);
 int res_condstop(resource_node_t **tree, resource_t *res, void *ret);
 int res_exec(resource_node_t *node, int op, const char *arg, int depth);
diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c
index 9818aa3..ecb7b85 100644
--- a/rgmanager/src/daemons/groups.c
+++ b/rgmanager/src/daemons/groups.c
@@ -993,6 +993,9 @@ group_op(char *groupname, int op)
 	case RG_CONDSTART:
 		ret = res_condstart(&_tree, res, NULL);
 		break;
+	case RG_CONVALESCE:
+		ret = res_convalesce(&_tree, res, NULL);
+		break;
 	}
 	pthread_rwlock_unlock(&resource_lock);
 
diff --git a/rgmanager/src/daemons/restree.c b/rgmanager/src/daemons/restree.c
index 3c0ce5f..3a03f91 100644
--- a/rgmanager/src/daemons/restree.c
+++ b/rgmanager/src/daemons/restree.c
@@ -1641,6 +1641,20 @@ res_start(resource_node_t **tree, resource_t *res, void *ret)
 
 
 /**
+   Repair/fix/convalesce all occurrences of a resource in a tree
+
+   @param tree		Tree to search for our resource.
+   @param res		Resource to repair
+   @param ret		Passed through to _res_op (group_op passes NULL)
+ */
+int
+res_convalesce(resource_node_t **tree, resource_t *res, void *ret)
+{
+	return _res_op(tree, res, NULL, ret, RS_CONVALESCE);
+}
+
+
+/**
    Start all occurrences of a resource in a tree
 
    @param tree		Tree to search for our resource.
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c
index c1bcdc8..020b47c 100644
--- a/rgmanager/src/daemons/rg_state.c
+++ b/rgmanager/src/daemons/rg_state.c
@@ -865,6 +865,92 @@ svc_start(char *svcName, int req)
 
 
 /**
+ * Repair a partially-failed (RG_FLAG_PARTIAL) service owned by this node.
+ */
+int
+svc_convalesce(char *svcName)
+{
+	struct dlm_lksb lockp;
+	rg_state_t svcStatus;
+	int ret;
+
+	if (rg_lock(svcName, &lockp) < 0) {
+		clulog(LOG_ERR, "#451: Unable to obtain cluster lock: %s\n",
+		       strerror(errno));
+		return RG_EFAIL;
+	}
+
+	if (get_rg_state(svcName, &svcStatus) != 0) {
+		rg_unlock(&lockp);
+		clulog(LOG_ERR, "#461: Failed getting status for RG %s\n",
+		       svcName);
+		return RG_EFAIL;
+	}
+
+	switch(svcStatus.rs_state) {
+	case RG_STATE_STARTED:
+		break;	/* only a started service proceeds to the repair */
+	case RG_STATE_STARTING:
+	case RG_STATE_STOPPING:
+	case RG_STATE_RECOVER:
+	case RG_STATE_MIGRATE:
+	case RG_STATE_ERROR:
+		rg_unlock(&lockp);
+		return RG_EAGAIN;
+	default:
+		rg_unlock(&lockp);
+		return RG_EINVAL;
+	}
+
+	if (svcStatus.rs_flags & RG_FLAG_FROZEN) {
+		rg_unlock(&lockp);
+		return RG_EFROZEN;	/* frozen services are not touched */
+	}
+
+	if (svcStatus.rs_owner != (uint32_t)my_id()) {
+		rg_unlock(&lockp);
+		return RG_EFORWARD;	/* service is owned by another node */
+	}
+
+	if (!(svcStatus.rs_flags & RG_FLAG_PARTIAL)) {
+		rg_unlock(&lockp);
+		return RG_ERUN;	/* partial flag not set: nothing to repair */
+	}
+
+	rg_unlock(&lockp);	/* the lock is not held across group_op() */
+
+	clulog(LOG_INFO, "Repairing %s\n", svcName);
+	ret = group_op(svcName, RG_CONVALESCE);
+
+	switch(ret) {
+	default:
+		clulog(LOG_WARNING, "Failed to repair %s\n", svcName);
+		/* Failure to restart a non-critical resource
+		 * does not fail the service. */
+		return RG_EFAIL;
+	case 0:
+		clulog(LOG_INFO, "Repair of %s was successful\n", svcName);
+		break;
+	}
+
+	/* Success - clear the partial flag in state info */
+	if (rg_lock(svcName, &lockp) < 0) {
+		clulog(LOG_ERR, "#455: Unable to obtain cluster lock: %s\n",
+			   strerror(errno));
+		return RG_EFAIL;
+	}
+
+	/* Reuses svcStatus read before the repair; assumes still STARTED */
+	svcStatus.rs_flags &= ~RG_FLAG_PARTIAL;
+
+	set_rg_state(svcName, &svcStatus); /* NOTE(review): result unchecked */
+	rg_unlock(&lockp);
+
+	return 0;
+}
+
+
+/**
  * Migrate a service to another node.  Relies on agent
  * operating synchronously
  */
@@ -1171,8 +1257,31 @@ svc_status(char *svcName)
 	}
 
 	/* For running services, if the return code is 0, we're done*/
-	if (svcStatus.rs_state == RG_STATE_STARTED)
-		return handle_started_status(svcName, ret, &svcStatus);
+	if (svcStatus.rs_state == RG_STATE_STARTED) {
+		ret = handle_started_status(svcName, ret, &svcStatus);
+
+		if (ret & SFL_PARTIAL) {
+			ret &= ~SFL_PARTIAL;
+			svcStatus.rs_flags |= RG_FLAG_PARTIAL;
+			if (rg_lock(svcName, &lockp) < 0) {
+				clulog(LOG_ERR,
+				       "#481: Unable to obtain cluster lock: %s\n",
+				       strerror(errno));
+				return RG_EFAIL;
+			}
+
+			if (set_rg_state(svcName, &svcStatus) != 0) {
+				rg_unlock(&lockp);
+				clulog(LOG_ERR,
+				       "#482: Failed setting status for RG %s\n",
+				       svcName);
+				return RG_EFAIL;
+			}
+			rg_unlock(&lockp);
+		}
+
+		return ret;
+	}
 	
 	return handle_migrate_status(svcName, ret, &svcStatus);
 }
@@ -1213,9 +1322,10 @@ handle_started_status(char *svcName, int ret,
 		       svcName);
 		if (ret & SFL_PARTIAL) {
 			clulog(LOG_NOTICE, "Note: Some non-critical "
-			       "resources are still stopped.\n");
+			       "resources were stopped during recovery.\n");
 			clulog(LOG_NOTICE, "Run 'clusvcadm -c %s' to "
 			       "restore them to operation.\n", svcName);
+			return SFL_PARTIAL;
 		}
 
 		return 0;
@@ -1322,6 +1432,7 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate)
 		svcStatus.rs_last_owner = svcStatus.rs_owner;
 		svcStatus.rs_owner = 0;
 		svcStatus.rs_state = RG_STATE_STOPPED;
+		svcStatus.rs_flags = 0;
 		if (set_rg_state(svcName, &svcStatus) != 0) {
 			rg_unlock(&lockp);
 			return RG_EFAIL;
@@ -1352,6 +1463,7 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate)
 		clulog(LOG_DEBUG, "%s is clean; skipping double-stop\n",
 		       svcName);
 		svcStatus.rs_state = newstate;
+		svcStatus.rs_flags = 0;
 
 		if (set_rg_state(svcName, &svcStatus) != 0) {
 			clulog(LOG_ERR, "#52: Failed changing RG status\n");
@@ -1433,6 +1545,7 @@ _svc_stop_finish(char *svcName, int failed, uint32_t newstate)
 	}
 
 	svcStatus.rs_state = newstate;
+	svcStatus.rs_flags = 0;
 
 	clulog(LOG_NOTICE, "Service %s is %s\n", svcName,
 	       rg_state_str(svcStatus.rs_state));
diff --git a/rgmanager/src/daemons/rg_thread.c b/rgmanager/src/daemons/rg_thread.c
index d0c0b50..b89af97 100644
--- a/rgmanager/src/daemons/rg_thread.c
+++ b/rgmanager/src/daemons/rg_thread.c
@@ -293,6 +293,26 @@ resgroup_thread_main(void *arg)
 				ret = RG_NONE;
 			break;
 
+		case RG_CONVALESCE:
+			error = svc_convalesce(myname);
+
+			if (error == 0) {
+				ret = RG_SUCCESS;
+
+				pthread_mutex_lock(&my_queue_mutex);
+				purge_status_checks(&my_queue);
+				pthread_mutex_unlock(&my_queue_mutex);
+			} else if (error == RG_EFORWARD) {
+				ret = RG_NONE;
+				break;
+			} else {
+				/*
+				 * Bad news.
+				 */
+				ret = RG_EFAIL;
+			}
+			break;
+
 		case RG_MIGRATE:
 			error = svc_migrate(myname, req->rr_target);
 
diff --git a/rgmanager/src/utils/clusvcadm.c b/rgmanager/src/utils/clusvcadm.c
index 73f41cc..1bc5d9e 100644
--- a/rgmanager/src/utils/clusvcadm.c
+++ b/rgmanager/src/utils/clusvcadm.c
@@ -251,7 +251,7 @@ main(int argc, char **argv)
 		return 1;
 	}
 
-	while ((opt = getopt(argc, argv, "lSue:M:d:r:n:m:Z:U:FvR:s:qh?")) != EOF) {
+	while ((opt = getopt(argc, argv, "lSue:M:d:r:n:c:m:Z:U:FvR:s:qh?")) != EOF) {
 		switch (opt) {
 		case 'l':
 			return do_lock();
@@ -328,6 +328,11 @@ main(int argc, char **argv)
 			action = RG_UNFREEZE;
 			svcname = optarg;
 			break;
+		case 'c':
+			actionstr = "convalescing";
+			action = RG_CONVALESCE;
+			svcname = optarg;
+			break;
 		case 'q':
 			close(STDOUT_FILENO);
 			break;
-- 
1.7.2.2