Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > media > main-src > by-pkgid > e536fc0c6270ec1d92a0fd41bb1f8360 > files > 120

rgmanager-2.0.52-28.el5_8.2.src.rpm

From b05534b22a95d15be95035448a20f6fa321a9a10 Mon Sep 17 00:00:00 2001
From: Lon Hohberger <lhh@redhat.com>
Date: Fri, 10 Sep 2010 14:45:15 -0400
Subject: [PATCH] rgmanager: Add independent subtree restart thresholds

__max_restarts        => number of restarts to tolerate
__restart_expire_time => time window in which those restarts are counted

__independent_subtree="1":
 - after threshold is exceeded for the subtree, restart the
   service
 - if no threshold, never restart the service

__independent_subtree="2":
 - after threshold is exceeded for the subtree, stop the
   subtree
 - if no threshold, stop the subtree on first failure

Resolves: rhbz#605733

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 rgmanager/src/daemons/restree.c |   97 ++++++++++++++++++++++++++++++++-------
 1 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/rgmanager/src/daemons/restree.c b/rgmanager/src/daemons/restree.c
index 0494cdc..60743a9 100644
--- a/rgmanager/src/daemons/restree.c
+++ b/rgmanager/src/daemons/restree.c
@@ -522,20 +522,54 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth)
 
 static inline void
 assign_restart_policy(resource_t *curres, resource_node_t *parent,
-		      resource_node_t *node)
+		      resource_node_t *node, int ccsfd, char *base)
 {
 	char *val;
 	int max_restarts = 0;
 	time_t restart_expire_time = 0;
+	char tok[1024];
 
 	node->rn_restart_counter = NULL;
 
 	if (!curres || !node)
 		return;
-	if (parent) /* Non-parents don't get one for now */
+	if (parent &&
+	    !(node->rn_flags & RF_INDEPENDENT))
 		return;
 
-	val = res_attr_value(curres, "max_restarts");
+	if (node->rn_flags & RF_INDEPENDENT) {
+		/* per-resource-node failures / expire times */
+		snprintf(tok, sizeof(tok), "%s/@__max_restarts", base);
+#ifndef NO_CCS
+		if (ccs_get(ccsfd, tok, &val) == 0) {
+#else
+		if (conf_get(tok, &val) == 0) {
+#endif
+			max_restarts = atoi(val);
+			if (max_restarts <= 0)
+				max_restarts = 0;
+			free(val);
+			
+			if (!max_restarts)
+				return;
+		}
+	
+		snprintf(tok, sizeof(tok), "%s/@__restart_expire_time", base);
+#ifndef NO_CCS
+		if (ccs_get(ccsfd, tok, &val) == 0) {
+#else
+		if (conf_get(tok, &val) == 0) {
+#endif
+			restart_expire_time = (time_t)expand_time(val);
+			if ((int64_t)restart_expire_time < 0)
+				restart_expire_time = 0;
+			free(val);
+		}
+
+		goto out_assign;
+	}
+
+	val = (char *)res_attr_value(curres, "max_restarts");
 	if (!val)
 		return;
 	max_restarts = atoi(val);
@@ -548,6 +582,7 @@ assign_restart_policy(resource_t *curres, resource_node_t *parent,
 			return;
 	}
 
+out_assign:
 	node->rn_restart_counter = restart_init(restart_expire_time,
 						max_restarts);
 }
@@ -637,20 +672,25 @@ do_load_resource(int ccsfd, char *base,
 	node->rn_state = RES_STOPPED;
 	node->rn_flags = 0;
 	node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
-	assign_restart_policy(curres, parent, node);
 
-	snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
+
+	if (parent) {
+		/* Independent subtree / non-critical for top-level is
+		 * not useful and can interfere with restart thresholds for
+		 * non critical resources */
+		snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
 #ifndef NO_CCS
-	if (ccs_get(ccsfd, tok, &ref) == 0) {
+		if (ccs_get(ccsfd, tok, &ref) == 0) {
 #else
-	if (conf_get(tok, &ref) == 0) {
+		if (conf_get(tok, &ref) == 0) {
 #endif
-		if (atoi(ref) == 1 || strcasecmp(ref, "yes") == 0)
-			node->rn_flags |= RF_INDEPENDENT;
-		if (atoi(ref) == 2 || strcasecmp(ref, "non-critical") == 0) {
-			curres->r_flags |= RF_NON_CRITICAL;
+			if (atoi(ref) == 1 || strcasecmp(ref, "yes") == 0)
+				node->rn_flags |= RF_INDEPENDENT;
+			if (atoi(ref) == 2 || strcasecmp(ref, "non-critical") == 0) {
+				curres->r_flags |= RF_NON_CRITICAL;
+			}
+			free(ref);
 		}
-		free(ref);
 	}
 
 	snprintf(tok, sizeof(tok), "%s/@__enforce_timeouts", base);
@@ -708,8 +748,11 @@ do_load_resource(int ccsfd, char *base,
 		 * resource is non-critical
 		 */
 		node->rn_flags |= RF_NON_CRITICAL | RF_INDEPENDENT;
+
 	}
 
+	assign_restart_policy(curres, parent, node, ccsfd, base);
+
 	*newnode = node;
 
 	list_insert(tree, node);
@@ -1377,7 +1420,7 @@ _res_op_internal(resource_node_t __attribute__ ((unused)) **tree,
 		 char *type, void *__attribute__((unused))ret, int realop,
 		 resource_node_t *node)
 {
-	int rv = 0, me, op;
+	int rv = 0, me, op, rte = 0;
 
 	/* Restore default operation. */
 	op = realop;
@@ -1443,6 +1486,10 @@ _res_op_internal(resource_node_t __attribute__ ((unused)) **tree,
 			/* Nothing to do - children are also disabled */
 			return 0;
 
+		if ((realop == RS_START || realop == RS_CONVALESCE) &&
+		     node->rn_flags & RF_INDEPENDENT)
+			restart_clear(node->rn_restart_counter);
+
 		pthread_mutex_lock(&node->rn_resource->r_mutex);
 
 		if (node->rn_flags & RF_RECONFIG &&
@@ -1496,14 +1543,22 @@ _res_op_internal(resource_node_t __attribute__ ((unused)) **tree,
 			   resources of this node must be restarted,
 			   but siblings of this node are not affected. */
 			if (node->rn_flags & RF_INDEPENDENT) {
-				if (node->rn_flags & RF_NON_CRITICAL) {
+				
+				rte = restart_threshold_exceeded(node->rn_restart_counter);
+				if ((node->rn_flags & RF_NON_CRITICAL) && (rte ||
+				    !node->rn_restart_counter)) {
 					mark_nodes(node, RES_FAILED,
 						   RF_NEEDSTOP | RF_QUIESCE, 0);
+					restart_clear(node->rn_restart_counter);
 					return SFL_RECOVERABLE|SFL_PARTIAL;
-				} else {
+				} else if (!rte) {
+					restart_add(node->rn_restart_counter);
 					mark_nodes(node, RES_FAILED,
 						   RF_NEEDSTART | RF_NEEDSTOP, 0);
 					return SFL_RECOVERABLE;
+				} else {
+					restart_clear(node->rn_restart_counter);
+					return SFL_FAILURE;
 				}
 			}
 
@@ -1527,15 +1582,23 @@ _res_op_internal(resource_node_t __attribute__ ((unused)) **tree,
 		  the resource tree. */
 		if (op == RS_STATUS && (rv & (SFL_FAILURE|SFL_RECOVERABLE)) &&
 		    (node->rn_flags & RF_INDEPENDENT)) {
+
+			rte = restart_threshold_exceeded(node->rn_restart_counter);
+
 			rv = SFL_RECOVERABLE;
-			if (node->rn_flags & RF_NON_CRITICAL) {
+			if ((node->rn_flags & RF_NON_CRITICAL) && (rte || 
+			    !node->rn_restart_counter)) {
 				/* if non-critical, just stop */
 				mark_nodes(node, RES_FAILED, RF_NEEDSTOP | RF_QUIESCE, 0);
 
 				rv |= SFL_PARTIAL;
-			} else {
+			} else if (!rte) {
+				restart_add(node->rn_restart_counter);
 				mark_nodes(node, RES_FAILED,
 					   RF_NEEDSTOP | RF_NEEDSTART, 0);
+			} else {
+				restart_clear(node->rn_restart_counter);
+				rv = SFL_FAILURE;
 			}
 		}
 	}
-- 
1.7.2.2