Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 893b4547bb5f5eb61d2f3af4281e5a38 > files > 165

cman-2.0.115-96.el5_8.1.src.rpm

commit fe9a89972834d0459c312bede9e4a32df52e445a
Author: Eduardo Damato <edamato@redhat.com>
Date:   Tue Sep 29 09:58:18 2009 -0400

    qdisk: Implement I/O timeouts in qdiskd
    
    This allows administrators to make qdiskd reboot the
    system if it can not write its status out for interval*tko
    seconds.
    
    Resolves: rhbz#511113
    
    Part 1/4
    
    Signed-off-by: Eduardo Damato <edamato@redhat.com>
    Signed-off-by: Lon Hohberger <lhh@redhat.com>

diff --git a/cman/man/qdisk.5 b/cman/man/qdisk.5
index 65b9956..513d56b 100644
--- a/cman/man/qdisk.5
+++ b/cman/man/qdisk.5
@@ -291,6 +291,13 @@ if it takes more than (interval * tko) seconds to complete a quorum disk
 pass.  The default for this value is 0 (off).
 
 .in 9
+\fIio_timeout\fP\fB="\fP0\fB"\fP
+.in 12
+If set to 1 (on), qdiskd will watch internal timers and reboot the node
+if it has not been able to write to the quorum disk for (interval * tko) seconds.
+The default for this value is 0 (off).
+
+.in 9
 \fIscheduler\fP\fB="\fPrr\fB"\fP
 .in 12
 Valid values are 'rr', 'fifo', and 'other'.  Selects the scheduling queue
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index 3509339..0b652b2 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -73,7 +73,8 @@ typedef enum {
 	RF_PARANOID = 0x8,
 	RF_ALLOW_KILL = 0x10,
 	RF_UPTIME = 0x20,
-	RF_CMAN_LABEL = 0x40
+	RF_CMAN_LABEL = 0x40,
+	RF_IOTIMEOUT = 0x80
 } run_flag_t;
 
 
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 81999a0..c86759e 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -867,7 +867,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
 	int low_id, bid_pending = 0, score, score_max, score_req,
 	    upgrade = 0, count, errors, error_cycles = 0;
 	memb_mask_t mask, master_mask;
-	struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval;
+	struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval, lastok;
 
 	ctx->qc_status = S_NONE;
 	
@@ -877,6 +877,9 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
 	interval.tv_usec = 0;
 	interval.tv_sec = ctx->qc_interval;
 	
+	lastok.tv_usec = 0;
+	lastok.tv_sec = 0;
+	
 	get_my_score(&score, &score_max);
 	if (score_max < ctx->qc_scoremin) {
 		clulog(LOG_WARNING, "Minimum score (%d) is impossible to "
@@ -1065,6 +1068,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
 			clulog(LOG_ERR, "Error writing to quorum disk\n");
 			errors++; /* this value isn't really used 
 				     at this point */
+ 		} else {
+ 			get_time(&lastok, ctx->qc_flags&RF_UPTIME);
 		}
 
 		/* write out our local status */
@@ -1073,11 +1078,27 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
 		/* Cycle. We could time the loop and sleep
 		   usleep(interval-looptime), but this is fine for now.*/
 		get_time(&newtime, ctx->qc_flags&RF_UPTIME);
-		_diff_tv(&diff, &oldtime, &newtime);
 		
+ 		/*
+		 * Reboot if the last successful heartbeat was longer ago than interval*TKO_COUNT
+		 */
+		_diff_tv(&diff, &lastok, &newtime);
+		if (_cmp_tv(&maxtime, &diff) == 1 &&
+		    ctx->qc_flags & RF_IOTIMEOUT) {
+			clulog(LOG_EMERG, "Failed to send a heartbeat within "
+			       "%d second%s (%d.%06d) - REBOOTING\n",
+			       (int)maxtime.tv_sec,
+			       maxtime.tv_sec==1?"":"s",
+			       (int)diff.tv_sec,
+			       (int)diff.tv_usec);
+			if (!(ctx->qc_flags & RF_DEBUG)) 
+				reboot(RB_AUTOBOOT);
+		}
+	
 		/*
 		 * Reboot if we didn't send a heartbeat in interval*TKO_COUNT
 		 */
+		_diff_tv(&diff, &oldtime, &newtime);
 		if (_cmp_tv(&maxtime, &diff) == 1 &&
 		    ctx->qc_flags & RF_PARANOID) {
 			clulog(LOG_EMERG, "Failed to complete a cycle within "
@@ -1347,6 +1368,15 @@ get_config_data(char *cluster_name, qd_ctx *ctx, struct h_data *h, int maxh,
 		free(val);
 	}
 	
+	/* default = off, so, 1 to turn on */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@io_timeout");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		if (!atoi(val))
+			ctx->qc_flags &= ~RF_IOTIMEOUT;
+		else
+			ctx->qc_flags |= RF_IOTIMEOUT;
+		free(val);
+	}
 	
 	/*
 	 * Get flag to see if we're supposed to reboot if we can't complete