Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > d236c5da97a239a1b6991cfba2865b66 > files > 114

cman-2.0.115-68.el5_6.1.src.rpm

commit dcfffdf4e2ebebae62958673e53138982fb3af97
Author: David Teigland <teigland@redhat.com>
Date:   Mon Feb 8 15:26:03 2010 -0600

    cman/groupd/dlm_controld/gfs_controld: work around ipc deadlock
    
    bz 561892
    
    When there are many gfs file systems (more than roughly 120) in a
    two-node cluster and one of the nodes fails, groupd on the
    remaining node can deadlock with dlm_controld and gfs_controld.
    
    The problem is that groupd and the other daemons exchange so many
    messages (lots of fs's) so quickly (there are no other nodes to
    synchronize with) that the unix socket buffers fill up, and both
    daemons block while writing stop/start/stop_done/start_done
    messages to each other.  Since the daemons are single-threaded,
    a daemon that is blocked on write never reads, so neither one
    can unblock the other.
    
    To determine if you're having this problem, you can strace
    groupd, dlm_controld and gfs_controld, and notice that they
    are blocked writing strings starting with "stop" or "start".
    group_tool will hang since groupd is blocked.
    
    The solution has three main parts:
    1. dlm_controld queues its stop_done and start_done messages
       and waits to send them to groupd until groupd is finished
       sending all the stop/start messages.
    2. gfs_controld does the same only for stop_done messages
       (start_done messages are already naturally delayed here)
    3. groupd skips sending finish messages to dlm_controld,
       since dlm_controld does not use them for anything
    
    Each of these changes in behavior is disabled by default and
    needs to be configured explicitly:
    
    1. <dlm delay_done="2"/> in cluster.conf
    
    2. <gfs_controld delay_done="2"/> in cluster.conf
    
    3. SKIP_DLM_FINISH=1 in /etc/sysconfig/cman
       (adds the -f0 option to groupd, which does not read
        options from cluster.conf)
    
    The delay_done values are in seconds.  If the same problem
    persists with these settings, values of 4 or 8 might help.
    
    Signed-off-by: David Teigland <teigland@redhat.com>

diff --git a/cman/init.d/cman b/cman/init.d/cman
index f717e8b..78cc6ce 100755
--- a/cman/init.d/cman
+++ b/cman/init.d/cman
@@ -31,6 +31,11 @@
 #     The default is 60 seconds
 [ -z "$CMAN_SHUTDOWN_TIMEOUT" ] && CMAN_SHUTDOWN_TIMEOUT=60
 
+# SKIP_DLM_FINISH -- setting to 1 will cause groupd to be started with
+# -f0, which causes groupd to not send finish callbacks to dlm_controld
+# as part of a workaround for bz 561892.
+[ -z "$SKIP_DLM_FINISH" ] && SKIP_DLM_FINISH=0
+
 # FENCED_START_TIMEOUT -- amount of time to wait for starting fenced
 #     before giving up.  If FENCED_START_TIMEOUT is positive, then we will
 #     wait FENCED_START_TIMEOUT seconds before giving up and failing when
@@ -61,6 +66,7 @@
 
 [ -n "$NODENAME" ] && cman_join_opts+=" -n $NODENAME"
 
+
 load_modules()
 {
     errmsg=$( /sbin/modprobe configfs 2>&1 ) || return 1
@@ -145,21 +151,31 @@ start_qdiskd()
 start_daemons()
 {
     status groupd &> /dev/null
-    if [ $? -ne 0 ]; then
-	errmsg=$( /sbin/groupd 2>&1 ) || return 1
+    if [ $? -ne 0 ]
+    then
+        if [ $SKIP_DLM_FINISH -eq 1 ]
+        then
+	    errmsg=$( /sbin/groupd -f0 2>&1 ) || return 1
+        else
+	    errmsg=$( /sbin/groupd 2>&1 ) || return 1
+        fi
     fi
+
     status fenced &> /dev/null
     if [ $? -ne 0 ]; then
 	errmsg=$( /sbin/fenced 2>&1 ) || return 1
     fi
+
     status dlm_controld &> /dev/null
     if [ $? -ne 0 ]; then
 	errmsg=$( /sbin/dlm_controld 2>&1 ) || return 1
     fi
+
     status gfs_controld &> /dev/null
     if [ $? -ne 0 ]; then
 	errmsg=$( /sbin/gfs_controld 2>&1 ) || return 1
     fi
+
     return 0
 }
 
diff --git a/group/daemon/main.c b/group/daemon/main.c
index f5dcc88..7a4fca4 100644
--- a/group/daemon/main.c
+++ b/group/daemon/main.c
@@ -15,7 +15,7 @@
 
 #include "gd_internal.h"
 
-#define OPTION_STRING			"Dhs:Vv"
+#define OPTION_STRING			"Dhs:f:Vv"
 #define LOCKFILE_NAME			"/var/run/groupd.pid"
 #define LOG_FILE				"/var/log/groupd.log"
 
@@ -27,6 +27,7 @@ uint32_t		gd_event_nr;
 char			*our_name;
 int			our_nodeid;
 int			cman_quorate;
+int			dlm_finish = 1;
 
 static int client_maxi;
 static int client_size = 0;
@@ -314,6 +315,12 @@ void app_start(app_t *a)
 void app_finish(app_t *a)
 {
 	char buf[GROUPD_MSGLEN];
+
+	if (!strncmp(client[a->client].type, "dlm", 3) && !dlm_finish) {
+		log_group(a->g, "skip finish");
+		return;
+	}
+
 	snprintf(buf, sizeof(buf), "finish %s %d",
 		 a->g->name, a->current_event->event_nr);
 	app_action(a, buf);
@@ -919,6 +926,7 @@ static void print_usage(void)
 	printf("  -D	       Enable debugging code and don't fork\n");
 	printf("  -h	       Print this help, then exit\n");
 	printf("  -s [0|1]     Enable (or disable) shutdown mode\n");
+	printf("  -f [0|1]     Send finish callbacks to dlm_controld, default 1\n");
 	printf("  -V	       Print program version information, then exit\n");
 }
 
@@ -945,6 +953,10 @@ static void decode_arguments(int argc, char **argv)
 			groupd_shutdown_opt = atoi(optarg);
 			break;
 
+		case 'f':
+			dlm_finish = atoi(optarg);
+			break;
+
 		case 'v':
 			groupd_debug_verbose++;
 			break;
diff --git a/group/dlm_controld/action.c b/group/dlm_controld/action.c
index 34e84fe..a7ea3a7 100644
--- a/group/dlm_controld/action.c
+++ b/group/dlm_controld/action.c
@@ -968,6 +968,43 @@ static void set_debug(int cd)
 	set_configfs_debug(rv);
 }
 
+#define DELAY_DONE_PATH "/cluster/dlm/@delay_done"
+
+static int get_ccs_delay_done(int cd)
+{
+	char path[PATH_MAX], *str;
+	int error, rv;
+
+	memset(path, 0, PATH_MAX);
+	sprintf(path, DELAY_DONE_PATH);
+
+	error = ccs_get(cd, path, &str);
+	if (error || !str)
+		return -1;
+
+	rv = atoi(str);
+
+	if (rv < 0) {
+		log_error("invalid delay_done from ccs");
+		rv = -1;
+	}
+
+	free(str);
+	log_error("ccs dlm/delay_done %d", rv);
+	return rv;
+}
+
+static void set_delay_done(int cd)
+{
+	int rv;
+
+	rv = get_ccs_delay_done(cd);
+	if (rv < 0)
+		return;
+
+	delay_done_cb = rv;
+}
+
 void set_ccs_options(void)
 {
 	int cd;
@@ -979,6 +1016,7 @@ void set_ccs_options(void)
 	set_protocol(cd);
 	set_timewarn(cd);
 	set_debug(cd);
+	set_delay_done(cd);
 
 	ccs_disconnect(cd);
 }
diff --git a/group/dlm_controld/deadlock.c b/group/dlm_controld/deadlock.c
index f21beda..0b1538a 100644
--- a/group/dlm_controld/deadlock.c
+++ b/group/dlm_controld/deadlock.c
@@ -15,7 +15,6 @@
 
 int deadlock_enabled = 0;
 
-extern struct list_head lockspaces;
 extern int our_nodeid;
 
 static SaCkptHandleT global_ckpt_h;
diff --git a/group/dlm_controld/dlm_daemon.h b/group/dlm_controld/dlm_daemon.h
index c164a81..0037d32 100644
--- a/group/dlm_controld/dlm_daemon.h
+++ b/group/dlm_controld/dlm_daemon.h
@@ -63,6 +63,16 @@ extern int daemon_debug_opt;
 extern int kernel_debug_opt;
 extern char daemon_debug_buf[256];
 
+extern int delay_done_cb;
+extern struct list_head stop_done_list;
+extern int stop_done_entries;
+extern struct list_head start_done_list;
+extern int start_done_entries;
+extern struct list_head lockspaces;
+extern int lockspace_count;
+extern uint64_t last_stop_time;
+extern uint64_t last_start_time;
+
 #define log_debug(fmt, args...) \
 do { \
 	snprintf(daemon_debug_buf, 255, "%ld " fmt "\n", time(NULL), ##args); \
@@ -123,6 +133,8 @@ char *nodeid2name(int nodeid);
 /* group.c */
 int setup_groupd(void);
 void process_groupd(int ci);
+void push_stop_done(void);
+void push_start_done(void);
 
 /* main.c */
 int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci));
diff --git a/group/dlm_controld/group.c b/group/dlm_controld/group.c
index 700edac..3024bef 100644
--- a/group/dlm_controld/group.c
+++ b/group/dlm_controld/group.c
@@ -24,6 +24,7 @@
    do the processing within the callback function itself */
 
 group_handle_t gh;
+
 static int cb_action;
 static char cb_name[MAX_GROUP_NAME_LEN+1];
 static int cb_event_nr;
@@ -31,6 +32,7 @@ static unsigned int cb_id;
 static int cb_type;
 static int cb_member_count;
 static int cb_members[MAX_GROUP_MEMBERS];
+static int last_action;
 
 static void stop_cbfn(group_handle_t h, void *private, char *name)
 {
@@ -116,10 +118,108 @@ static unsigned int replace_zero_global_id(char *name)
 	return new_id;
 }
 
+struct save_done {
+	struct list_head list;
+	int event_nr;
+	char name[MAX_GROUP_NAME_LEN+1];
+};
+
+void push_stop_done(void)
+{
+	struct save_done *sd, *safe;
+	int count;
+
+	if (stop_done_entries > 1)
+		log_error("push_stop_done begin %d", stop_done_entries);
+
+	count = 0;
+
+	list_for_each_entry_safe(sd, safe, &stop_done_list, list) {
+		group_stop_done(gh, sd->name);
+		list_del(&sd->list);
+		free(sd);
+		stop_done_entries--;
+		count++;
+	}
+
+	if (count > 1)
+		log_error("push_stop_done end %d", count);
+}
+
+/* only queue if the last action is also stop? */
+
+static int queue_stop_done(char *name)
+{
+	struct save_done *sd;
+
+	if (!delay_done_cb)
+		return -1;
+
+	if (last_action != DO_STOP)
+		return -1;
+
+	sd = malloc(sizeof(struct save_done));
+	if (!sd)
+		return -1;
+
+	memset(sd, 0, sizeof(struct save_done));
+	strcpy(sd->name, name);
+	list_add_tail(&sd->list, &stop_done_list);
+	stop_done_entries++;
+
+	return 0;
+}
+
+void push_start_done(void)
+{
+	struct save_done *sd, *safe;
+	int count;
+
+	if (start_done_entries > 1)
+		log_error("push_start_done begin %d", start_done_entries);
+
+	count = 0;
+
+	list_for_each_entry_safe(sd, safe, &start_done_list, list) {
+		group_start_done(gh, sd->name, sd->event_nr);
+		list_del(&sd->list);
+		free(sd);
+		start_done_entries--;
+		count++;
+	}
+
+	if (count > 1)
+		log_error("push_start_done end %d", count);
+}
+
+static int queue_start_done(char *name, int event_nr)
+{
+	struct save_done *sd;
+
+	if (!delay_done_cb)
+		return -1;
+
+	if (last_action != DO_START)
+		return -1;
+
+	sd = malloc(sizeof(struct save_done));
+	if (!sd)
+		return -1;
+
+	memset(sd, 0, sizeof(struct save_done));
+	strcpy(sd->name, name);
+	sd->event_nr = event_nr;
+	list_add_tail(&sd->list, &start_done_list);
+	start_done_entries++;
+
+	return 0;
+}
+
 void process_groupd(int ci)
 {
 	struct lockspace *ls;
 	int error = 0, val;
+	int rv;
 
 	group_dispatch(gh);
 
@@ -137,7 +237,10 @@ void process_groupd(int ci)
 	case DO_STOP:
 		log_debug("groupd callback: stop %s", cb_name);
 		set_control(cb_name, 0);
-		group_stop_done(gh, cb_name);
+		rv = queue_stop_done(cb_name);
+		if (rv < 0)
+			group_stop_done(gh, cb_name);
+		last_stop_time = time(NULL);
 		break;
 
 	case DO_START:
@@ -154,20 +257,27 @@ void process_groupd(int ci)
 		/* the dlm doesn't need/use a "finish" stage following
 		   start, so we can just do start_done immediately */
 
-		group_start_done(gh, cb_name, cb_event_nr);
 
-		if (!ls->joining)
+		if (!ls->joining) {
+			rv = queue_start_done(cb_name, cb_event_nr);
+			if (rv < 0)
+				group_start_done(gh, cb_name, cb_event_nr);
+			last_start_time = time(NULL);
 			break;
+		} else {
+			group_start_done(gh, cb_name, cb_event_nr);
 
-		ls->joining = 0;
-		log_debug("join event done %s", cb_name);
+			ls->joining = 0;
+			log_debug("join event done %s", cb_name);
 
-		/* this causes the dlm_new_lockspace() call (typically from
-		   mount) to complete */
-		set_event_done(cb_name, 0);
+			/* this causes the dlm_new_lockspace() call (typically from
+		   	   mount) to complete */
+			set_event_done(cb_name, 0);
 
-		join_deadlock_cpg(ls);
-		break;
+			join_deadlock_cpg(ls);
+			last_start_time = time(NULL);
+			break;
+		}
 
 	case DO_SETID:
 		log_debug("groupd callback: set_id %s %x", cb_name, cb_id);
@@ -195,6 +305,7 @@ void process_groupd(int ci)
 		set_event_done(cb_name, val);
 		leave_deadlock_cpg(ls);
 		list_del(&ls->list);
+		lockspace_count--;
 		free(ls);
 		break;
 
@@ -206,6 +317,8 @@ void process_groupd(int ci)
 		error = -EINVAL;
 	}
 
+	last_action = cb_action;
+
 	cb_action = 0;
  out:
 	return;
@@ -215,6 +328,11 @@ int setup_groupd(void)
 {
 	int rv;
 
+	INIT_LIST_HEAD(&stop_done_list);
+	INIT_LIST_HEAD(&start_done_list);
+	stop_done_entries = 0;
+	start_done_entries = 0;
+
 	gh = group_init(NULL, "dlm", 1, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_error("group_init error %p %d", gh, errno);
diff --git a/group/dlm_controld/main.c b/group/dlm_controld/main.c
index 1588605..d47ea63 100644
--- a/group/dlm_controld/main.c
+++ b/group/dlm_controld/main.c
@@ -17,15 +17,13 @@
 #include <linux/dlm.h>
 #include <linux/dlm_netlink.h>
 
-#define OPTION_STRING			"KDhVd:"
+#define OPTION_STRING			"KDhVd:y:"
 #define LOCKFILE_NAME			"/var/run/dlm_controld.pid"
 
 #define DEADLOCK_CHECK_SECS		10
 
 #define NALLOC 16
 
-struct list_head lockspaces;
-
 extern group_handle_t gh;
 extern int deadlock_enabled;
 
@@ -265,6 +263,7 @@ static void process_uevent(int ci)
 
 		ls->joining = 1;
 		list_add(&ls->list, &lockspaces);
+		lockspace_count++;
 
 		rv = group_join(gh, argv[3]);
 
@@ -611,8 +610,11 @@ void cluster_dead(int ci)
 static int loop(void)
 {
 	int rv, i;
+	int poll_timeout = -1;
 	void (*workfn) (int ci);
 	void (*deadfn) (int ci);
+	uint64_t push_begin = 0;
+	uint64_t now;
 
 	rv = setup_listener();
 	if (rv < 0)
@@ -646,7 +648,7 @@ static int loop(void)
  for_loop:
 
 	for (;;) {
-		rv = poll(pollfd, client_maxi + 1, -1);
+		rv = poll(pollfd, client_maxi + 1, poll_timeout);
 		if (rv == -1 && errno == EINTR) {
 			if (daemon_quit && list_empty(&lockspaces)) {
 				clear_configfs();
@@ -672,6 +674,56 @@ static int loop(void)
 				deadfn(i);
 			}
 		}
+
+		if (delay_done_cb && !list_empty(&stop_done_list)) {
+			if (!push_begin) {
+				push_begin = time(NULL);
+				poll_timeout = 1000;
+			}
+			now = time(NULL);
+
+			if ((stop_done_entries == lockspace_count) ||
+			    (now - push_begin >= delay_done_cb)) {
+				if (stop_done_entries > 1) {
+					log_error("stop_done entries %d "
+						  "count %d begin %llu "
+						  "now %llu last stop %llu",
+						  stop_done_entries,
+						  lockspace_count,
+						  (unsigned long long)push_begin,
+						  (unsigned long long)now,
+						  (unsigned long long)last_stop_time);
+				}
+				push_stop_done();
+				push_begin = 0;
+				poll_timeout = -1;
+			}
+		}
+
+		if (delay_done_cb && !list_empty(&start_done_list)) {
+			if (!push_begin) {
+				push_begin = time(NULL);
+				poll_timeout = 1000;
+			}
+			now = time(NULL);
+
+			if ((start_done_entries == lockspace_count) ||
+			    (now - push_begin >= delay_done_cb)) {
+				if (start_done_entries > 1) {
+					log_error("start_done entries %d "
+						  "count %d begin %llu "
+						  "now %llu last start %llu",
+						  start_done_entries,
+						  lockspace_count,
+						  (unsigned long long)push_begin,
+						  (unsigned long long)now,
+						  (unsigned long long)last_start_time);
+				}
+				push_start_done();
+				push_begin = 0;
+				poll_timeout = -1;
+			}
+		}
 	}
 	rv = 0;
  out:
@@ -754,6 +806,7 @@ static void print_usage(void)
 #endif
 	printf("  -D	       Enable debugging code and don't fork\n");
 	printf("  -K	       Enable kernel dlm debugging messages\n");
+	printf("  -y <sec>     Delay done callbacks to groupd by this many seconds, default 0\n");
 	printf("  -h	       Print this help, then exit\n");
 	printf("  -V	       Print program version information, then exit\n");
 }
@@ -780,6 +833,10 @@ static void decode_arguments(int argc, char **argv)
 			print_usage();
 			exit(EXIT_SUCCESS);
 			break;
+
+		case 'y':
+			delay_done_cb = atoi(optarg);
+			break;
 #if DEADLOCK
 		case 'd':
 			deadlock_enabled = atoi(optarg);
@@ -844,6 +901,7 @@ int main(int argc, char **argv)
 	prog_name = argv[0];
 
 	INIT_LIST_HEAD(&lockspaces);
+	lockspace_count = 0;
 
 	decode_arguments(argc, argv);
 
@@ -871,4 +929,13 @@ char *prog_name;
 int daemon_debug_opt;
 char daemon_debug_buf[256];
 int kernel_debug_opt;
+int delay_done_cb = 0;
+struct list_head stop_done_list;
+int stop_done_entries;
+struct list_head start_done_list;
+int start_done_entries;
+struct list_head lockspaces;
+int lockspace_count;
+uint64_t last_stop_time;
+uint64_t last_start_time;
 
diff --git a/group/dlm_controld/member_cman.c b/group/dlm_controld/member_cman.c
index 1ce180c..b37e4ec 100644
--- a/group/dlm_controld/member_cman.c
+++ b/group/dlm_controld/member_cman.c
@@ -19,7 +19,6 @@ static cman_node_t      old_nodes[MAX_NODES];
 static int              old_node_count;
 static cman_node_t      cman_nodes[MAX_NODES];
 static int              cman_node_count;
-extern struct list_head lockspaces;
 
 static int is_member(cman_node_t *node_list, int count, int nodeid)
 {
diff --git a/group/gfs_controld/group.c b/group/gfs_controld/group.c
index 3717579..d2fff7c 100644
--- a/group/gfs_controld/group.c
+++ b/group/gfs_controld/group.c
@@ -23,6 +23,7 @@ static unsigned int cb_id;
 static int cb_type;
 static int cb_member_count;
 static int cb_members[MAX_GROUP_MEMBERS];
+static int last_action;
 
 int do_stop(struct mountgroup *mg);
 int do_finish(struct mountgroup *mg);
@@ -109,6 +110,55 @@ static unsigned int replace_zero_global_id(char *name)
 	return new_id;
 }
 
+struct save_done {
+	struct list_head list;
+	char name[MAX_GROUP_NAME_LEN+1];
+};
+
+void push_stop_done(void)
+{
+	struct save_done *sd, *safe;
+	int count;
+
+	if (stop_done_entries > 1)
+		log_error("push_stop_done begin %d", stop_done_entries);
+
+	count = 0;
+
+	list_for_each_entry_safe(sd, safe, &stop_done_list, list) {
+		group_stop_done(gh, sd->name);
+		list_del(&sd->list);
+		free(sd);
+		stop_done_entries--;
+		count++;
+	}
+
+	if (count > 1)
+		log_error("push_stop_done end %d", count);
+}
+
+int queue_stop_done(char *name)
+{
+	struct save_done *sd;
+
+	if (!delay_done_cb)
+		return -1;
+
+	if (last_action != DO_STOP)
+		return -1;
+
+	sd = malloc(sizeof(struct save_done));
+	if (!sd)
+		return -1;
+
+	memset(sd, 0, sizeof(struct save_done));
+	strcpy(sd->name, name);
+	list_add_tail(&sd->list, &stop_done_list);
+	stop_done_entries++;
+
+	return 0;
+}
+
 int process_groupd(void)
 {
 	struct mountgroup *mg;
@@ -132,6 +182,7 @@ int process_groupd(void)
 
 	switch (cb_action) {
 	case DO_STOP:
+		last_stop_time = time(NULL);
 		log_debug("groupd cb: stop %s", cb_name);
 		mg->last_callback = DO_STOP;
 		mg->last_stop = mg->last_start;
@@ -170,6 +221,7 @@ int process_groupd(void)
 		error = -EINVAL;
 	}
 
+	last_action = cb_action;
  out:
 	cb_action = 0;
 	return error;
@@ -179,6 +231,9 @@ int setup_groupd(void)
 {
 	int rv;
 
+	INIT_LIST_HEAD(&stop_done_list);
+	stop_done_entries = 0;
+
 	gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL,
 			&callbacks, 10);
 	if (!gh) {
diff --git a/group/gfs_controld/lock_dlm.h b/group/gfs_controld/lock_dlm.h
index 746d0c7..cc12e1c 100644
--- a/group/gfs_controld/lock_dlm.h
+++ b/group/gfs_controld/lock_dlm.h
@@ -74,6 +74,13 @@ extern char dump_buf[DUMP_SIZE];
 extern int dump_point;
 extern int dump_wrap;
 
+extern int delay_done_cb;
+extern struct list_head stop_done_list;
+extern int stop_done_entries;
+extern struct list_head mounts;
+extern int mountgroup_count;
+extern uint64_t last_stop_time;
+
 extern void daemon_dump_save(void);
 
 #define log_debug(fmt, args...) \
@@ -283,6 +290,9 @@ int setup_plocks(void);
 int process_plocks(void);
 void exit_cman(void);
 
+void push_stop_done(void);
+int queue_stop_done(char *name);
+
 int do_mount(int ci, char *dir, char *type, char *proto, char *table,
 	     char *options, char *dev, struct mountgroup **mg_ret);
 int do_unmount(int ci, char *dir, int mnterr);
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index 7293938..5a70f60 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -41,7 +41,6 @@ struct client {
 	int another_mount;
 };
 
-extern struct list_head mounts;
 extern struct list_head withdrawn_mounts;
 extern group_handle_t gh;
 
@@ -545,6 +544,8 @@ int setup_uevent(void)
 int loop(void)
 {
 	int rv, i, f, error, poll_timeout = -1, ignore_plocks_fd = 0;
+	uint64_t push_begin = 0;
+	uint64_t now;
 
 	rv = listen_fd = setup_listen();
 	if (rv < 0)
@@ -655,6 +656,31 @@ int loop(void)
 				}
 			}
 		}
+
+                if (delay_done_cb && !list_empty(&stop_done_list)) {
+                        if (!push_begin) {
+                                push_begin = time(NULL);
+                                poll_timeout = 1000;
+                        }
+                        now = time(NULL);
+
+                        if ((stop_done_entries == mountgroup_count) ||
+                            (now - push_begin >= delay_done_cb)) {
+                                if (stop_done_entries > 1) {
+                                        log_error("stop_done entries %d "
+                                                  "count %d begin %llu "
+                                                  "now %llu last stop %llu",
+                                                  stop_done_entries,
+                                                  mountgroup_count,
+                                                  (unsigned long long)push_begin,
+                                                  (unsigned long long)now,
+                                                  (unsigned long long)last_stop_time);
+                                }
+                                push_stop_done();
+                                push_begin = 0;
+                                poll_timeout = -1;
+                        }
+		}
 	}
 	rv = 0;
  out:
@@ -666,11 +692,12 @@ int loop(void)
 #define DROP_RESOURCES_TIME_PATH "/cluster/gfs_controld/@drop_resources_time"
 #define DROP_RESOURCES_COUNT_PATH "/cluster/gfs_controld/@drop_resources_count"
 #define DROP_RESOURCES_AGE_PATH "/cluster/gfs_controld/@drop_resources_age"
+#define DELAY_DONE_PATH "/cluster/gfs_controld/@delay_done"
 
 static void set_ccs_config(void)
 {
 	char path[PATH_MAX], *str;
-	int i = 0, cd, error;
+	int i = 0, cd, error, rv;
 
 	while ((cd = ccs_connect()) < 0) {
 		sleep(1);
@@ -738,6 +765,23 @@ static void set_ccs_config(void)
 	}
 	if (str)
 		free(str);
+
+	memset(path, 0, PATH_MAX);
+	snprintf(path, PATH_MAX, "%s", DELAY_DONE_PATH);
+	str = NULL;
+
+	error = ccs_get(cd, path, &str);
+	if (!error) {
+		rv = atoi(str);
+		if (rv < 0) {
+			log_error("invalid delay_done from ccs");
+		} else {
+			delay_done_cb = rv;
+			log_error("ccs gfs_controld/delay_done %d", rv);
+		}
+	}
+	if (str)
+		free(str);
 }
 
 static void lockfile(void)
@@ -824,6 +868,7 @@ static void print_usage(void)
 	printf("               Default is %u\n", DEFAULT_DROP_RESOURCES_COUNT);
 	printf("  -a <ms>      drop resources age (milliseconds)\n");
 	printf("               Default is %u\n", DEFAULT_DROP_RESOURCES_AGE);
+	printf("  -y <sec>     Delay done callbacks to groupd by this many seconds, default 0\n");
 	printf("  -h	       Print this help, then exit\n");
 	printf("  -V	       Print program version information, then exit\n");
 }
@@ -881,6 +926,10 @@ static void decode_arguments(int argc, char **argv)
 			opt_drop_resources_age = 1;
 			break;
 
+		case 'y':
+			delay_done_cb = atoi(optarg);
+			break;
+
 		case 'h':
 			print_usage();
 			exit(EXIT_SUCCESS);
@@ -946,6 +995,7 @@ int main(int argc, char **argv)
 
 	INIT_LIST_HEAD(&mounts);
 	INIT_LIST_HEAD(&withdrawn_mounts);
+	mountgroup_count = 0;
 
 	config_no_withdraw = DEFAULT_NO_WITHDRAW;
 	config_no_plock = DEFAULT_NO_PLOCK;
@@ -1007,4 +1057,10 @@ char daemon_debug_buf[256];
 char dump_buf[DUMP_SIZE];
 int dump_point;
 int dump_wrap;
+int delay_done_cb = 0;
+struct list_head stop_done_list;
+int stop_done_entries;
+struct list_head mounts;
+int mountgroup_count;
+uint64_t last_stop_time;
 
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index 52d96ff..3eec64f 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1676,6 +1676,7 @@ int do_mount(int ci, char *dir, char *type, char *proto, char *table,
 	}
 
 	list_add(&mg->list, &mounts);
+	mountgroup_count++;
 	group_join(gh, name);
 	rv = 0;
  out:
@@ -1996,6 +1997,7 @@ int do_unmount(int ci, char *dir, int mnterr)
 		free(mp);
 		if (list_empty(&mg->mountpoints)) {
 			list_del(&mg->list);
+			mountgroup_count--;
 			free(mg);
 		}
 		return 0;
@@ -2342,7 +2344,9 @@ int do_stop(struct mountgroup *mg)
 		}
 	}
  out:
-	group_stop_done(gh, mg->name);
+	rv = queue_stop_done(mg->name);
+	if (rv < 0)
+		group_stop_done(gh, mg->name);
 	return 0;
 }