kernel-2.6.18-194.11.1.el5.src.rpm

From: Amerigo Wang <amwang@redhat.com>
Date: Mon, 31 Aug 2009 09:00:38 -0400
Subject: [sched] enable CONFIG_DETECT_HUNG_TASK support
Message-id: <20090831090130.4492.37170.sendpatchset@localhost.localdomain>
Patchwork-id: 20816
O-Subject: [PATCH RHEL5.x] backport CONFIG_DETECT_HUNG_TASK to RHEL5
Bugzilla: 506059
RH-Acked-by: Prarit Bhargava <prarit@redhat.com>
RH-Acked-by: Dave Anderson <anderson@redhat.com>

RHBZ:
https://bugzilla.redhat.com/show_bug.cgi?id=506059

Description:
The problem is that processes stuck in 'D' (uninterruptible) state cannot be
killed by the kernel, which in turn leaves the machine unable to shut down. See
Bug 419581 for details.

The recent 2.6.30 kernel has two new features for this, CONFIG_DETECT_HUNG_TASK
and CONFIG_BOOTPARAM_HUNG_TASK_PANIC.

The documentation claims the performance hit is negligible, so it would be good
to have this feature backported to RHEL5.
This will save us from the catastrophic situation where D-state processes do not
get killed by the kernel and the machine then hangs on shutdown.

With this feature in place, we can document these settings for situations where
people hit the D-state bug; a hypothetical tuning example follows.
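
For reference, a minimal userspace sketch of how the new knobs could be tuned
once the patch is applied. The /proc/sys/kernel paths come from the sysctl
table added below; the values chosen here are only illustrative, and the panic
behaviour can alternatively be set at boot with hung_task_panic=.

#include <stdio.h>

/* Write a string value to a procfs sysctl file; returns 0 on success. */
static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Warn after 60 seconds instead of the default 120. */
	write_knob("/proc/sys/kernel/hung_task_timeout_secs", "60");
	/* Panic (and reboot, if panic_timeout is set) when a hung task is found. */
	write_knob("/proc/sys/kernel/hung_task_panic", "1");
	return 0;
}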

KABI:
Thanks to Peter, we can avoid kABI breakage by putting the new member into
task_struct_aux instead of task_struct.
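
Roughly, the kABI-preserving layout looks like the sketch below (simplified
from the headers touched by the diff, not the actual RHEL5 sources). Existing
binary modules only reach task_struct_aux through a pointer that was already
part of task_struct, so the task_struct layout they were built against does
not change.

/* Simplified illustration of the aux-struct pattern. */
struct task_struct_aux {
	/* ... members previously displaced from task_struct ... */
	unsigned long last_switch_count;	/* new: hung task detection */
};

struct task_struct {
	/* ... layout unchanged by this patch ... */
	struct task_struct_aux *auxilliary;	/* pointer existed before this patch */
};

#define task_aux(tsk) ((tsk)->auxilliary)

/* The detector reads the new state without touching task_struct itself,
 * e.g. task_aux(t)->last_switch_count. */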

Brew:
https://brewweb.devel.redhat.com/taskinfo?taskID=1949037
https://brewweb.devel.redhat.com/taskinfo?taskID=1945636

Upstream status:
Commit e162b39a merged this feature.

Test status:
I wrote a kernel module to produce a hung task, and with this patch applied
the kernel detects it as expected. I will upload the code to the bugzilla
web page.
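
The actual reproducer is attached to the bugzilla; a minimal sketch of such a
module might look like the following (hypothetical names, illustrative only;
note the thread cannot be stopped cleanly once it is in D state):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *hog;

static int hang_fn(void *unused)
{
	/* Park this kthread in D state forever; it is never woken up,
	 * so khungtaskd should flag it after hung_task_timeout_secs. */
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule();
	return 0;
}

static int __init hang_init(void)
{
	hog = kthread_run(hang_fn, NULL, "dstate-hog");
	return IS_ERR(hog) ? PTR_ERR(hog) : 0;
}

static void __exit hang_exit(void)
{
	/* The thread cannot be kthread_stop()ed once it is in D state. */
}

module_init(hang_init);
module_exit(hang_exit);
MODULE_LICENSE("GPL");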

Signed-off-by: WANG Cong <amwang@redhat.com>


diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaabf86..ee0e62e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -216,6 +216,7 @@ extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user, struct pt_regs *regs);
 extern void scheduler_tick(void);
+extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern unsigned long softlockup_get_next_event(void);
@@ -243,6 +244,15 @@ static inline void touch_all_softlockup_watchdogs(void)
 {
 }
 #endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int  sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+					 struct file *filp, void __user *buffer,
+					 size_t *lenp, loff_t *ppos);
+#endif
 
 
 /* Attach to any functions which should be ignored in wchan output. */
@@ -855,6 +865,9 @@ struct task_struct_aux {
 	struct completion *vfork_done;  /* for vfork() [displaced from task_struct] */
 	struct list_head  *scm_work_list; /*displaced from task_struct for abi compat*/
 	struct task_io_accounting ioac;
+#ifdef CONFIG_DETECT_HUNG_TASK
+	unsigned long last_switch_count; /* hung task detection */
+#endif
 };
 
 #define task_aux(tsk) ((tsk)->auxilliary)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index e594513..c9ce04c 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -161,6 +161,10 @@ enum
 	KERN_PROVE_LOCKING=79,	/* int: enable lock dependancy validation */
 	KERN_SOFTLOCKUP_THRESH=80, /* int: min time to report softlockups */
 	KERN_SOFTLOCKUP_PANIC=81, /* int: panic on softlockup */
+	KERN_HUNG_TASK_PANIC=82,
+	KERN_HUNG_TASK_CHECK_COUNT=83,
+	KERN_HUNG_TASK_TIMEOUT_SECS=84,
+	KERN_HUNG_TASK_WARNINGS=85,
 };
 
 
diff --git a/kernel/Makefile b/kernel/Makefile
index f1c92c2..0172581 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/fork.c b/kernel/fork.c
index f389c4f..95bb523 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -647,6 +647,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+	task_aux(tsk)->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
 
 	tsk->mm = NULL;
 	tsk->active_mm = NULL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 0000000..0d5a150
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,213 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * The number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs an upper bound.
+ */
+#define HUNG_TASK_BATCHING 1024
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+				CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+	sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+	return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = hung_task_panic,
+};
+
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+	/*
+	 * Ensure the task is not frozen.
+	 * Also, a freshly created task may be scheduled once and move to
+	 * TASK_UNINTERRUPTIBLE without ever having been switched out; such
+	 * a task mustn't be checked.
+	 */
+	if (unlikely(t->flags & PF_FROZEN || !switch_count))
+		return;
+
+	if (switch_count != task_aux(t)->last_switch_count) {
+		task_aux(t)->last_switch_count = switch_count;
+		return;
+	}
+	if (!sysctl_hung_task_warnings)
+		return;
+	sysctl_hung_task_warnings--;
+
+	/*
+	 * Ok, the task did not get scheduled for more than 2 minutes,
+	 * complain:
+	 */
+	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+			"%ld seconds.\n", t->comm, t->pid, timeout);
+	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+			" disables this message.\n");
+	sched_show_task(t);
+	debug_show_held_locks(t);
+
+	touch_nmi_watchdog();
+
+	if (sysctl_hung_task_panic)
+		panic("hung_task: blocked tasks");
+}
+
+/*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * to exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
+{
+	int max_count = sysctl_hung_task_check_count;
+	int batch_count = HUNG_TASK_BATCHING;
+	struct task_struct *g, *t;
+
+	/*
+	 * If the system crashed already then all bets are off,
+	 * do not report extra hung tasks:
+	 */
+	if (did_panic)
+		return;
+
+	rcu_read_lock();
+	do_each_thread(g, t) {
+		if (!--max_count)
+			goto unlock;
+		if (!--batch_count) {
+			batch_count = HUNG_TASK_BATCHING;
+			rcu_lock_break(g, t);
+		}
+		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+		if (t->state == TASK_UNINTERRUPTIBLE)
+			check_hung_task(t, timeout);
+	} while_each_thread(g, t);
+ unlock:
+	rcu_read_unlock();
+}
+
+static unsigned long timeout_jiffies(unsigned long timeout)
+{
+	/* timeout of 0 will disable the watchdog */
+	return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+				  struct file *filp, void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		goto out;
+
+	wake_up_process(watchdog_task);
+
+ out:
+	return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+	set_user_nice(current, 0);
+
+	for ( ; ; ) {
+		unsigned long timeout = sysctl_hung_task_timeout_secs;
+
+		while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+			timeout = sysctl_hung_task_timeout_secs;
+
+		check_hung_uninterruptible_tasks(timeout);
+	}
+
+	return 0;
+}
+
+static int __init hung_task_init(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+	return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index 267715b..950e1df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5004,7 +5004,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
 
 static const char stat_nam[] = "RSDTtZX";
 
-static void show_task(struct task_struct *p)
+void sched_show_task(struct task_struct *p)
 {
 	struct task_struct *relative;
 	unsigned long free = 0;
@@ -5073,7 +5073,7 @@ void show_state(void)
 		 * console might take alot of time:
 		 */
 		touch_nmi_watchdog();
-		show_task(p);
+		sched_show_task(p);
 	} while_each_thread(g, p);
 
 	touch_all_softlockup_watchdogs();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 32f4130..954c7b9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -115,12 +115,12 @@ __setup("exec-shield=", setup_exec_shield);
 
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
-static int one = 1;
 static int sixty = 60;
 static int threehundred = 300;
 #endif
 
 static int zero;
+static int one = 1;
 static int one_hundred = 100;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -821,6 +821,46 @@ static ctl_table kern_table[] = {
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+	{
+		.ctl_name	= KERN_HUNG_TASK_PANIC,
+		.procname	= "hung_task_panic",
+		.data		= &sysctl_hung_task_panic,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.ctl_name	= KERN_HUNG_TASK_CHECK_COUNT,
+		.procname	= "hung_task_check_count",
+		.data		= &sysctl_hung_task_check_count,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= KERN_HUNG_TASK_TIMEOUT_SECS,
+		.procname	= "hung_task_timeout_secs",
+		.data		= &sysctl_hung_task_timeout_secs,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_dohung_task_timeout_secs,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= KERN_HUNG_TASK_WARNINGS,
+		.procname	= "hung_task_warnings",
+		.data		= &sysctl_hung_task_warnings,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
+#endif
 #ifdef CONFIG_COMPAT
 	{
 		.ctl_name	= KERN_COMPAT_LOG,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5a5f05a..5dd07e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -81,6 +81,45 @@ config DETECT_SOFTLOCKUP
 	   can be detected via the NMI-watchdog, on platforms that
 	   support it.)
 
+config DETECT_HUNG_TASK
+	bool "Detect Hung Tasks"
+	depends on DEBUG_KERNEL
+	default DETECT_SOFTLOCKUP
+	help
+	  Say Y here to enable the kernel to detect "hung tasks",
+	  which are bugs that cause the task to be stuck in
+	  uninterruptible "D" state indefinitely.
+
+	  When a hung task is detected, the kernel will print the
+	  current stack trace (which you should report), but the
+	  task will stay in uninterruptible state. If lockdep is
+	  enabled then all held locks will also be reported. This
+	  feature has negligible overhead.
+
+config BOOTPARAM_HUNG_TASK_PANIC
+	bool "Panic (Reboot) On Hung Tasks"
+	depends on DETECT_HUNG_TASK
+	help
+	  Say Y here to enable the kernel to panic on "hung tasks",
+	  which are bugs that cause the kernel to leave a task stuck
+	  in uninterruptible "D" state.
+
+	  The panic can be used in combination with panic_timeout,
+	  to cause the system to reboot automatically after a
+	  hung task has been detected. This feature is useful for
+	  high-availability systems that have uptime guarantees and
+	  where a hung task must be resolved ASAP.
+
+	  Say N if unsure.
+
+config BOOTPARAM_HUNG_TASK_PANIC_VALUE
+	int
+	depends on DETECT_HUNG_TASK
+	range 0 1
+	default 0 if !BOOTPARAM_HUNG_TASK_PANIC
+	default 1 if BOOTPARAM_HUNG_TASK_PANIC
+
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS