kernel-2.6.18-238.el5.src.rpm

From: Eduardo Habkost <ehabkost@redhat.com>
Date: Fri, 29 Aug 2008 14:55:24 -0700
Subject: [misc] preempt-notifiers implementation
Message-id: 20080828220747.GB17182@blackpad
O-Subject: [RHEL5.3 PATCH] preempt-notifiers implementation
Bugzilla: 459838
RH-Acked-by: Chris Wright <chrisw@redhat.com>
RH-Acked-by: john cooper <john.cooper@redhat.com>
RH-Acked-by: Glauber Costa <glommer@redhat.com>
RH-Acked-by: Jon Masters <jcm@redhat.com>
RH-Acked-by: Peter Zijlstra <pzijlstr@redhat.com>

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=459838

Rationale:

This patch adds preempt notifiers to the kernel. They are a feature
used by the KVM module and are implemented upstream (commit
e107be36efb2a233833e8c9899039a370e4b2318). The upstream
implementation, however, adds a new field to task_struct, which breaks kABI.
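
For contrast, the upstream commit embeds the notifier list directly in
task_struct, roughly like the abridged, illustrative sketch below; it is
this new field that changes the structure layout and therefore breaks kABI:

    struct task_struct {
        /* ... existing fields unchanged ... */
    #ifdef CONFIG_PREEMPT_NOTIFIERS
        /* upstream: per-task list of struct preempt_notifier */
        struct hlist_head preempt_notifiers;
    #endif
        /* ... existing fields unchanged ... */
    };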

KVM has a compat module that emulates preempt-notifiers, but it does
this by hacking into the int1 gate and setting debug registers to insert
a breakpoint on schedule().

The compat module also keeps a single global list of all registered preempt
notifiers, which is searched on every context switch. This, together with
the debug-register hack, explains the following warning it prints when initializing:

    printk("kvm: emulating preempt notifiers;"
           " do not benchmark on this machine\n");

Hence, using the KVM compat preempt-notifiers implementation wouldn't
be acceptable.

Description:

This patch is similar to the upstream preempt notifier implementation,
but instead of adding a new task_struct field, it uses a hash table to
store the preempt notifier list. The hooks that call the notifiers, however,
are placed exactly where the upstream patch puts them.

To avoid querying the hash table unnecessarily on every context
switch, the patch defines a new task_struct flag: PF_PREEMPT_NOTIFIER. If
no preempt notifier is registered, the flag stays clear and the
fire_sched_*_preempt_notifiers() hooks return immediately.

When a preempt notifier is registered, the PF_PREEMPT_NOTIFIER flag is
set on the task_struct and the notifier info is added to the hash
table, which is then queried by the fire_sched_*_preempt_notifiers() hooks.

The caller is responsible for calling preempt_notifier_unregister()
later, when the notifier is no longer needed.

For simplicity, this code supports only one notifier per task, but
that is all KVM needs (and also all that KVM's compat preempt-notifier
module supports).
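
For illustration, a caller embeds the struct preempt_notifier in its own
per-context structure, recovers it with container_of() in the callbacks,
and registers/unregisters from the task that wants the notifications. The
my_* names below are invented for the example and are not part of this
patch or of KVM:

    #include <linux/preempt.h>
    #include <linux/kernel.h>   /* container_of() */

    struct my_ctx {
        struct preempt_notifier notifier;
        int last_cpu;           /* example per-context state */
        int preempted;
    };

    static void my_sched_in(struct preempt_notifier *pn, int cpu)
    {
        struct my_ctx *ctx = container_of(pn, struct my_ctx, notifier);

        /* current is being (re)scheduled on 'cpu' */
        ctx->last_cpu = cpu;
        ctx->preempted = 0;
    }

    static void my_sched_out(struct preempt_notifier *pn,
                             struct task_struct *next)
    {
        struct my_ctx *ctx = container_of(pn, struct my_ctx, notifier);

        /* current is being preempted in favour of 'next' */
        ctx->preempted = 1;
    }

    static struct preempt_ops my_preempt_ops = {
        .sched_in  = my_sched_in,
        .sched_out = my_sched_out,
    };

    void my_start(struct my_ctx *ctx)   /* called from the interested task */
    {
        preempt_notifier_init(&ctx->notifier, &my_preempt_ops);
        preempt_notifier_register(&ctx->notifier);
    }

    void my_stop(struct my_ctx *ctx)    /* later, from the same task */
    {
        preempt_notifier_unregister(&ctx->notifier);
    }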

Performance impact:

John Cooper microbenchmarked the latency of the hash-table lookup plus
notifier call in this implementation against the debug trap plus
linked-list search in the compat module. The debug trap and linked-list
search took around 4x longer than the hash-table lookup and the call of
the preempt notifier function. This was measured while running only one KVM guest.

He also measured context switch performance with and without
this patch, using a cooperating-process context-switch benchmark. In
both cases, with no preempt notifier registered, any difference
was effectively below the measurement noise floor. This is expected:
with no preempt notifiers registered, the only additional operation
performed on every context switch is testing a bit in task->flags.
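
For reference, a cooperating-process context-switch benchmark of this kind
typically ping-pongs a byte between two processes over a pair of pipes, so
each round trip forces roughly two context switches. The userspace sketch
below only illustrates the approach; it is not the exact tool used for
these measurements:

    /* Illustrative pipe ping-pong benchmark (lmbench lat_ctx style). */
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/time.h>
    #include <sys/wait.h>

    int main(void)
    {
        int p1[2], p2[2], i, iters = 100000;
        char c = 0;
        struct timeval start, end;

        if (pipe(p1) || pipe(p2))
            exit(1);

        if (fork() == 0) {
            /* child: echo every byte straight back to the parent */
            for (i = 0; i < iters; i++) {
                read(p1[0], &c, 1);
                write(p2[1], &c, 1);
            }
            exit(0);
        }

        gettimeofday(&start, NULL);
        for (i = 0; i < iters; i++) {
            write(p1[1], &c, 1);    /* wake the child ... */
            read(p2[0], &c, 1);     /* ... and sleep until it answers */
        }
        gettimeofday(&end, NULL);
        wait(NULL);

        /* one round trip is roughly two context switches */
        printf("%.2f us per round trip\n",
               ((end.tv_sec - start.tv_sec) * 1e6 +
                (end.tv_usec - start.tv_usec)) / iters);
        return 0;
    }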

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index d0926d6..401de3a 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -60,4 +60,55 @@ do { \
 
 #endif
 
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+#include <linux/list.h>
+
+
+struct preempt_notifier;
+struct task_struct;
+
+/**
+ * preempt_ops - notifiers called when a task is preempted and rescheduled
+ * @sched_in: we're about to be rescheduled:
+ *    notifier: struct preempt_notifier for the task being scheduled
+ *    cpu:  cpu we're scheduled on
+ * @sched_out: we've just been preempted
+ *    notifier: struct preempt_notifier for the task being preempted
+ *    next: the task that's kicking us out
+ */
+struct preempt_ops {
+	void (*sched_in)(struct preempt_notifier *notifier, int cpu);
+	void (*sched_out)(struct preempt_notifier *notifier,
+			  struct task_struct *next);
+};
+
+/**
+ * preempt_notifier - key for installing preemption notifiers
+ * @link: internal use
+ * @ops: defines the notifier functions to be called
+ * @task: the task this notifier is associated to
+ *
+ * Usually used in conjunction with container_of().
+ */
+struct preempt_notifier {
+	struct hlist_node link;
+	struct preempt_ops *ops;
+	struct task_struct *task;
+};
+
+void preempt_notifier_register(struct preempt_notifier *notifier);
+void preempt_notifier_unregister(struct preempt_notifier *notifier);
+
+static inline void preempt_notifier_init(struct preempt_notifier *notifier,
+				     struct preempt_ops *ops)
+{
+	INIT_HLIST_NODE(&notifier->link);
+	notifier->task = NULL;
+	notifier->ops = ops;
+}
+
+#endif /* !CONFIG_PREEMPT_NOTIFIERS */
+
 #endif /* __LINUX_PREEMPT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2efe69c..411d561 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1119,6 +1119,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
+#define PF_PREEMPT_NOTIFIER 0x40000000  /* preempt notifier attached to the task */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 17ab322..b1dbdc5 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -64,3 +64,7 @@ config PREEMPT_BKL
 	  Say Y here if you are building a kernel for a desktop system.
 	  Say N if you are unsure.
 
+config PREEMPT_NOTIFIERS
+	bool
+	default y
+
diff --git a/kernel/fork.c b/kernel/fork.c
index 46262f5..9e14ef0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1027,7 +1027,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
+	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE | PF_PREEMPT_NOTIFIER);
 	new_flags |= PF_FORKNOEXEC;
 	new_flags |= PF_STARTING;
 	p->flags = new_flags;
diff --git a/kernel/sched.c b/kernel/sched.c
index 728f430..4037e50 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,6 +52,7 @@
 #include <linux/acct.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
+#include <linux/hash.h>
 #include <asm/tlb.h>
 #include <trace/sched.h>
 
@@ -1784,6 +1785,142 @@ void fastcall sched_exit(struct task_struct *p)
         }
 }
 
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+#define NOTIFIER_HASH_BITS	5
+#define NOTIFIER_HASH_SIZE	(1<<NOTIFIER_HASH_BITS)
+
+struct notifier_hbucket
+{
+	spinlock_t lock;
+	struct hlist_head notifiers;
+};
+
+static struct notifier_hbucket notifier_hash[NOTIFIER_HASH_SIZE];
+
+
+static inline
+struct notifier_hbucket *task_hbucket(struct task_struct *task)
+{
+	unsigned long h = hash_ptr(task, NOTIFIER_HASH_BITS);
+	return &notifier_hash[h];
+}
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted
+ *                         and rescheduled
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+	struct task_struct *task = current;
+	struct notifier_hbucket *b;
+
+	BUG_ON(task->flags & PF_PREEMPT_NOTIFIER);
+	task->flags |= PF_PREEMPT_NOTIFIER;
+	notifier->task = task;
+
+	b = task_hbucket(task);
+	spin_lock(&b->lock);
+	hlist_add_head(&notifier->link, &b->notifiers);
+	spin_unlock(&b->lock);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+	struct task_struct *task = notifier->task;
+	struct notifier_hbucket *b = task_hbucket(task);
+
+	spin_lock(&b->lock);
+	hlist_del(&notifier->link);
+	spin_unlock(&b->lock);
+
+	notifier->task = NULL;
+	task->flags &= ~PF_PREEMPT_NOTIFIER;
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void
+fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	struct preempt_notifier *notifier = NULL;
+	int found = 0;
+	struct hlist_node *node;
+	struct notifier_hbucket *b;
+
+	if (!(curr->flags & PF_PREEMPT_NOTIFIER))
+		return;
+
+	b = task_hbucket(curr);
+	spin_lock(&b->lock);
+	hlist_for_each_entry(notifier, node, &b->notifiers, link)
+		if (notifier->task == curr) {
+			found = 1;
+			break;
+		}
+	spin_unlock(&b->lock);
+
+	if (found)
+		notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	struct preempt_notifier *notifier = NULL;
+	int found = 0;
+	struct hlist_node *node;
+	struct notifier_hbucket *b;
+
+	if (!(curr->flags & PF_PREEMPT_NOTIFIER))
+		return;
+
+	b = task_hbucket(curr);
+	spin_lock(&b->lock);
+	hlist_for_each_entry(notifier, node, &b->notifiers, link)
+		if (notifier->task == curr) {
+			found = 1;
+			break;
+		}
+	spin_unlock(&b->lock);
+
+	if (found)
+		notifier->ops->sched_out(notifier, next);
+}
+
+void init_preempt_notifiers(void)
+{
+	int i;
+	struct notifier_hbucket *b = notifier_hash;
+
+	for (i = 0; i < NOTIFIER_HASH_SIZE; i++,b++) {
+		spin_lock_init(&b->lock);
+		INIT_HLIST_HEAD(&b->notifiers);
+	}
+}
+
+#else /* CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void
+fire_sched_in_preempt_notifiers(struct task_struct *curr) { }
+
+
+static inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next) { }
+
+static inline void
+init_preempt_notifiers(void) { }
+
+#endif /* !CONFIG_PREEMPT_NOTIFIERS */
+
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -1796,8 +1933,11 @@ void fastcall sched_exit(struct task_struct *p)
  * prepare_task_switch sets up locking and calls architecture specific
  * hooks.
  */
-static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+		    struct task_struct *next)
 {
+	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
@@ -1839,6 +1979,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_task_flags = prev->flags;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
+	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_task_flags & PF_DEAD)) {
@@ -3504,7 +3645,7 @@ switch_tasks:
 		rq->curr = next;
 		++*switch_count;
 
-		prepare_task_switch(rq, next);
+		prepare_task_switch(rq, prev, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
 		/*
@@ -6960,6 +7101,8 @@ void __init sched_init(void)
 	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
 #endif
 
+	init_preempt_notifiers();
+
 	arch_update_cpu_topology();
 	/*
 	 * The boot idle thread does lazy MMU switching as well: