kernel-2.6.18-194.11.1.el5.src.rpm

From: Peter Zijlstra <pzijlstr@redhat.com>
Date: Thu, 7 May 2009 15:59:42 +0200
Subject: [sched] rq clock
Message-id: 20090507140138.332783000@chello.nl
O-Subject: [PATCH 4/5] RHEL-5: sched: rq clock
Bugzilla: 297731
RH-Acked-by: Rik van Riel <riel@redhat.com>
CVE: CVE-2007-3719

Backport of the upstream sched_clock machinery: a per-CPU clock, filtered against GTOD, that feeds rq->clock and replaces the timestamp_last_tick drift compensation.

Signed-off-by: Peter Zijlstra <pzijlstr@redhat.com>

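The heart of the backport is the window filter in __update_sched_clock() below: the raw sched_clock() delta since the last tick is added to the GTOD timestamp taken at that tick, and the result is clamped so the per-CPU clock never moves backwards and never runs more than one tick ahead of GTOD. A minimal standalone sketch of just that clamp (userspace C; the 1 ms TICK_NSEC and the test values are assumptions for illustration, not taken from the patch):

/* sketch of the clamp in __update_sched_clock(); standalone, userspace */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL	/* assumed 1 ms tick, illustration only */

typedef uint64_t u64;
typedef int64_t  s64;

/* min/max that stay correct across u64 wrap, as in the patch */
static u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; }
static u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; }

struct sched_clock_data {
	u64 tick_raw;	/* sched_clock() value sampled at the last tick */
	u64 tick_gtod;	/* GTOD (monotonic) time sampled at the last tick */
	u64 clock;	/* last value handed out for this CPU */
};

/*
 * Core of __update_sched_clock(): advance the per-CPU clock by the raw
 * delta, but never backwards and never more than one tick past GTOD.
 */
static u64 update_clock(struct sched_clock_data *scd, u64 now)
{
	s64 delta = now - scd->tick_raw;
	u64 clock, min_clock, max_clock;

	if (delta < 0)		/* raw clock went backwards: ignore */
		delta = 0;

	clock     = scd->tick_gtod + delta;
	min_clock = wrap_max(scd->tick_gtod, scd->clock);
	max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);

	clock = wrap_max(clock, min_clock);	/* filter backward motion */
	clock = wrap_min(clock, max_clock);	/* filter forward jumps   */

	scd->clock = clock;
	return clock;
}

int main(void)
{
	struct sched_clock_data scd = {
		.tick_raw = 1000, .tick_gtod = 5000000, .clock = 5000000,
	};

	/* a sane raw delta of 200us lands inside the window: 5200000 */
	printf("%llu\n", (unsigned long long)update_clock(&scd, 1000 + 200000));
	/* a crazy 10s TSC jump is clamped to tick_gtod + TICK_NSEC: 6000000 */
	printf("%llu\n", (unsigned long long)update_clock(&scd, 1000 + 10000000000ULL));
	return 0;
}
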
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index afe6d2b..bf74446 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -52,6 +52,7 @@
 ACPI_MODULE_NAME("acpi_processor")
 #define ACPI_PROCESSOR_FILE_POWER	"power"
 #define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS                (1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 static void (*pm_idle_save) (void) __read_mostly;
@@ -351,6 +352,8 @@ static void acpi_processor_idle(void)
 	case ACPI_STATE_C2:
 		/* Get start time (ticks) */
 		t1 = inl(acpi_fadt.xpm_tmr_blk.address);
+		/* Tell the scheduler that we are going deep-idle */
+		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		inb(cx->address);
 		/* Dummy wait op - must do something useless after P_LVL2 read
@@ -365,12 +368,15 @@ static void acpi_processor_idle(void)
 		if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 			mark_tsc_unstable();
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 		/* Re-enable interrupts */
 		local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+		/* Do not account our idle-switching overhead */
+		sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
 		break;
 
 	case ACPI_STATE_C3:
@@ -402,6 +408,8 @@ static void acpi_processor_idle(void)
 
 		/* Get start time (ticks) */
 		t1 = inl(acpi_fadt.xpm_tmr_blk.address);
+		/* Tell the scheduler that we are going deep-idle */
+		sched_clock_idle_sleep_event();
 		/* Invoke C3 */
 		inb(cx->address);
 		/* Dummy wait op (see above) */
@@ -420,12 +428,15 @@ static void acpi_processor_idle(void)
 		if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 			mark_tsc_unstable();
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 		/* Re-enable interrupts */
 		local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
+		/* Do not account our idle-switching overhead */
+		sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
 		break;
 
 	default:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaabf86..1faef4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1213,6 +1213,8 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 extern unsigned long long sched_clock(void);
 extern unsigned long long
 current_sched_time(const struct task_struct *current_task);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
diff --git a/init/main.c b/init/main.c
index 06349fe..e9f6fa5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -118,6 +118,8 @@ extern void time_init(void);
 void (*late_time_init)(void);
 extern void softirq_init(void);
 
+extern void sched_clock_init(void);
+
 /* Untouched command line (eg. for /proc) saved by arch-specific code. */
 char saved_command_line[COMMAND_LINE_SIZE];
 
@@ -543,6 +545,7 @@ asmlinkage void __init start_kernel(void)
 	softirq_init();
 	timekeeping_init();
 	time_init();
+	sched_clock_init();
 	profile_init();
 	if (!irqs_disabled())
 		printk("start_kernel(): bug: interrupts were enabled early\n");
diff --git a/kernel/sched.c b/kernel/sched.c
index 7a62c3e..6b923f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -41,6 +41,7 @@
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/timer.h>
+#include <linux/time.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -228,6 +229,7 @@ struct rq {
 
 	unsigned long expired_timestamp;
 	unsigned long long timestamp_last_tick;
+	unsigned long long clock;
 	struct task_struct *curr, *idle;
 	struct mm_struct *prev_mm;
 	struct prio_array *active, *expired, arrays[2];
@@ -371,6 +373,237 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
+static int sched_clock_running __read_mostly;
+
+/*
+ * can be set by an arch if their native sched_clock() is stable and
+ * synchronized between cpus
+ */
+int sched_clock_stable __read_mostly;
+
+struct sched_clock_data {
+	raw_spinlock_t		lock;
+
+	u64			tick_raw;
+	u64			tick_gtod;
+	u64			clock;
+};
+
+static DEFINE_PER_CPU(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+	return &__get_cpu_var(sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+	return &per_cpu(sched_clock_data, cpu);
+}
+
+static u64 get_monotonic_time(void)
+{
+	u64 time;
+	struct timespec tp;
+
+	ktime_get_ts(&tp);
+	time = tp.tv_sec * NSEC_PER_SEC + tp.tv_nsec;
+
+	return time;
+}
+
+void sched_clock_init(void)
+{
+	u64 ktime_now = get_monotonic_time();
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct sched_clock_data *scd = cpu_sdc(cpu);
+
+		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		scd->tick_raw = 0;
+		scd->tick_gtod = ktime_now;
+		scd->clock = ktime_now;
+	}
+
+	sched_clock_running = 1;
+}
+
+/*
+ * min, max except they take wrapping into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+	return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+	return (s64)(x - y) > 0 ? x : y;
+}
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ *  - filter out backward motion
+ *  - use the GTOD tick value to create a window to filter crazy TSC values
+ */
+static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
+{
+	s64 delta = now - scd->tick_raw;
+	u64 clock, min_clock, max_clock;
+
+	if (unlikely(delta < 0))
+		delta = 0;
+
+	/*
+	 * scd->clock = clamp(scd->tick_gtod + delta,
+	 *		      max(scd->tick_gtod, scd->clock),
+	 *		      scd->tick_gtod + TICK_NSEC);
+	 */
+
+	clock = scd->tick_gtod + delta;
+	min_clock = wrap_max(scd->tick_gtod, scd->clock);
+	max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+
+	clock = wrap_max(clock, min_clock);
+	clock = wrap_min(clock, max_clock);
+
+	scd->clock = clock;
+
+	return scd->clock;
+}
+
+static void lock_double_clock(struct sched_clock_data *data1,
+			      struct sched_clock_data *data2)
+{
+	if (data1 < data2) {
+		__raw_spin_lock(&data1->lock);
+		__raw_spin_lock(&data2->lock);
+	} else {
+		__raw_spin_lock(&data2->lock);
+		__raw_spin_lock(&data1->lock);
+	}
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	u64 now, clock, this_clock, remote_clock;
+	struct sched_clock_data *scd;
+
+	if (sched_clock_stable)
+		return sched_clock();
+
+	scd = cpu_sdc(cpu);
+
+	if (unlikely(!sched_clock_running))
+		return 0ull;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	now = sched_clock();
+
+	if (cpu != smp_processor_id()) {
+		struct sched_clock_data *my_scd = this_scd();
+
+		lock_double_clock(scd, my_scd);
+
+		this_clock = __update_sched_clock(my_scd, now);
+		remote_clock = scd->clock;
+
+		/*
+		 * Use the opportunity that we have both locks
+		 * taken to couple the two clocks: we take the
+		 * larger time as the latest time for both
+		 * runqueues. (this creates monotonic movement)
+		 */
+		if (likely((s64)(remote_clock - this_clock) < 0)) {
+			clock = this_clock;
+			scd->clock = clock;
+		} else {
+			/*
+			 * Should be rare, but possible:
+			 */
+			clock = remote_clock;
+			my_scd->clock = remote_clock;
+		}
+
+		__raw_spin_unlock(&my_scd->lock);
+	} else {
+		__raw_spin_lock(&scd->lock);
+		clock = __update_sched_clock(scd, now);
+	}
+
+	__raw_spin_unlock(&scd->lock);
+
+	return clock;
+}
+
+static inline u64 sched_clock_local(void)
+{
+	return sched_clock_cpu(smp_processor_id());
+}
+
+static void sched_clock_tick(void)
+{
+	struct sched_clock_data *scd;
+	u64 now, now_gtod;
+
+	if (sched_clock_stable)
+		return;
+
+	if (unlikely(!sched_clock_running))
+		return;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	scd = this_scd();
+	now_gtod = get_monotonic_time();
+	now = sched_clock();
+
+	__raw_spin_lock(&scd->lock);
+	scd->tick_raw = now;
+	scd->tick_gtod = now_gtod;
+	__update_sched_clock(scd, now);
+	__raw_spin_unlock(&scd->lock);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+	sched_clock_local();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	sched_clock_tick();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+unsigned long long cpu_clock(int cpu)
+{
+	unsigned long long clock;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	clock = sched_clock_cpu(cpu);
+	local_irq_restore(flags);
+
+	return clock;
+}
+EXPORT_SYMBOL_GPL(cpu_clock);
+
+static void update_rq_clock(struct rq *rq)
+{
+	rq->clock = sched_clock_cpu(cpu_of(rq));
+}
+
 /*
  * __task_rq_lock - lock the runqueue a given task resides on.
  * Must be called interrupts disabled.
@@ -387,6 +620,7 @@ repeat_lock_task:
 		spin_unlock(&rq->lock);
 		goto repeat_lock_task;
 	}
+	update_rq_clock(rq);
 	return rq;
 }
 
@@ -408,6 +642,7 @@ repeat_lock_task:
 		spin_unlock_irqrestore(&rq->lock, *flags);
 		goto repeat_lock_task;
 	}
+	update_rq_clock(rq);
 	return rq;
 }
 
@@ -559,6 +794,7 @@ static inline struct rq *this_rq_lock(void)
 	local_irq_disable();
 	rq = this_rq();
 	spin_lock(&rq->lock);
+	update_rq_clock(rq);
 
 	return rq;
 }
@@ -941,15 +1177,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
 	unsigned long long now;
 
-	now = sched_clock();
-#ifdef CONFIG_SMP
-	if (!local) {
-		/* Compensate for drifting sched_clock */
-		struct rq *this_rq = this_rq();
-		now = (now - this_rq->timestamp_last_tick)
-			+ rq->timestamp_last_tick;
-	}
-#endif
+	now = rq->clock;
 
 	if (!rt_task(p))
 		p->prio = recalc_task_prio(p, now);
@@ -1339,6 +1567,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 	struct sched_domain *sd;
 	int i;
 	unsigned long long now;
+	struct rq *rq = cpu_rq(cpu);
 
 	/*
 	 * If it is idle, then it is the best cpu to run this task.
@@ -1349,10 +1578,10 @@ static int wake_idle(int cpu, struct task_struct *p)
 	 * sibling runqueue info. This will avoid the checks and cache miss
 	 * penalities associated with that.
 	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+	if (idle_cpu(cpu) || rq->nr_running > 1)
 		return cpu;
 
-	now = sched_clock();
+	now = rq->clock;
 	for_each_domain(cpu, sd) {
 		if ((sd->flags & SD_WAKE_IDLE)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
@@ -1654,7 +1883,8 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
 	 */
 	p->first_time_slice = current->pid;
 	current->time_slice >>= 1;
-	p->timestamp = sched_clock();
+
+	p->timestamp = sched_clock_local();
 	if (unlikely(!current->time_slice)) {
 		/*
 		 * This case is rare, it happens when the parent has only
@@ -1729,12 +1959,6 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	} else {
 		this_rq = cpu_rq(this_cpu);
 
-		/*
-		 * Not the local CPU - must adjust timestamp. This should
-		 * get optimised away in the !CONFIG_SMP case.
-		 */
-		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
-					+ rq->timestamp_last_tick;
 		__activate_task(p, rq);
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
@@ -2161,6 +2385,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 			spin_lock(&rq1->lock);
 		}
 	}
+	update_rq_clock(rq1);
+	update_rq_clock(rq2);
 }
 
 /*
@@ -2196,6 +2422,8 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
 		} else
 			spin_lock(&busiest->lock);
 	}
+	/* update_rq_clock(this_rq); */
+	update_rq_clock(busiest);
 }
 
 /*
@@ -2258,8 +2486,6 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
-	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
-				+ this_rq->timestamp_last_tick;
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
@@ -2298,7 +2524,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	if (sd->nr_balance_failed > sd->cache_nice_tries)
 		return 1;
 
-	if (task_hot(p, rq->timestamp_last_tick, sd))
+	if (task_hot(p, rq->clock, sd))
 		return 0;
 	return 1;
 }
@@ -2397,7 +2623,7 @@ skip_queue:
 	}
 
 #ifdef CONFIG_SCHEDSTATS
-	if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+	if (task_hot(tmp, busiest->clock, sd))
 		schedstat_inc(sd, lb_hot_gained[idle]);
 #endif
 
@@ -3131,7 +3357,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
 
 	local_irq_save(flags);
 	ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
-	ns = p->sched_time + sched_clock() - ns;
+	ns = p->sched_time + sched_clock_local() - ns;
 	local_irq_restore(flags);
 
 	return ns;
@@ -3242,16 +3468,22 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
  */
 void scheduler_tick(void)
 {
-	unsigned long long now = sched_clock();
+	unsigned long long now;
 	struct task_struct *p = current;
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
 
+	sched_clock_tick();
+
+	spin_lock(&rq->lock);
+	update_rq_clock(rq);
+	now = rq->clock;
 	update_cpu_clock(p, rq, now);
 
 	rq->timestamp_last_tick = now;
 
 	if (p == rq->idle) {
+		spin_unlock(&rq->lock);
 		if (wake_priority_sleeper(rq))
 			goto out;
 		rebalance_tick(cpu, rq, SCHED_IDLE);
@@ -3261,9 +3493,8 @@ void scheduler_tick(void)
 	/* Task might have expired already, but not scheduled off yet */
 	if (p->array != rq->active) {
 		set_tsk_need_resched(p);
-		goto out;
+		goto out_unlock;
 	}
-	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
 	 * time slice counter. Note: we do not update a thread's
@@ -3551,7 +3782,8 @@ need_resched_nonpreemptible:
 
 	schedstat_inc(rq, sched_cnt);
 	spin_lock_irq(&rq->lock);
-	now = sched_clock();
+	update_rq_clock(rq);
+	now = rq->clock;
 	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
 		run_time = now - prev->timestamp;
 		if (unlikely((long long)(now - prev->timestamp) < 0))
@@ -5096,7 +5328,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	idle->timestamp = sched_clock();
+	local_irq_save(flags);
+
+	idle->timestamp = sched_clock_local();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
 	idle->prio = idle->normal_prio = MAX_PRIO;
@@ -5104,7 +5338,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
 
-	spin_lock_irqsave(&rq->lock, flags);
+	spin_lock(&rq->lock);
+
 	rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	idle->oncpu = 1;
@@ -5219,14 +5454,6 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 
 	set_task_cpu(p, dest_cpu);
 	if (p->array) {
-		/*
-		 * Sync timestamp with rq_dest's before activating.
-		 * The same thing could be achieved by doing this step
-		 * afterwards, and pretending it was a local activate.
-		 * This way is cleaner and logically correct.
-		 */
-		p->timestamp = p->timestamp - rq_src->timestamp_last_tick
-				+ rq_dest->timestamp_last_tick;
 		deactivate_task(p, rq_src);
 		__activate_task(p, rq_dest);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5265,6 +5492,8 @@ static int migration_thread(void *data)
 			goto wait_to_die;
 		}
 
+		update_rq_clock(rq);
+
 		if (rq->active_balance) {
 			active_load_balance(rq, cpu);
 			rq->active_balance = 0;
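
For reference, the other monotonicity guarantee added by this patch is the cross-CPU coupling in sched_clock_cpu(): when one CPU reads another CPU's clock, whichever of the two per-CPU clocks is further ahead becomes the value of both, so a reader hopping between CPUs never observes time running backwards. A minimal standalone sketch of just that decision (userspace C; function name and test values are ours, locking omitted):

/* sketch of the clock coupling in sched_clock_cpu(); standalone, userspace */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t  s64;

/* pull both clocks up to the larger of the two, return the coupled value */
static u64 couple_clocks(u64 *this_clock, u64 *remote_clock)
{
	u64 clock;

	if ((s64)(*remote_clock - *this_clock) < 0) {
		/* likely: the local clock is ahead, pull the remote one up */
		clock = *this_clock;
		*remote_clock = clock;
	} else {
		/* rare, but possible: the remote clock is ahead */
		clock = *remote_clock;
		*this_clock = clock;
	}
	return clock;
}

int main(void)
{
	u64 cpu0 = 1005000, cpu1 = 1000000;

	/* CPU 0 reads CPU 1's clock: CPU 1 is pulled forward to 1005000 */
	printf("coupled: %llu\n", (unsigned long long)couple_clocks(&cpu0, &cpu1));
	printf("cpu1 is now %llu\n", (unsigned long long)cpu1);
	return 0;
}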