kernel-2.6.18-194.11.1.el5.src.rpm

From: Chris Lalancette <clalance@redhat.com>
Date: Thu, 16 Apr 2009 15:12:40 +0200
Subject: [x86_64] xen: implement a minimal TSC based clocksource
Message-id: 49E72EC8.807@redhat.com
O-Subject: Re: [RHEL5.4 PATCH 10/14 v2]: x86_64: Add a minimal TSC based clocksource implementation for 64bit
Bugzilla: 463573
RH-Acked-by: Prarit Bhargava <prarit@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>

The 2.6.18-based 64-bit RHEL kernel keeps time by counting timer
interrupts.

This is problematic when running in a virtual machine.  The VM can be
descheduled for some portion of time.  When the VM is rescheduled, the
hypervisor needs to "catch up" delivering timer interrupts so that the
kernel can determine the correct time.

Until the VM is caught up, the kernel's time will be behind, causing
short-term divergence of the kernel's time from wall-clock time.
Additionally, under certain overcommitment conditions it may not be
possible for the hypervisor to fully catch up, in which case the
kernel's time can fall behind over the long term.

The solution is to change the kernel's timekeeping algorithm so that it
keeps time based on how much time has elapsed according to a time
counter, rather than by counting interrupts.  This is similar to the
timeofday algorithm used by clocksource-enabled mainline kernels and by
the RHEL5 32-bit kernel.
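
To make the idea concrete, here is a small, self-contained user-space
sketch of accounting ticks from elapsed TSC cycles instead of counting
interrupts.  It is not part of the patch; HZ and tsc_khz below are
made-up example values, whereas the patch derives the real
cycles_per_tick from cpu_khz and REAL_HZ at boot and does the
equivalent accounting in do_timer_tsc_timekeeping():

	/* Stand-alone illustration only -- not kernel code. */
	#include <stdio.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <x86intrin.h>		/* __rdtsc() */

	#define HZ 1000				/* example tick rate */

	int main(void)
	{
		uint64_t tsc_khz = 2400000;	/* assume a 2.4 GHz TSC */
		uint64_t cycles_per_tick = tsc_khz * 1000 / HZ;
		uint64_t last = __rdtsc();
		uint64_t jiffies = 0;
		int i;

		for (i = 0; i < 5; i++) {
			uint64_t now, delta;

			usleep(2000);	/* stands in for the time between interrupts */

			/* This mirrors the per-interrupt accounting in the patch. */
			now = __rdtsc();
			delta = now - last;
			while (delta >= cycles_per_tick) {
				jiffies++;	/* one tick per full tick's worth of cycles */
				delta -= cycles_per_tick;
				last += cycles_per_tick;
			}
			printf("jiffies accounted: %llu\n",
			       (unsigned long long)jiffies);
		}
		return 0;
	}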

The time counter used to keep time is the virtual TSC.  The virtual
TSC is an idealized TSC that does not suffer from the problems that
affect many physical TSCs.  However, measuring the frequency of the TSC
from inside a VM is difficult, so when running on a VMware hypervisor
the kernel queries the hypervisor to discover the TSC frequency.
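
The VMware detection and frequency query themselves are added by a
separate patch in this series.  Purely for reference, a GETHZ query
through the VMware backdoor I/O port looks roughly like the sketch
below; the constants, register layout, and function name here are
assumptions based on the publicly documented backdoor interface, not
code taken from this series, and the query must only be issued after
VMware has actually been detected:

	/* Hypothetical sketch only -- see the caveats above. */
	#define VMWARE_HYPERVISOR_MAGIC	0x564D5868	/* "VMXh" */
	#define VMWARE_HYPERVISOR_PORT	0x5658
	#define VMWARE_PORT_CMD_GETHZ	45

	static unsigned long vmware_query_tsc_khz(void)
	{
		unsigned int eax, ebx, ecx, edx;

		asm volatile("inl (%%dx)"
			     : "=a" (eax), "=c" (ecx), "=d" (edx), "=b" (ebx)
			     : "0" (VMWARE_HYPERVISOR_MAGIC),
			       "1" (VMWARE_PORT_CMD_GETHZ),
			       "2" (VMWARE_HYPERVISOR_PORT),
			       "3" (~0U)
			     : "memory");

		if (ebx == ~0U)
			return 0;	/* GETHZ not supported */

		/* ebx:eax is the TSC frequency in Hz; convert to kHz. */
		return (((unsigned long long)ebx << 32) | eax) / 1000;
	}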

Note that this new TSC-based timekeeping algorithm is enabled by
default only after a VMware hypervisor has been detected, so it has no
effect on non-VMware systems (beyond executing the VMware hypervisor
detection code).
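
For testing, the new code path can also be forced on or off regardless
of hypervisor detection, via the clock= boot parameter whose
documentation the first hunk below extends (handled in
boot_override_clock()), e.g. on the kernel command line:

	clock=tsccount		force TSC-based tick accounting
	clock=notsccount	never use TSC-based tick accounting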

Fixes BZ 463573

--
Chris Lalancette

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 0fb9e36..e2a15e8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -363,7 +363,7 @@ running once the system is up.
 			Forces specified clocksource (if avaliable) to be used
 			when calculating gettimeofday(). If specified
 			clocksource is not avalible, it defaults to PIT.
-			Format: { pit | tsc | cyclone | pmtmr }
+			Format: { pit | tsc | cyclone | pmtmr | tsccount | notsccount }
 
 	disable_8254_timer
 	enable_8254_timer
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index b9f5303..4310ca9 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -98,6 +98,10 @@ unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
 struct timespec __xtime __section_xtime;
 struct timezone __sys_tz __section_sys_tz;
 
+/* -1=>disabled, 0=>autoconfigure, 1=>enabled */
+static int timekeeping_use_tsc;
+static cycles_t cycles_per_tick, cycles_accounted_limit;
+
 /*
  * do_gettimeoffset() returns nanoseconds since last timer interrupt was
  * triggered by hardware. A memory read of HPET is slower than a register read
@@ -328,6 +332,27 @@ unsigned long long monotonic_clock(void)
 }
 EXPORT_SYMBOL(monotonic_clock);
 
+static void do_timer_jiffy(struct pt_regs *regs)
+{
+	do_timer(regs);
+#ifndef CONFIG_SMP
+	update_process_times(user_mode(regs), regs);
+#endif
+
+	/*
+	 * In the SMP case we use the local APIC timer interrupt to do the profiling,
+	 * except when we simulate SMP mode on a uniprocessor system, in that case we
+	 * have to call the local interrupt handler.
+	 */
+
+#ifndef CONFIG_X86_LOCAL_APIC
+	profile_tick(CPU_PROFILING, regs);
+#else
+	if (!using_apic_timer)
+		smp_local_timer_interrupt(regs);
+#endif
+}
+
 static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
 {
 	static long lost_count;
@@ -365,21 +390,11 @@ static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
 #endif
 }
 
-void main_timer_handler(struct pt_regs *regs)
+static void do_timer_account_lost_ticks(struct pt_regs *regs)
 {
-	static unsigned long rtc_update = 0;
 	unsigned long tsc;
 	int delay = 0, offset = 0, lost = 0, i;
 
-/*
- * Here we are in the timer irq handler. We have irqs locally disabled (so we
- * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
- * on the other CPU, so we need a lock. We also need to lock the vsyscall
- * variables, because both do_timer() and us change them -arca+vojtech
- */
-
-	write_seqlock(&xtime_lock);
-
 	if (vxtime.hpet_address)
 		offset = hpet_readl(HPET_COUNTER);
 
@@ -450,30 +465,63 @@ void main_timer_handler(struct pt_regs *regs)
 		jiffies += (u64)lost - (tick_divider - 1);
 	}
 
+	/* Do the timer stuff */
+	for (i = 0; i < tick_divider; i++)
+		do_timer_jiffy(regs);
+}
+
 /*
- * Do the timer stuff.
+ * Measure time based on the TSC, rather than counting interrupts.
  */
+static void do_timer_tsc_timekeeping(struct pt_regs *regs)
+{
+	int i;
+	cycles_t tsc, tsc_accounted, tsc_not_accounted;
 
-	for (i = 0; i < tick_divider; i++) {
-		do_timer(regs);
-#ifndef CONFIG_SMP
-		update_process_times(user_mode(regs), regs);
-#endif
+	tsc = get_cycles_sync();
+	tsc_accounted = vxtime.last_tsc;
 
-	/*
-	 * In the SMP case we use the local APIC timer interrupt to do the profiling,
-	 * except when we simulate SMP mode on a uniprocessor system, in that case we
-	 * have to call the local interrupt handler.
-	 */
+	if (unlikely(tsc < tsc_accounted))
+		return;
 
-#ifndef CONFIG_X86_LOCAL_APIC
-		profile_tick(CPU_PROFILING, regs);
-#else
-		if (!using_apic_timer)
-			smp_local_timer_interrupt(regs);
-#endif
+	tsc_not_accounted = tsc - tsc_accounted;
+
+	if (tsc_not_accounted > cycles_accounted_limit) {
+		/* Be extra safe and limit the loop below. */
+		tsc_accounted += tsc_not_accounted - cycles_accounted_limit;
+		tsc_not_accounted = cycles_accounted_limit;
 	}
 
+	while (tsc_not_accounted >= cycles_per_tick) {
+		for (i = 0; i < tick_divider; i++)
+			do_timer_jiffy(regs);
+		tsc_not_accounted -= cycles_per_tick;
+		tsc_accounted += cycles_per_tick;
+	}
+
+	monotonic_base += ((tsc_accounted - vxtime.last_tsc) *
+					1000000 / cpu_khz);
+	vxtime.last_tsc = tsc_accounted;
+}
+
+void main_timer_handler(struct pt_regs *regs)
+{
+	static unsigned long rtc_update = 0;
+
+/*
+ * Here we are in the timer irq handler. We have irqs locally disabled (so we
+ * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
+ * on the other CPU, so we need a lock. We also need to lock the vsyscall
+ * variables, because both do_timer() and us change them -arca+vojtech
+ */
+
+	write_seqlock(&xtime_lock);
+
+	if (timekeeping_use_tsc > 0)
+		do_timer_tsc_timekeeping(regs);
+	else
+		do_timer_account_lost_ticks(regs);
+
 /*
  * If we have an externally synchronized Linux clock, then update CMOS clock
  * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
@@ -1038,6 +1086,19 @@ void __init time_init(void)
 
 	lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ;
 
+	/* Keep time based on the TSC rather than by counting interrupts. */
+	if (timekeeping_use_tsc > 0) {
+		cycles_per_tick = (cpu_khz * 1000) / REAL_HZ;
+		/*
+		 * The maximum cycles we will account per
+		 * timer interrupt is 10 minutes.
+		 */
+		cycles_accounted_limit = cycles_per_tick * REAL_HZ * 60 * 10;
+		tick_nsec = NSEC_PER_SEC / HZ;
+		printk(KERN_INFO
+			"time.c: Using tsc for timekeeping HZ %d\n", HZ);
+	}
+
 	vxtime.mode = VXTIME_TSC;
 	vxtime.quot = (NSEC_PER_SEC << NS_SCALE) / vxtime_hz;
 	vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;
@@ -1100,7 +1161,10 @@ void time_init_gtod(void)
 	else
 		vgetcpu_mode = VGETCPU_LSL;
 
-	if (vxtime.hpet_address && notsc) {
+	if (timekeeping_use_tsc > 0) {
+		timetype = "TSC Timekeeping";
+		vxtime.mode = VXTIME_TSC;
+	} else if (vxtime.hpet_address && notsc) {
 		timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
 		if (hpet_use_timer)
 			vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick_real;
@@ -1471,6 +1535,10 @@ static int __init boot_override_clock(char *str)
 	} else if (!strcmp(str, "tsc")) {
 		nohpet = 1;
 		pmtmr_ioport = 0;
+	} else if (!strcmp(str, "tsccount")) {
+		timekeeping_use_tsc = 1;
+	} else if (!strcmp(str, "notsccount")) {
+		timekeeping_use_tsc = -1;
 	} else
 		printk(KERN_WARNING "%s is unknown clock source\n", str);