Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 4148

kernel-2.6.18-194.11.1.el5.src.rpm

From: Aristeu Rozanski <aris@redhat.com>
Date: Wed, 20 Aug 2008 13:16:47 -0400
Subject: [x86_64] nmi: add perfctr infrastructure
Message-id: 20080820171646.783256000@redhat.com
O-Subject: [RHEL5.3 PATCH 01/25] nmi: add perfctr infrastructure
Bugzilla: 447618

https://bugzilla.redhat.com/show_bug.cgi?id=447618

add perfctr infrastructure that will make possible to share perf counters

upstream: 09198e68501a7e34737cd9264d266f42429abcdc
	  86d78f640257344cc90a50da8cd52297ba1c6bdf
	  4f14bdef41e599e218d71e3d0abf339d65e9b480
	  (and others)

diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index d5eeace..dcec5ae 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj-$(CONFIG_EFI)              += efi.o efi_callwrap.o
 
 obj-$(CONFIG_MODULES)		+= module.o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= perfctr-watchdog.o
 
 obj-y				+= topology.o
 obj-y				+= intel_cacheinfo.o
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 9ee8c4f..006f6fc 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -234,39 +234,6 @@ int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
-static void disable_intel_arch_watchdog(void);
-
-static void disable_lapic_nmi_watchdog(void)
-{
-	if (nmi_active <= 0)
-		return;
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
-		break;
-	case X86_VENDOR_INTEL:
-		if (boot_cpu_data.x86 == 15) {
-			wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
-			wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
-		} else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-			disable_intel_arch_watchdog();
-		}
-		break;
-	}
-	nmi_active = -1;
-	/* tell do_nmi() and others that we're not active any more */
-	nmi_watchdog = 0;
-}
-
-static void enable_lapic_nmi_watchdog(void)
-{
-	if (nmi_active < 0) {
-		nmi_watchdog = NMI_LOCAL_APIC;
-		touch_nmi_watchdog();
-		setup_apic_nmi_watchdog();
-	}
-}
-
 int reserve_lapic_nmi(void)
 {
 	unsigned int old_owner;
@@ -374,126 +341,6 @@ static void clear_msr_range(unsigned int base, unsigned int n)
 		wrmsr(base+i, 0, 0);
 }
 
-static void setup_k7_watchdog(void)
-{
-	int i;
-	unsigned int evntsel;
-
-	nmi_perfctr_msr = MSR_K7_PERFCTR0;
-
-	for(i = 0; i < 4; ++i) {
-		/* Simulator may not support it */
-		if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
-			nmi_perfctr_msr = 0;
-			return;
-		}
-		wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
-	}
-
-	evntsel = K7_EVNTSEL_INT
-		| K7_EVNTSEL_OS
-		| K7_EVNTSEL_USR
-		| K7_NMI_EVENT;
-
-	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
-	wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz));
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= K7_EVNTSEL_ENABLE;
-	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
-}
-
-static void disable_intel_arch_watchdog(void)
-{
-	unsigned ebx;
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
-	 */
-	ebx = cpuid_ebx(10);
-	if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
-}
-
-static int setup_intel_arch_watchdog(void)
-{
-	unsigned int evntsel;
-	unsigned ebx;
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
-	 */
-	ebx = cpuid_ebx(10);
-	if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		return 0;
-
-	nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
-
-	clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
-	clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
-
-	evntsel = ARCH_PERFMON_EVENTSEL_INT
-		| ARCH_PERFMON_EVENTSEL_OS
-		| ARCH_PERFMON_EVENTSEL_USR
-		| ARCH_PERFMON_NMI_EVENT_SEL
-		| ARCH_PERFMON_NMI_EVENT_UMASK;
-
-	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
-
-	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-	wrmsr(nmi_perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
-
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
-	return 1;
-}
-
-
-static int setup_p4_watchdog(void)
-{
-	unsigned int misc_enable, dummy;
-
-	rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
-	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
-		return 0;
-
-	nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
-	nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
-#ifdef CONFIG_SMP
-	if (smp_num_siblings == 2)
-		nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
-#endif
-
-	if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
-		clear_msr_range(0x3F1, 2);
-	/* MSR 0x3F0 seems to have a default value of 0xFC00, but current
-	   docs doesn't fully define it, so leave it alone for now. */
-	if (boot_cpu_data.x86_model >= 0x3) {
-		/* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
-		clear_msr_range(0x3A0, 26);
-		clear_msr_range(0x3BC, 3);
-	} else {
-		clear_msr_range(0x3A0, 31);
-	}
-	clear_msr_range(0x3C0, 6);
-	clear_msr_range(0x3C8, 6);
-	clear_msr_range(0x3E0, 2);
-	clear_msr_range(MSR_P4_CCCR0, 18);
-	clear_msr_range(MSR_P4_PERFCTR0, 18);
-
-	wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
-	wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
-	Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz));
-	wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz));
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
-	return 1;
-}
-
 void setup_apic_nmi_watchdog(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
diff --git a/arch/x86_64/kernel/perfctr-watchdog.c b/arch/x86_64/kernel/perfctr-watchdog.c
new file mode 100644
index 0000000..984f2fb
--- /dev/null
+++ b/arch/x86_64/kernel/perfctr-watchdog.c
@@ -0,0 +1,725 @@
+/* local apic based NMI watchdog for various CPUs.
+   This file also handles reservation of performance counters for coordination
+   with other users (like oprofile).
+
+   Note that these events normally don't tick when the CPU idles. This means
+   the frequency varies with CPU load.
+
+   Original code for K7/P6 written by Keith Owens */
+
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/nmi.h>
+#include <asm/apic.h>
+#include <asm/intel_arch_perfmon.h>
+
+struct nmi_watchdog_ctlblk {
+	unsigned int cccr_msr;
+	unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
+	unsigned int evntsel_msr;  /* the MSR to select the events to handle */
+};
+
+/* Interface defining a CPU specific perfctr watchdog */
+struct wd_ops {
+	int (*reserve)(void);
+	void (*unreserve)(void);
+	int (*setup)(unsigned nmi_hz);
+	void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
+	void (*stop)(void);
+	unsigned perfctr;
+	unsigned evntsel;
+	u64 checkbit;
+};
+
+static const struct wd_ops *wd_ops;
+
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
+
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ *   different subsystems this reservation system just tries to coordinate
+ *   things a little
+ */
+static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
+static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
+
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+	/* returns the bit offset of the performance counter register */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return (msr - MSR_K7_PERFCTR0);
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return (msr - MSR_ARCH_PERFMON_PERFCTR0);
+
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return (msr - MSR_P6_PERFCTR0);
+		case 15:
+			return (msr - MSR_P4_BPU_PERFCTR0);
+		}
+	}
+	return 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+/* returns the bit offset of the event selection register */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+	/* returns the bit offset of the event selection register */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return (msr - MSR_K7_EVNTSEL0);
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
+
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return (msr - MSR_P6_EVNTSEL0);
+		case 15:
+			return (msr - MSR_P4_BSU_ESCR0);
+		}
+	}
+	return 0;
+
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, perfctr_nmi_owner));
+}
+
+/* checks the an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, perfctr_nmi_owner));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	/* register not managed by the allocator? */
+	if (counter > NMI_MAX_COUNTER_BITS)
+		return 1;
+
+	if (!test_and_set_bit(counter, perfctr_nmi_owner))
+		return 1;
+	return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	/* register not managed by the allocator? */
+	if (counter > NMI_MAX_COUNTER_BITS)
+		return;
+
+	clear_bit(counter, perfctr_nmi_owner);
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	/* register not managed by the allocator? */
+	if (counter > NMI_MAX_COUNTER_BITS)
+		return 1;
+
+	if (!test_and_set_bit(counter, evntsel_nmi_owner))
+		return 1;
+	return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	/* register not managed by the allocator? */
+	if (counter > NMI_MAX_COUNTER_BITS)
+		return;
+
+	clear_bit(counter, evntsel_nmi_owner);
+}
+
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
+
+static void stop_apic_nmi_watchdog2(void *unused)
+{
+	stop_apic_nmi_watchdog();
+}
+
+extern atomic_t nmi_watchdog_active;
+void disable_lapic_nmi_watchdog(void)
+{
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+	if (atomic_read(&nmi_watchdog_active) <= 0)
+		return;
+
+	on_each_cpu(stop_apic_nmi_watchdog2, NULL, 0, 1);
+	wd_ops->unreserve();
+
+	BUG_ON(atomic_read(&nmi_watchdog_active) != 0);
+}
+
+static void setup_apic_nmi_watchdog2(void *unused)
+{
+	setup_apic_nmi_watchdog();
+}
+
+void enable_lapic_nmi_watchdog(void)
+{
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+	/* are we already enabled */
+	if (atomic_read(&nmi_watchdog_active) != 0)
+		return;
+
+	/* are we lapic aware */
+	if (!wd_ops)
+		return;
+	if (!wd_ops->reserve()) {
+		printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
+		return;
+	}
+
+	on_each_cpu(setup_apic_nmi_watchdog2, NULL, 0, 1);
+	touch_nmi_watchdog();
+}
+
+/*
+ * Activate the NMI watchdog via the local APIC.
+ */
+
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+	u64 counter_val;
+	unsigned int retval = hz;
+
+	/*
+	 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
+	 * are writable, with higher bits sign extending from bit 31.
+	 * So, we can only program the counter with 31 bit values and
+	 * 32nd bit should be 1, for 33.. to be 1.
+	 * Find the appropriate nmi_hz
+	 */
+	counter_val = (u64)cpu_khz * 1000;
+	do_div(counter_val, retval);
+ 	if (counter_val > 0x7fffffffULL) {
+		u64 count = (u64)cpu_khz * 1000;
+		do_div(count, 0x7fffffffUL);
+		retval = count + 1;
+	}
+	return retval;
+}
+
+static void
+write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
+{
+	u64 count = (u64)cpu_khz * 1000;
+
+	do_div(count, nmi_hz);
+	if(descr)
+		Dprintk("setting %s to -0x%08Lx\n", descr, count);
+	wrmsrl(perfctr_msr, 0 - count);
+}
+
+static void write_watchdog_counter32(unsigned int perfctr_msr,
+		const char *descr, unsigned nmi_hz)
+{
+	u64 count = (u64)cpu_khz * 1000;
+
+	do_div(count, nmi_hz);
+	if(descr)
+		Dprintk("setting %s to -0x%08Lx\n", descr, count);
+	wrmsr(perfctr_msr, (u32)(-count), 0);
+}
+
+/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
+   nicely stable so there is not much variety */
+
+#define K7_EVNTSEL_ENABLE	(1 << 22)
+#define K7_EVNTSEL_INT		(1 << 20)
+#define K7_EVNTSEL_OS		(1 << 17)
+#define K7_EVNTSEL_USR		(1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
+#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+static int setup_k7_watchdog(unsigned nmi_hz)
+{
+	unsigned int perfctr_msr, evntsel_msr;
+	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	perfctr_msr = wd_ops->perfctr;
+	evntsel_msr = wd_ops->evntsel;
+
+	wrmsrl(perfctr_msr, 0UL);
+
+	evntsel = K7_EVNTSEL_INT
+		| K7_EVNTSEL_OS
+		| K7_EVNTSEL_USR
+		| K7_NMI_EVENT;
+
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= K7_EVNTSEL_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	return 1;
+}
+
+static void single_msr_stop_watchdog(void)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	wrmsr(wd->evntsel_msr, 0, 0);
+}
+
+static int single_msr_reserve(void)
+{
+	if (!reserve_perfctr_nmi(wd_ops->perfctr))
+		return 0;
+
+	if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
+		release_perfctr_nmi(wd_ops->perfctr);
+		return 0;
+	}
+	return 1;
+}
+
+static void single_msr_unreserve(void)
+{
+	release_evntsel_nmi(wd_ops->evntsel);
+	release_perfctr_nmi(wd_ops->perfctr);
+}
+
+static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+	/* start the cycle over again */
+	write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
+}
+
+static const struct wd_ops k7_wd_ops = {
+	.reserve = single_msr_reserve,
+	.unreserve = single_msr_unreserve,
+	.setup = setup_k7_watchdog,
+	.rearm = single_msr_rearm,
+	.stop = single_msr_stop_watchdog,
+	.perfctr = MSR_K7_PERFCTR0,
+	.evntsel = MSR_K7_EVNTSEL0,
+	.checkbit = 1ULL<<47,
+};
+
+/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
+
+#define P6_EVNTSEL0_ENABLE	(1 << 22)
+#define P6_EVNTSEL_INT		(1 << 20)
+#define P6_EVNTSEL_OS		(1 << 17)
+#define P6_EVNTSEL_USR		(1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
+#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+static int setup_p6_watchdog(unsigned nmi_hz)
+{
+	unsigned int perfctr_msr, evntsel_msr;
+	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	perfctr_msr = wd_ops->perfctr;
+	evntsel_msr = wd_ops->evntsel;
+
+#if 0
+	/* KVM doesn't implement this MSR */
+	if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
+		return 0;
+#endif
+
+	evntsel = P6_EVNTSEL_INT
+		| P6_EVNTSEL_OS
+		| P6_EVNTSEL_USR
+		| P6_NMI_EVENT;
+
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+	write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= P6_EVNTSEL0_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	return 1;
+}
+
+static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+	/* P6 based Pentium M need to re-unmask
+	 * the apic vector but it doesn't hurt
+	 * other P6 variant.
+	 * ArchPerfom/Core Duo also needs this */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	/* P6/ARCH_PERFMON has 32 bit counter write */
+	write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
+}
+
+static const struct wd_ops p6_wd_ops = {
+	.reserve = single_msr_reserve,
+	.unreserve = single_msr_unreserve,
+	.setup = setup_p6_watchdog,
+	.rearm = p6_rearm,
+	.stop = single_msr_stop_watchdog,
+	.perfctr = MSR_P6_PERFCTR0,
+	.evntsel = MSR_P6_EVNTSEL0,
+	.checkbit = 1ULL<<39,
+};
+
+/* Intel P4 performance counters. By far the most complicated of all. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
+#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
+#define P4_ESCR_OS		(1<<3)
+#define P4_ESCR_USR		(1<<2)
+#define P4_CCCR_OVF_PMI0	(1<<26)
+#define P4_CCCR_OVF_PMI1	(1<<27)
+#define P4_CCCR_THRESHOLD(N)	((N)<<20)
+#define P4_CCCR_COMPLEMENT	(1<<19)
+#define P4_CCCR_COMPARE		(1<<18)
+#define P4_CCCR_REQUIRED	(3<<16)
+#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
+#define P4_CCCR_ENABLE		(1<<12)
+#define P4_CCCR_OVF 		(1<<31)
+
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+   CRU_ESCR0 (with any non-null event selector) through a complemented
+   max threshold. [IA32-Vol3, Section 14.9.9] */
+
+static int setup_p4_watchdog(unsigned nmi_hz)
+{
+	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+	unsigned int evntsel, cccr_val;
+	unsigned int misc_enable, dummy;
+	unsigned int ht_num;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
+	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+		return 0;
+
+#ifdef CONFIG_SMP
+	/* detect which hyperthread we are on */
+	if (smp_num_siblings == 2) {
+		unsigned int ebx, apicid;
+
+        	ebx = cpuid_ebx(1);
+	        apicid = (ebx >> 24) & 0xff;
+        	ht_num = apicid & 1;
+	} else
+#endif
+		ht_num = 0;
+
+	/* performance counters are shared resources
+	 * assign each hyperthread its own set
+	 * (re-use the ESCR0 register, seems safe
+	 * and keeps the cccr_val the same)
+	 */
+	if (!ht_num) {
+		/* logical cpu 0 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR0;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR0;
+		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+	} else {
+		/* logical cpu 1 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR1;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR1;
+		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+	}
+
+	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+	 	| P4_ESCR_OS
+		| P4_ESCR_USR;
+
+	cccr_val |= P4_CCCR_THRESHOLD(15)
+		 | P4_CCCR_COMPLEMENT
+		 | P4_CCCR_COMPARE
+		 | P4_CCCR_REQUIRED;
+
+	wrmsr(evntsel_msr, evntsel, 0);
+	wrmsr(cccr_msr, cccr_val, 0);
+	write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	cccr_val |= P4_CCCR_ENABLE;
+	wrmsr(cccr_msr, cccr_val, 0);
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = cccr_msr;
+	return 1;
+}
+
+static void stop_p4_watchdog(void)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	wrmsr(wd->cccr_msr, 0, 0);
+	wrmsr(wd->evntsel_msr, 0, 0);
+}
+
+static int p4_reserve(void)
+{
+	if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
+		return 0;
+#ifdef CONFIG_SMP
+	if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
+		goto fail1;
+#endif
+	if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
+		goto fail2;
+	/* RED-PEN why is ESCR1 not reserved here? */
+	return 1;
+ fail2:
+#ifdef CONFIG_SMP
+	if (smp_num_siblings > 1)
+		release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
+ fail1:
+#endif
+	release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
+	return 0;
+}
+
+static void p4_unreserve(void)
+{
+#ifdef CONFIG_SMP
+	if (smp_num_siblings > 1)
+		release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
+#endif
+	release_evntsel_nmi(MSR_P4_CRU_ESCR0);
+	release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
+}
+
+static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+	unsigned dummy;
+	/*
+ 	 * P4 quirks:
+	 * - An overflown perfctr will assert its interrupt
+	 *   until the OVF flag in its CCCR is cleared.
+	 * - LVTPC is masked on interrupt and must be
+	 *   unmasked by the LVTPC handler.
+	 */
+	rdmsrl(wd->cccr_msr, dummy);
+	dummy &= ~P4_CCCR_OVF;
+	wrmsrl(wd->cccr_msr, dummy);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	/* start the cycle over again */
+	write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
+}
+
+static const struct wd_ops p4_wd_ops = {
+	.reserve = p4_reserve,
+	.unreserve = p4_unreserve,
+	.setup = setup_p4_watchdog,
+	.rearm = p4_rearm,
+	.stop = stop_p4_watchdog,
+	/* RED-PEN this is wrong for the other sibling */
+	.perfctr = MSR_P4_BPU_PERFCTR0,
+	.evntsel = MSR_P4_BSU_ESCR0,
+	.checkbit = 1ULL<<39,
+};
+
+/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
+   all future Intel CPUs. */
+
+#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
+static struct wd_ops intel_arch_wd_ops;
+
+static int setup_intel_arch_watchdog(unsigned nmi_hz)
+{
+	unsigned int ebx;
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int perfctr_msr, evntsel_msr;
+	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
+	 */
+	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		return 0;
+
+	perfctr_msr = wd_ops->perfctr;
+	evntsel_msr = wd_ops->evntsel;
+
+	wrmsrl(perfctr_msr, 0UL);
+
+	evntsel = ARCH_PERFMON_EVENTSEL_INT
+		| ARCH_PERFMON_EVENTSEL_OS
+		| ARCH_PERFMON_EVENTSEL_USR
+		| ARCH_PERFMON_NMI_EVENT_SEL
+		| ARCH_PERFMON_NMI_EVENT_UMASK;
+
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+	write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
+	return 1;
+}
+
+static struct wd_ops intel_arch_wd_ops = {
+	.reserve = single_msr_reserve,
+	.unreserve = single_msr_unreserve,
+	.setup = setup_intel_arch_watchdog,
+	.rearm = p6_rearm,
+	.stop = single_msr_stop_watchdog,
+	.perfctr = MSR_ARCH_PERFMON_PERFCTR1,
+	.evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
+};
+
+static void probe_nmi_watchdog(void)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
+		    boot_cpu_data.x86 != 16)
+			return;
+		wd_ops = &k7_wd_ops;
+		break;
+	case X86_VENDOR_INTEL:
+		/* Work around Core Duo (Yonah) errata AE49 where perfctr1
+		   doesn't have a working enable bit. */
+		if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
+			intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
+			intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
+		}
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			wd_ops = &intel_arch_wd_ops;
+			break;
+		}
+		switch (boot_cpu_data.x86) {
+		case 6:
+			if (boot_cpu_data.x86_model > 0xd)
+				return;
+
+			wd_ops = &p6_wd_ops;
+			break;
+		case 15:
+			wd_ops = &p4_wd_ops;
+			break;
+		default:
+			return;
+		}
+		break;
+	}
+}
+
+/* Interface to nmi.c */
+
+int lapic_watchdog_init(unsigned nmi_hz)
+{
+	if (!wd_ops) {
+		probe_nmi_watchdog();
+		if (!wd_ops)
+			return -1;
+
+		if (!wd_ops->reserve()) {
+			printk(KERN_ERR
+				"NMI watchdog: cannot reserve perfctrs\n");
+			return -1;
+		}
+	}
+
+	if (!(wd_ops->setup(nmi_hz))) {
+		printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
+		       smp_processor_id());
+		return -1;
+	}
+
+	return 0;
+}
+
+void lapic_watchdog_stop(void)
+{
+	if (wd_ops)
+		wd_ops->stop();
+}
+
+unsigned lapic_adjust_nmi_hz(unsigned hz)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+	    wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
+		hz = adjust_for_32bit_ctr(hz);
+	return hz;
+}
+
+int lapic_wd_event(unsigned nmi_hz)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	u64 ctr;
+	rdmsrl(wd->perfctr_msr, ctr);
+	if (ctr & wd_ops->checkbit) { /* perfctr still running? */
+		return 0;
+	}
+	wd_ops->rearm(wd, nmi_hz);
+	return 1;
+}
+
+int lapic_watchdog_ok(void)
+{
+	return wd_ops != NULL;
+}
+
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index 309cee9..12e8714 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -76,6 +76,7 @@ extern void cache_APIC_registers (void);
 extern void sync_Arb_IDs (void);
 extern void init_bsp_APIC (void);
 extern void setup_local_APIC (void);
+extern void stop_apic_nmi_watchdog(void);
 extern void init_apic_mappings (void);
 extern void smp_local_timer_interrupt (struct pt_regs * regs);
 extern void setup_boot_APIC_clock (void);
diff --git a/include/asm-x86_64/intel_arch_perfmon.h b/include/asm-x86_64/intel_arch_perfmon.h
index 59c3964..51db0de 100644
--- a/include/asm-x86_64/intel_arch_perfmon.h
+++ b/include/asm-x86_64/intel_arch_perfmon.h
@@ -14,6 +14,18 @@
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c)
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+	(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
 
+
+union cpuid10_eax {
+	struct {
+		unsigned int version_id:8;
+		unsigned int num_counters:8;
+		unsigned int bit_width:8;
+		unsigned int mask_length:8;
+	} split;
+	unsigned int full;
+};
 #endif	/* X86_64_INTEL_ARCH_PERFMON_H */
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index efb45c8..2974e5c 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -55,6 +55,21 @@ extern void die_nmi(char *str, struct pt_regs *regs);
 extern int panic_on_timeout;
 extern int unknown_nmi_panic;
 
+extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
+extern int avail_to_resrv_perfctr_nmi(unsigned int);
+extern int reserve_perfctr_nmi(unsigned int);
+extern void release_perfctr_nmi(unsigned int);
+extern int reserve_evntsel_nmi(unsigned int);
+extern void release_evntsel_nmi(unsigned int);
+
+void lapic_watchdog_stop(void);
+int lapic_watchdog_init(unsigned nmi_hz);
+int lapic_wd_event(unsigned nmi_hz);
+unsigned lapic_adjust_nmi_hz(unsigned hz);
+int lapic_watchdog_ok(void);
+void disable_lapic_nmi_watchdog(void);
+void enable_lapic_nmi_watchdog(void);
+
 extern int check_nmi_watchdog(void);
  
 extern void setup_apic_nmi_watchdog (void);