Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 89877e42827f16fa5f86b1df0c2860b1 > files > 1788

kernel-2.6.18-128.1.10.el5.src.rpm

From: Brad Peters <bpeters@redhat.com>
Date: Thu, 24 Jul 2008 18:20:23 -0400
Subject: [ppc] RAS update for Cell
Message-id: 20080724222023.17682.56389.sendpatchset@squad5-lp1.lab.bos.redhat.com
O-Subject: [PATCH RHEL5.3] RAS update for Cell
Bugzilla: 313731
RH-Acked-by: David Howells <dhowells@redhat.com>

RHBZ#:
======
https://bugzilla.redhat.com/show_bug.cgi?id=313731

Description:
===========
New Feature / Power Arch Only

This patch adds support for inspecting SPU information after a
kernel crash, through the kdump vmcore file.

Implementation is based on xmon code, but the new functionality was
kept independent from xmon.

kABI Status:
============
No symbols were harmed.

Brew:
=====
Built on all platforms.
http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1370193

Kernel binary rpm available at:
===============================
http://people.redhat.com/bpeters/kernels/kernel-2.6.18-94.el5.94.el5.313731.ppc64.rpm

Upstream Status:
================
I checked and this is in upstream 2.6.24-rc7.

Test Status:
============
spu information successfully shown to be saved for kexec crash.
Test results:
-----------
crash> extend extensions/spu.so # loading extension
./extensions/spu.so: shared object loaded

crash> spus

NODE 0:
ID        SPUADDR      SPUSTATUS       CTXADDR       CTXSTATE    PID
 0   c000000000a33a80     IDLE                   0      -           0
 1   c000000000a33e80   RUNNING   c000000025206f80   RUNNABLE    3557
 2   c000000000a34280   RUNNING   c000000025207f80   RUNNABLE    3556
 3   c000000000a34680   RUNNING   c000000025208f80   RUNNABLE    3555
 4   c00000002ff48a00   RUNNING   c000000025209f80   RUNNABLE    3554
 5   c00000002ff46a00   RUNNING   c00000002520af80   RUNNABLE    3553
 6   c00000002ff44e00   RUNNING   c00000002520bf80   RUNNABLE    3552
 7   c00000002ff44a00   RUNNING   c00000003ecf9d80   RUNNABLE    3551

NODE 1:
ID        SPUADDR      SPUSTATUS       CTXADDR       CTXSTATE    PID
 8   c00000002ff44600   RUNNING   c00000003ecf3d80   RUNNABLE    3550
 9   c00000002ff44200   RUNNING   c000000020c79d00   RUNNABLE    3548
10   c00000002ff43e00   RUNNING   c000000020c70d00   RUNNABLE    3549
11   c000000000a34a80   RUNNING   c00000002ff57300   RUNNABLE    3547
12   c000000000a34e80   RUNNING   c00000002ff5d300   RUNNABLE    3546
13   c000000000a35280   RUNNING   c00000003080de80   RUNNABLE    3544
14   c000000000a35680   RUNNING   c000000030800e80   RUNNABLE    3545
15   c000000000a35a80   RUNNING   c00000003e370900   RUNNABLE    3543

crash> spurq # run queue is empty...

crash> spuctx c00000003ecf3d80

Dumping context fields for spu_context c00000003ecf3d80:
  state                   = 0
  prio                    = 120
  local_store             = 0xc0000000272158a0
  rq                      = 0xc00000003ecf4748
  name                    = spe
  node                    = 1
  number                  = 8
  pid                     = 3550
  slb_replace             = 0x1
  mm                      = 0xc00000003e9f0500
  timestamp               = 0x101a8f9dc
  class_0_pending         = 0
  problem                 = 0xd000080080690000
  priv2                   = 0xd0000800806b0000
  flags                   = 0x0
  saved_mfc_sr1_RW        = 0x3b
  saved_mfc_dar           = 0x184c080
  saved_mfc_dsisr         = 0x0
  saved_spu_runcntl_RW    = 0x1
  saved_spu_status_R      = 0x9
  saved_spu_npc_RW        = 0x0

crash> extend extensions/spu.so # loading extension
./extensions/spu.so: shared object loaded

crash> spus

NODE 0:
ID        SPUADDR      SPUSTATUS       CTXADDR       CTXSTATE    PID
 0   c000000000a33a80     IDLE                   0      -           0
 1   c000000000a33e80   RUNNING   c000000025206f80   RUNNABLE    3557
 2   c000000000a34280   RUNNING   c000000025207f80   RUNNABLE    3556
 3   c000000000a34680   RUNNING   c000000025208f80   RUNNABLE    3555
 4   c00000002ff48a00   RUNNING   c000000025209f80   RUNNABLE    3554
 5   c00000002ff46a00   RUNNING   c00000002520af80   RUNNABLE    3553
 6   c00000002ff44e00   RUNNING   c00000002520bf80   RUNNABLE    3552
 7   c00000002ff44a00   RUNNING   c00000003ecf9d80   RUNNABLE    3551

NODE 1:
ID        SPUADDR      SPUSTATUS       CTXADDR       CTXSTATE    PID
 8   c00000002ff44600   RUNNING   c00000003ecf3d80   RUNNABLE    3550
 9   c00000002ff44200   RUNNING   c000000020c79d00   RUNNABLE    3548
10   c00000002ff43e00   RUNNING   c000000020c70d00   RUNNABLE    3549
11   c000000000a34a80   RUNNING   c00000002ff57300   RUNNABLE    3547
12   c000000000a34e80   RUNNING   c00000002ff5d300   RUNNABLE    3546
13   c000000000a35280   RUNNING   c00000003080de80   RUNNABLE    3544
14   c000000000a35680   RUNNING   c000000030800e80   RUNNABLE    3545
15   c000000000a35a80   RUNNING   c00000003e370900   RUNNABLE    3543

crash> spurq # run queue is empty...

crash> spuctx c00000003ecf3d80

Dumping context fields for spu_context c00000003ecf3d80:
  state                   = 0
  prio                    = 120
  local_store             = 0xc0000000272158a0
  rq                      = 0xc00000003ecf4748
  name                    = spe
  node                    = 1
  number                  = 8
  pid                     = 3550
  slb_replace             = 0x1
  mm                      = 0xc00000003e9f0500
  timestamp               = 0x101a8f9dc
  class_0_pending         = 0
  problem                 = 0xd000080080690000
  priv2                   = 0xd0000800806b0000
  flags                   = 0x0
  saved_mfc_sr1_RW        = 0x3b
  saved_mfc_dar           = 0x184c080
  saved_mfc_dsisr         = 0x0
  saved_spu_runcntl_RW    = 0x1
  saved_spu_status_R      = 0x9
  saved_spu_npc_RW        = 0x0

===============================================================

Brad Peters 1-978-392-1000 x 23183
IBM on-site partner.

Proposed Patch:
===============
This patch is based on 2.6.18-94.el5

diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 1af41f7..f1f602e 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -274,6 +274,72 @@ void crash_kexec_secondary(struct pt_regs *regs)
 	cpus_in_sr = CPU_MASK_NONE;
 }
 #endif
+#ifdef CONFIG_SPU_BASE
+
+#include <asm/spu.h>
+#include <asm/spu_priv1.h>
+
+struct crash_spu_info {	/* one SPU's state captured at crash time */
+	struct spu *spu;		/* NULL until crash_register_spus() sets it */
+	u32 saved_spu_runcntl_RW;	/* copy of problem->spu_runcntl_RW */
+	u32 saved_spu_status_R;		/* copy of problem->spu_status_R */
+	u32 saved_spu_npc_RW;		/* copy of problem->spu_npc_RW */
+	u64 saved_mfc_sr1_RW;		/* spu_mfc_sr1_get(), before modification */
+	u64 saved_mfc_dar;		/* spu_mfc_dar_get() */
+	u64 saved_mfc_dsisr;		/* spu_mfc_dsisr_get() */
+};
+/* Slot i holds the SPU whose spu->number is i. */
+#define CRASH_NUM_SPUS	16	/* Enough for current hardware */
+static struct crash_spu_info crash_spu_info[CRASH_NUM_SPUS];
+
+static void crash_kexec_stop_spus(void)
+{
+	struct spu *spu;
+	int i;
+	u64 tmp;
+	/* For each registered SPU: save its registers, then clear run control. */
+	for (i = 0; i < CRASH_NUM_SPUS; i++) {
+		if (!crash_spu_info[i].spu)
+			continue;
+
+		spu = crash_spu_info[i].spu;
+		/* Problem-state registers, read through in_be32(). */
+		crash_spu_info[i].saved_spu_runcntl_RW =
+			in_be32(&spu->problem->spu_runcntl_RW);
+		crash_spu_info[i].saved_spu_status_R =
+			in_be32(&spu->problem->spu_status_R);
+		crash_spu_info[i].saved_spu_npc_RW =
+			in_be32(&spu->problem->spu_npc_RW);
+		/* MFC registers; SR1 is saved before it is altered below. */
+		crash_spu_info[i].saved_mfc_dar    = spu_mfc_dar_get(spu);
+		crash_spu_info[i].saved_mfc_dsisr  = spu_mfc_dsisr_get(spu);
+		tmp = spu_mfc_sr1_get(spu);
+		crash_spu_info[i].saved_mfc_sr1_RW = tmp;
+		/* Clear the master run control bit in SR1 and write it back. */
+		tmp &= ~MFC_STATE1_MASTER_RUN_CONTROL_MASK;
+		spu_mfc_sr1_set(spu, tmp);
+		/* NOTE(review): presumably gives the SPU time to halt - confirm. */
+		__delay(200);
+	}
+}
+
+void crash_register_spus(struct list_head *list)
+{
+	struct spu *spu;
+	/* Record every SPU on @list in the slot selected by spu->number. */
+	list_for_each_entry(spu, list, full_list) {
+		if (spu->number < CRASH_NUM_SPUS)
+			crash_spu_info[spu->number].spu = spu;
+		else
+			WARN_ON(1);	/* no slot for this SPU: warn, skip it */
+	}
+}
+
+#else /* !CONFIG_SPU_BASE: no-op stub so the caller needs no #ifdef */
+static inline void crash_kexec_stop_spus(void)
+{
+}
+#endif /* CONFIG_SPU_BASE */
 
 void default_machine_crash_shutdown(struct pt_regs *regs)
 {
@@ -309,6 +375,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	crash_save_this_cpu(regs, crashing_cpu);
 	crash_kexec_prepare_cpus(crashing_cpu);
 	cpu_set(crashing_cpu, cpus_in_crash);
+	crash_kexec_stop_spus();
 	if (ppc_md.kexec_cpu_down)
 		ppc_md.kexec_cpu_down(1, 0);
 }
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index de45b16..c2c7bc4 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -668,6 +668,7 @@ static int __init init_spu_base(void)
 
 	spu_init_affinity();
 
+	crash_register_spus(&spu_full_list);
 	return 0;
 
  out_unregister_sysdev_class:
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index ea1e75d..ddd23db 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -217,6 +217,14 @@ static inline void crash_register_spus(struct list_head *list)
 }
 #endif
 
+#ifdef CONFIG_KEXEC
+void crash_register_spus(struct list_head *list);
+#else /* !CONFIG_KEXEC: registration is a no-op */
+static inline void crash_register_spus(struct list_head *list)
+{
+}
+#endif /* CONFIG_KEXEC */
+
 extern void spu_invalidate_slbs(struct spu *spu);
 extern void spu_associate_mm(struct spu *spu, struct mm_struct *mm);