From: Brad Peters <bpeters@redhat.com>
Date: Tue, 29 Jul 2008 17:25:43 -0400
Subject: [ppc64] cell spufs: update with post 2.6.25 patches
Message-id: 20080729212543.14304.29502.sendpatchset@squad5-lp1.lab.bos.redhat.com
O-Subject: [PATCH RHEL 5.3 2/3] Update with post 2.6.25 patches
Bugzilla: 439483

RHBZ#:
======
https://bugzilla.redhat.com/show_bug.cgi?id=439483

Description:
===========
Bug fixes and updates, Cell arch specific (powerpc).

Updates Cell spufs with the post-2.6.25 mainline patches.

RHEL Version Found:
================
RHEL 5.2

kABI Status:
============
No symbols were harmed.
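
The compatibility trick is the __GENKSYMS__ guard in the
include/asm-powerpc/spu.h hunk below: genksyms preprocesses with
__GENKSYMS__ defined, so the checksum pass still sees the old
one-argument stop_callback prototype, while real builds get the new
two-argument version in the RHEL5U2 extension area. A minimal,
compilable sketch of the pattern; struct and member names here are
simplified stand-ins, not the real struct spu:

/*
 * Build with "cc demo.c" for the view the kernel actually compiles,
 * or "cc -D__GENKSYMS__ demo.c" for the view the checksummer hashes.
 */
#include <stdio.h>

struct spu_like {
	int node;
#ifdef __GENKSYMS__
	void (*stop_callback)(struct spu_like *spu);	/* old, checksummed */
#endif
	int nr_active;
#ifndef __GENKSYMS__
	/* extension area: new members are only added here */
	void (*stop_callback)(struct spu_like *spu, int irq);
#endif
};

int main(void)
{
#ifdef __GENKSYMS__
	puts("genksyms view: one-argument stop_callback");
#else
	puts("build view: two-argument stop_callback");
#endif
	return 0;
}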

Brew:
=====
Built on all platforms.
http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1404188

Upstream Status:
================
Backported from mainline
See patch for details

===============================================================

Brad Peters 1-978-392-1000 x 23183
IBM on-site partner.

Proposed Patch:
===============
This patch is based on 2.6.18-98.el5 and contains all bugfixes that
went into mainline after 2.6.25. A user-space sketch of the central
fasteoi fix follows the patch list below.

#0001-POWERPC-spufs-add-newline-to-signal-1-2-_type-fil.patch
#0002-POWERPC-spufs-reacquire-LS-pointer-in-spu_process.patch
#0003-POWERPC-spufs-save-MFC-command-channel-before-pur.patch
#0004-POWERPC-spufs-fix-incorrect-file-descriptors-in-S.patch
#0001-POWERPC-spufs-add-.gitignore-for-spu_save_dump.h.patch
#0002-POWERPC-cell-Fix-lost-interrupts-due-to-fasteoi-h.patch
#0003-POWERPC-spufs-don-t-touch-suspend-bits-when-purgi.patch
#0004-POWERPC-spufs-fix-save-of-mfc_cntl-register.patch
#0005-POWERPC-spufs-fix-post-stopped-update-of-MFC_CNTL.patch
#0006-POWERPC-spufs-update-master-runcntl-with-context.patch
#0007-POWERPC-spufs-don-t-acquire-state_mutex-interrupt.patch
#0008-POWERPC-spufs-set-SPU_CONTEXT_SWITCH_PENDING-befo.patch
#0009-POWERPC-spufs-try-to-route-SPU-interrupts-to-loca.patch
#0010-POWERPC-spufs-fix-concurrent-delivery-of-class-0.patch
#0011-POWERPC-spufs-handle-faults-while-the-context-swi.patch
#spufs-dont-requeue-victim-context-in-find_victim-if-its-not-in-spu_run.patch
#spufs-fix-pointer-reference-in-find_victim.diff
#spufs-fix-coredump-note-corruption.patch
#spufs-calculate-load-per-node.diff
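
The trickiest of these is the fasteoi change (#0002 above). With the
generic handle_fasteoi_irq, an interrupt that arrives while the
previous one is still being serviced is only marked IRQ_PENDING and,
on this controller, is never replayed, so it is lost. The new
handle_iic_irq (first hunk below) issues the EOI on every path and
re-runs the action in a loop for as long as IRQ_PENDING gets set in
the meantime. A minimal user-space sketch of that replay loop, where
the types and the simulated device are stand-ins and only the control
flow mirrors the real handler:

#include <stdio.h>

#define IRQ_INPROGRESS	0x1
#define IRQ_DISABLED	0x2
#define IRQ_PENDING	0x4

struct irq_desc {
	unsigned int status;
	int fired;		/* how many times the action ran */
};

/* Simulated device action: the first run re-raises the interrupt,
 * standing in for a new interrupt arriving mid-handler. */
static void run_action(struct irq_desc *desc)
{
	if (++desc->fired == 1)
		desc->status |= IRQ_PENDING;
}

static void handle_replay(struct irq_desc *desc)
{
	if (desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) {
		/* can't run it now: remember it; the real handler
		 * still EOIs before returning */
		desc->status |= IRQ_PENDING;
		return;
	}
	desc->status |= IRQ_INPROGRESS;
	do {
		desc->status &= ~IRQ_PENDING;
		run_action(desc);
	} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED))
			== IRQ_PENDING);
	desc->status &= ~IRQ_INPROGRESS;
	/* the real handler EOIs here, too */
}

int main(void)
{
	struct irq_desc desc = { 0, 0 };

	desc.status |= IRQ_PENDING;	/* hardware raises the line */
	handle_replay(&desc);
	printf("action ran %d times\n", desc.fired);	/* prints 2 */
	return 0;
}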

diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index c8fa985..455bbae 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -35,6 +35,7 @@
 #include <linux/percpu.h>
 #include <linux/types.h>
 #include <linux/ioport.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/io.h>
 #include <asm/pgtable.h>
@@ -232,6 +233,56 @@ static int iic_host_match(struct irq_host *h, struct device_node *node)
 				    "IBM,CBEA-Internal-Interrupt-Controller");
 }
 
+extern int noirqdebug;
+
+static void handle_iic_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+	const unsigned int cpu = smp_processor_id();
+
+	spin_lock(&desc->lock);
+
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+
+	/*
+	 * If we're currently running this IRQ, or it's disabled,
+	 * we shouldn't process the IRQ. Mark it pending, handle
+	 * the necessary masking, and bail out.
+	 */
+	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
+		    !desc->action)) {
+		desc->status |= IRQ_PENDING;
+		goto out_eoi;
+	}
+
+	kstat_cpu(cpu).irqs[irq]++;
+
+	/* Mark the IRQ currently in progress. */
+	desc->status |= IRQ_INPROGRESS;
+
+	do {
+		struct irqaction *action = desc->action;
+		irqreturn_t action_ret;
+
+		if (unlikely(!action)) {
+			printk("%s: no action\n", __func__);
+			goto out_eoi;
+		}
+
+		desc->status &= ~IRQ_PENDING;
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, regs, action);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret, regs);
+		spin_lock(&desc->lock);
+
+	} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
+
+	desc->status &= ~IRQ_INPROGRESS;
+out_eoi:
+	desc->chip->eoi(irq);
+	spin_unlock(&desc->lock);
+}
+
 static int iic_host_map(struct irq_host *h, unsigned int virq,
 			irq_hw_number_t hw)
 {
@@ -241,10 +292,10 @@ static int iic_host_map(struct irq_host *h, unsigned int virq,
 		break;
 	case IIC_IRQ_TYPE_IOEXC:
 		set_irq_chip_and_handler(virq, &iic_ioexc_chip,
-					 handle_fasteoi_irq);
+					 handle_iic_irq);
 		break;
 	default:
-		set_irq_chip_and_handler(virq, &iic_chip, handle_fasteoi_irq);
+		set_irq_chip_and_handler(virq, &iic_chip, handle_iic_irq);
 	}
 	return 0;
 }
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index bcbf9f8..2c22ce6 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -143,6 +143,10 @@ static void spu_restart_dma(struct spu *spu)
 
 	if (!test_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags))
 		out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
+	else {
+		set_bit(SPU_CONTEXT_FAULT_PENDING, &spu->flags);
+		mb();
+	}
 }
 
 static inline void spu_load_slb(struct spu *spu, int slbe, struct spu_slb *slb)
@@ -228,11 +232,13 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr)
 		return 0;
 	}
 
-	spu->class_0_pending = 0;
-	spu->dar = ea;
-	spu->dsisr = dsisr;
+	spu->class_1_dar = ea;
+	spu->class_1_dsisr = dsisr;
+
+	spu->stop_callback(spu, 1);
 
-	spu->stop_callback(spu);
+	spu->class_1_dar = 0;
+	spu->class_1_dsisr = 0;
 
 	return 0;
 }
@@ -320,11 +326,15 @@ spu_irq_class_0(int irq, void *data, struct pt_regs *regs)
 	stat = spu_int_stat_get(spu, 0) & mask;
 
 	spu->class_0_pending |= stat;
-	spu->dsisr = spu_mfc_dsisr_get(spu);
-	spu->dar = spu_mfc_dar_get(spu);
+	spu->class_0_dsisr = spu_mfc_dsisr_get(spu);
+	spu->class_0_dar = spu_mfc_dar_get(spu);
 	spin_unlock(&spu->register_lock);
 
-	spu->stop_callback(spu);
+	spu->stop_callback(spu, 0);
+
+	spu->class_0_pending = 0;
+	spu->class_0_dsisr = 0;
+	spu->class_0_dar = 0;
 
 	spu_int_stat_clear(spu, 0, stat);
 
@@ -365,6 +375,9 @@ spu_irq_class_1(int irq, void *data, struct pt_regs *regs)
 	if (stat & CLASS1_LS_COMPARE_SUSPEND_ON_PUT_INTR)
 		;
 
+	spu->class_1_dsisr = 0;
+	spu->class_1_dar = 0;
+
 	return stat ? IRQ_HANDLED : IRQ_NONE;
 }
 
@@ -398,10 +411,10 @@ spu_irq_class_2(int irq, void *data, struct pt_regs *regs)
 		spu->ibox_callback(spu);
 
 	if (stat & CLASS2_SPU_STOP_INTR)
-		spu->stop_callback(spu);
+		spu->stop_callback(spu, 2);
 
 	if (stat & CLASS2_SPU_HALT_INTR)
-		spu->stop_callback(spu);
+		spu->stop_callback(spu, 2);
 
 	if (stat & CLASS2_SPU_DMA_TAG_GROUP_COMPLETE_INTR)
 		spu->mfc_callback(spu);
diff --git a/arch/powerpc/platforms/cell/spu_priv1_mmio.c b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
index eec8a93..c64d9d4 100644
--- a/arch/powerpc/platforms/cell/spu_priv1_mmio.c
+++ b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
@@ -28,6 +28,7 @@
 #include <linux/io.h>
 #include <linux/mutex.h>
 #include <linux/device.h>
+#include <linux/sched.h>
 
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
@@ -75,8 +76,19 @@ static u64 int_stat_get(struct spu *spu, int class)
 
 static void cpu_affinity_set(struct spu *spu, int cpu)
 {
-	u64 target = iic_get_target_id(cpu);
-	u64 route = target << 48 | target << 32 | target << 16;
+	u64 target;
+	u64 route;
+
+	if (nr_cpus_node(spu->node)) {
+		cpumask_t spumask = node_to_cpumask(spu->node);
+		cpumask_t cpumask = node_to_cpumask(cpu_to_node(cpu));
+
+		if (!cpus_intersects(spumask, cpumask))
+			return;
+	}
+
+	target = iic_get_target_id(cpu);
+	route = target << 48 | target << 32 | target << 16;
 	out_be64(&spu->priv1->int_route_RW, route);
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/.gitignore b/arch/powerpc/platforms/cell/spufs/.gitignore
new file mode 100644
index 0000000..a09ee8d
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/.gitignore
@@ -0,0 +1,2 @@
+spu_save_dump.h
+spu_restore_dump.h
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 0c6a96b..8c40e27 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -60,7 +60,6 @@ static int spufs_dump_write(struct file *file, const void *addr, int nr, loff_t
 		return -EIO;
 
 	written = file->f_op->write(file, addr, nr, &file->f_pos);
-	*foffset += written;
 
 	if (written != nr)
 		return -EIO;
@@ -133,8 +132,6 @@ static struct spu_context *coredump_next_context(int *fd)
 		if (ctx->flags & SPU_CREATE_NOSCHED)
 			continue;
 
-		/* start searching the next fd next time we're called */
-		(*fd)++;
 		break;
 	}
 
@@ -157,6 +154,9 @@ int spufs_coredump_extra_notes_size(void)
 			break;
 
 		size += rc;
+
+		/* start searching the next fd next time */
+		fd++;
 	}
 
 	return size;
@@ -239,6 +239,9 @@ int spufs_coredump_extra_notes_write(struct file *file, loff_t *foffset)
 		}
 
 		spu_release_saved(ctx);
+
+		/* start searching the next fd next time */
+		fd++;
 	}
 
 	return 0;
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index e46d300..f093a58 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -83,13 +83,18 @@ int spufs_handle_class0(struct spu_context *ctx)
 		return 0;
 
 	if (stat & CLASS0_DMA_ALIGNMENT_INTR)
-		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_DMA_ALIGNMENT);
+		spufs_handle_event(ctx, ctx->csa.class_0_dar,
+			SPE_EVENT_DMA_ALIGNMENT);
 
 	if (stat & CLASS0_INVALID_DMA_COMMAND_INTR)
-		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_INVALID_DMA);
+		spufs_handle_event(ctx, ctx->csa.class_0_dar,
+			SPE_EVENT_INVALID_DMA);
 
 	if (stat & CLASS0_SPU_ERROR_INTR)
-		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_SPE_ERROR);
+		spufs_handle_event(ctx, ctx->csa.class_0_dar,
+			SPE_EVENT_SPE_ERROR);
+
+	ctx->csa.class_0_pending = 0;
 
 	return -EIO;
 }
@@ -119,8 +124,8 @@ int spufs_handle_class1(struct spu_context *ctx)
 	 * in time, we can still expect to get the same fault
 	 * the immediately after the context restore.
 	 */
-	ea = ctx->csa.dar;
-	dsisr = ctx->csa.dsisr;
+	ea = ctx->csa.class_1_dar;
+	dsisr = ctx->csa.class_1_dsisr;
 
 	if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED)))
 		return 0;
@@ -158,7 +163,7 @@ int spufs_handle_class1(struct spu_context *ctx)
 	 * time slicing will not preempt the context while the page fault
 	 * handler is running. Context switch code removes mappings.
 	 */
-	ctx->csa.dar = ctx->csa.dsisr = 0;
+	ctx->csa.class_1_dar = ctx->csa.class_1_dsisr = 0;
 
 	/*
 	 * If we handled the fault successfully and are in runnable
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index db03afa..e173040 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1331,7 +1331,7 @@ static u64 spufs_signal1_type_get(struct spu_context *ctx)
 	return ctx->ops->signal1_type_get(ctx);
 }
 DEFINE_SPUFS_ATTRIBUTE(spufs_signal1_type, spufs_signal1_type_get,
-		       spufs_signal1_type_set, "%llu", SPU_ATTR_ACQUIRE);
+		       spufs_signal1_type_set, "%llu\n", SPU_ATTR_ACQUIRE);
 
 
 static int spufs_signal2_type_set(void *data, u64 val)
@@ -1353,7 +1353,7 @@ static u64 spufs_signal2_type_get(struct spu_context *ctx)
 	return ctx->ops->signal2_type_get(ctx);
 }
 DEFINE_SPUFS_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get,
-		       spufs_signal2_type_set, "%llu", SPU_ATTR_ACQUIRE);
+		       spufs_signal2_type_set, "%llu\n", SPU_ATTR_ACQUIRE);
 
 #if SPUFS_MMAP_4K
 static unsigned long spufs_mss_mmap_nopfn(struct vm_area_struct *vma,
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index cac69e1..d30bd89 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -11,7 +11,7 @@
 #include "spufs.h"
 
 /* interrupt-level stop callback function. */
-void spufs_stop_callback(struct spu *spu)
+void spufs_stop_callback(struct spu *spu, int irq)
 {
 	struct spu_context *ctx = spu->ctx;
 
@@ -24,9 +24,19 @@ void spufs_stop_callback(struct spu *spu)
 	 */
 	if (ctx) {
 		/* Copy exception arguments into module specific structure */
-		ctx->csa.class_0_pending = spu->class_0_pending;
-		ctx->csa.dsisr = spu->dsisr;
-		ctx->csa.dar = spu->dar;
+		switch(irq) {
+		case 0 :
+			ctx->csa.class_0_pending = spu->class_0_pending;
+			ctx->csa.class_0_dsisr = spu->class_0_dsisr;
+			ctx->csa.class_0_dar = spu->class_0_dar;
+			break;
+		case 1 :
+			ctx->csa.class_1_dsisr = spu->class_1_dsisr;
+			ctx->csa.class_1_dar = spu->class_1_dar;
+			break;
+		case 2 :
+			break;
+		}
 
 		/* ensure that the exception status has hit memory before a
 		 * thread waiting on the context's stop queue is woken */
@@ -34,11 +44,6 @@ void spufs_stop_callback(struct spu *spu)
 
 		wake_up_all(&ctx->stop_wq);
 	}
-
-	/* Clear callback arguments from spu structure */
-	spu->class_0_pending = 0;
-	spu->dsisr = 0;
-	spu->dar = 0;
 }
 
 int spu_stopped(struct spu_context *ctx, u32 *stat)
@@ -56,7 +61,11 @@ int spu_stopped(struct spu_context *ctx, u32 *stat)
 	if (!(*stat & SPU_STATUS_RUNNING) && (*stat & stopped))
 		return 1;
 
-	dsisr = ctx->csa.dsisr;
+	dsisr = ctx->csa.class_0_dsisr;
+	if (dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))
+		return 1;
+
+	dsisr = ctx->csa.class_1_dsisr;
 	if (dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))
 		return 1;
 
@@ -294,7 +303,7 @@ static int spu_process_callback(struct spu_context *ctx)
 	u32 ls_pointer, npc;
 	void __iomem *ls;
 	long spu_ret;
-	int ret, ret2;
+	int ret;
 
 	/* get syscall block from local store */
 	npc = ctx->ops->npc_read(ctx) & ~3;
@@ -316,13 +325,15 @@ static int spu_process_callback(struct spu_context *ctx)
 		if (spu_ret <= -ERESTARTSYS) {
 			ret = spu_handle_restartsys(ctx, &spu_ret, &npc);
 		}
-		ret2 = spu_acquire(ctx);
+		mutex_lock(&ctx->state_mutex);
 		if (ret == -ERESTARTSYS)
 			return ret;
-		if (ret2)
-			return -EINTR;
 	}
 
+	/* need to re-get the ls, as it may have changed when we released the
+	 * spu */
+	ls = (void __iomem *)ctx->ops->get_ls(ctx);
+
 	/* write result, jump over indirect pointer */
 	memcpy_toio(ls + ls_pointer, &spu_ret, sizeof(spu_ret));
 	ctx->ops->npc_write(ctx, npc);
@@ -339,13 +350,14 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 	if (mutex_lock_interruptible(&ctx->run_mutex))
 		return -ERESTARTSYS;
 
-	spu_enable_spu(ctx);
 	ctx->event_return = 0;
 
 	ret = spu_acquire(ctx);
 	if (ret)
 		goto out_unlock;
 
+	spu_enable_spu(ctx);
+
 	spu_update_sched_info(ctx);
 
 	ret = spu_run_init(ctx, npc);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 44374c1..24f4c43 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -138,6 +138,9 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	 * if it is timesliced or preempted.
 	 */
 	ctx->cpus_allowed = current->cpus_allowed;
+
+	/* Save the current cpu id for spu interrupt routing. */
+	ctx->last_ran = raw_smp_processor_id();
 }
 
 void spu_update_sched_info(struct spu_context *ctx)
@@ -240,7 +243,6 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	spu_unmap_mappings(ctx);
 	spu_restore(&ctx->csa, spu);
 	spu->timestamp = jiffies;
-	spu_cpu_affinity_set(spu, raw_smp_processor_id());
 	spu_switch_notify(spu, ctx);
 	ctx->state = SPU_STATE_RUNNABLE;
 
@@ -646,7 +648,8 @@ static struct spu *find_victim(struct spu_context *ctx)
 
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
-			spu_add_to_rq(victim);
+			if (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))
+				spu_add_to_rq(victim);
 
 			mutex_unlock(&victim->state_mutex);
 
@@ -960,6 +963,7 @@ void spuctx_switch_state(struct spu_context *ctx,
 	struct timespec ts;
 	struct spu *spu;
 	enum spu_utilization_state old_state;
+	int node;
 
 	ktime_get_ts(&ts);
 	curtime = timespec_to_ns(&ts);
@@ -981,6 +985,11 @@ void spuctx_switch_state(struct spu_context *ctx,
 		spu->stats.times[old_state] += delta;
 		spu->stats.util_state = new_state;
 		spu->stats.tstamp = curtime;
+		node = spu->node;
+		if (old_state == SPU_UTIL_USER)
+			atomic_dec(&cbe_spu_info[node].busy_spus);
+		if (new_state == SPU_UTIL_USER)
+			atomic_inc(&cbe_spu_info[node].busy_spus);
 	}
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 2bdb197..0fd6e36 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -97,6 +97,7 @@ struct spu_context {
 	cpumask_t cpus_allowed;
 	int policy;
 	int prio;
+	int last_ran;
 
 	/* statistics */
 	struct {
@@ -302,7 +303,7 @@ size_t spu_ibox_read(struct spu_context *ctx, u32 *data);
 /* irq callback funcs. */
 void spufs_ibox_callback(struct spu *spu);
 void spufs_wbox_callback(struct spu *spu);
-void spufs_stop_callback(struct spu *spu);
+void spufs_stop_callback(struct spu *spu, int irq);
 void spufs_mfc_callback(struct spu *spu);
 void spufs_dma_callback(struct spu *spu, int type);
 
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index e9dc7a5..3df9a36 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -132,6 +132,14 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu)
 	spu_int_mask_set(spu, 2, 0ul);
 	eieio();
 	spin_unlock_irq(&spu->register_lock);
+
+	/*
+	 * This flag needs to be set before calling synchronize_irq so
+	 * that the update will be visible to the relevant handlers
+	 * via a simple load.
+	 */
+	set_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags);
+	clear_bit(SPU_CONTEXT_FAULT_PENDING, &spu->flags);
 	synchronize_irq(spu->irqs[0]);
 	synchronize_irq(spu->irqs[1]);
 	synchronize_irq(spu->irqs[2]);
@@ -166,9 +174,8 @@ static inline void set_switch_pending(struct spu_state *csa, struct spu *spu)
 	/* Save, Step 7:
 	 * Restore, Step 5:
 	 *     Set a software context switch pending flag.
+	 *     Done above in Step 3 - disable_interrupts().
 	 */
-	set_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags);
-	mb();
 }
 
 static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
@@ -186,20 +193,21 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
 				 MFC_CNTL_SUSPEND_COMPLETE);
 		/* fall through */
 	case MFC_CNTL_SUSPEND_COMPLETE:
-		if (csa) {
+		if (csa)
 			csa->priv2.mfc_control_RW =
-				MFC_CNTL_SUSPEND_MASK |
+				in_be64(&priv2->mfc_control_RW) |
 				MFC_CNTL_SUSPEND_DMA_QUEUE;
-		}
 		break;
 	case MFC_CNTL_NORMAL_DMA_QUEUE_OPERATION:
 		out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE);
 		POLL_WHILE_FALSE((in_be64(&priv2->mfc_control_RW) &
 				  MFC_CNTL_SUSPEND_DMA_STATUS_MASK) ==
 				 MFC_CNTL_SUSPEND_COMPLETE);
-		if (csa) {
-			csa->priv2.mfc_control_RW = 0;
-		}
+		if (csa)
+			csa->priv2.mfc_control_RW =
+				in_be64(&priv2->mfc_control_RW) &
+				~MFC_CNTL_SUSPEND_DMA_QUEUE &
+				~MFC_CNTL_SUSPEND_MASK;
 		break;
 	}
 }
@@ -249,16 +257,21 @@ static inline void save_spu_status(struct spu_state *csa, struct spu *spu)
 	}
 }
 
-static inline void save_mfc_decr(struct spu_state *csa, struct spu *spu)
+static inline void save_mfc_stopped_status(struct spu_state *csa,
+		struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
+	const u64 mask = MFC_CNTL_DECREMENTER_RUNNING |
+			MFC_CNTL_DMA_QUEUES_EMPTY;
 
 	/* Save, Step 12:
 	 *     Read MFC_CNTL[Ds].  Update saved copy of
 	 *     CSA.MFC_CNTL[Ds].
+	 *
+	 * update: do the same with MFC_CNTL[Q].
 	 */
-	csa->priv2.mfc_control_RW |=
-		in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING;
+	csa->priv2.mfc_control_RW &= ~mask;
+	csa->priv2.mfc_control_RW |= in_be64(&priv2->mfc_control_RW) & mask;
 }
 
 static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu)
@@ -462,7 +475,9 @@ static inline void purge_mfc_queue(struct spu_state *csa, struct spu *spu)
 	 * Restore, Step 14.
 	 *     Write MFC_CNTL[Pc]=1 (purge queue).
 	 */
-	out_be64(&priv2->mfc_control_RW, MFC_CNTL_PURGE_DMA_REQUEST);
+	out_be64(&priv2->mfc_control_RW,
+			MFC_CNTL_PURGE_DMA_REQUEST |
+			MFC_CNTL_SUSPEND_MASK);
 	eieio();
 }
 
@@ -725,10 +740,14 @@ static inline void set_switch_active(struct spu_state *csa, struct spu *spu)
 	/* Save, Step 48:
 	 * Restore, Step 23.
 	 *     Change the software context switch pending flag
-	 *     to context switch active.
+	 *     to context switch active.  This implementation does
+	 *     not use a switch active flag.
 	 *
-	 *     This implementation does not uses a switch active flag.
+	 * Now that we have saved the mfc in the csa, we can add in the
+	 * restart command if an exception occurred.
 	 */
+	if (test_bit(SPU_CONTEXT_FAULT_PENDING, &spu->flags))
+		csa->priv2.mfc_control_RW |= MFC_CNTL_RESTART_DMA_COMMAND;
 	clear_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags);
 	mb();
 }
@@ -1690,6 +1709,13 @@ static inline void restore_mfc_sr1(struct spu_state *csa, struct spu *spu)
 	eieio();
 }
 
+static inline void set_int_route(struct spu_state *csa, struct spu *spu)
+{
+	struct spu_context *ctx = spu->ctx;
+
+	spu_cpu_affinity_set(spu, ctx->last_ran);
+}
+
 static inline void restore_other_spu_access(struct spu_state *csa,
 					    struct spu *spu)
 {
@@ -1721,15 +1747,15 @@ static inline void restore_mfc_cntl(struct spu_state *csa, struct spu *spu)
 	 */
 	out_be64(&priv2->mfc_control_RW, csa->priv2.mfc_control_RW);
 	eieio();
+
 	/*
-	 * FIXME: this is to restart a DMA that we were processing
-	 *        before the save. better remember the fault information
-	 *        in the csa instead.
+	 * The queue is put back into the same state that was evident prior to
+	 * the context switch. The suspend flag is added to the saved state in
+	 * the csa, if the operational state was suspending or suspended. In
+	 * this case, the code that suspended the mfc is responsible for
+	 * continuing it. Note that SPE faults do not change the operational
+	 * state of the spu.
 	 */
-	if ((csa->priv2.mfc_control_RW & MFC_CNTL_SUSPEND_DMA_QUEUE_MASK)) {
-		out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
-		eieio();
-	}
 }
 
 static inline void enable_user_access(struct spu_state *csa, struct spu *spu)
@@ -1788,7 +1814,7 @@ static int quiece_spu(struct spu_state *prev, struct spu *spu)
 	save_spu_runcntl(prev, spu);	        /* Step 9. */
 	save_mfc_sr1(prev, spu);	        /* Step 10. */
 	save_spu_status(prev, spu);	        /* Step 11. */
-	save_mfc_decr(prev, spu);	        /* Step 12. */
+	save_mfc_stopped_status(prev, spu);     /* Step 12. */
 	halt_mfc_decr(prev, spu);	        /* Step 13. */
 	save_timebase(prev, spu);		/* Step 14. */
 	remove_other_spu_access(prev, spu);	/* Step 15. */
@@ -1815,6 +1841,7 @@ static void save_csa(struct spu_state *prev, struct spu *spu)
 	save_mfc_csr_ato(prev, spu);	/* Step 24. */
 	save_mfc_tclass_id(prev, spu);	/* Step 25. */
 	set_mfc_tclass_id(prev, spu);	/* Step 26. */
+	save_mfc_cmd(prev, spu);	/* Step 26a - moved from 44. */
 	purge_mfc_queue(prev, spu);	/* Step 27. */
 	wait_purge_complete(prev, spu);	/* Step 28. */
 	setup_mfc_sr1(prev, spu);	/* Step 30. */
@@ -1831,7 +1858,6 @@ static void save_csa(struct spu_state *prev, struct spu *spu)
 	save_ppuint_mb(prev, spu);	/* Step 41. */
 	save_ch_part1(prev, spu);	/* Step 42. */
 	save_spu_mb(prev, spu);	        /* Step 43. */
-	save_mfc_cmd(prev, spu);	/* Step 44. */
 	reset_ch(prev, spu);	        /* Step 45. */
 }
 
@@ -2000,6 +2026,7 @@ static void restore_csa(struct spu_state *next, struct spu *spu)
 	check_ppuint_mb_stat(next, spu);	/* Step 67. */
 	spu_invalidate_slbs(spu);		/* Modified Step 68. */
 	restore_mfc_sr1(next, spu);	        /* Step 69. */
+	set_int_route(next, spu);		/* NEW      */
 	restore_other_spu_access(next, spu);	/* Step 70. */
 	restore_spu_runcntl(next, spu);	        /* Step 71. */
 	restore_mfc_cntl(next, spu);	        /* Step 72. */
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 2b76090..ae93269 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -100,6 +100,7 @@
 
 /* Flag indicating progress during context switch. */
 #define SPU_CONTEXT_SWITCH_PENDING	0UL
+#define SPU_CONTEXT_FAULT_PENDING	1UL
 
 struct spu_context;
 struct spu_runqueue;
@@ -130,8 +131,8 @@ struct spu {
 	u32 isrc;				/* obsolete */
 	u32 node;
 	u64 flags;
-	u64 dar;
-	u64 dsisr;
+	u64 dar;				/* obsolete */
+	u64 dsisr;				/* obsolete */
 	size_t ls_size;
 	unsigned int slb_replace;
 	struct mm_struct *mm;
@@ -145,7 +146,9 @@ struct spu {
 
 	void (* wbox_callback)(struct spu *spu);
 	void (* ibox_callback)(struct spu *spu);
+#ifdef __GENKSYMS__
 	void (* stop_callback)(struct spu *spu);
+#endif
 	void (* mfc_callback)(struct spu *spu);
 
 	char irq_c0[8];
@@ -156,8 +159,13 @@ struct spu {
 
 	/* Additions for RHEL5U2 */
 #ifndef __GENKSYMS__
+	void (* stop_callback)(struct spu *spu, int irq);
 	enum { SPU_FREE, SPU_USED } alloc_state;
 	u64 class_0_pending_value;
+	u64 class_0_dar;
+	u64 class_0_dsisr;
+	u64 class_1_dar;
+	u64 class_1_dsisr;
 
 	pid_t tgid;
 	int has_mem_affinity;
@@ -198,6 +206,7 @@ struct cbe_spu_info {
 	struct list_head spus;
 	int n_spus;
 	int nr_active;
+	atomic_t busy_spus;
 	atomic_t reserved_spus;
 };
 
diff --git a/include/asm-powerpc/spu_csa.h b/include/asm-powerpc/spu_csa.h
index 0ab6bff..129ec14 100644
--- a/include/asm-powerpc/spu_csa.h
+++ b/include/asm-powerpc/spu_csa.h
@@ -254,7 +254,8 @@ struct spu_state {
 	u64 spu_chnldata_RW[32];
 	u32 spu_mailbox_data[4];
 	u32 pu_mailbox_data[1];
-	u64 dar, dsisr, class_0_pending;
+	u64 class_0_dar, class_0_dsisr, class_0_pending;
+	u64 class_1_dar, class_1_dsisr;
 	unsigned long suspend_time;
 	spinlock_t register_lock;
 };