From: Brad Peters <bpeters@redhat.com>
Date: Tue, 29 Jul 2008 12:21:04 -0400
Subject: [ppc] IOMMU Performance Enhancements
Message-id: 20080729162104.8384.8733.sendpatchset@squad5-lp1.lab.bos.redhat.com
O-Subject: [PATCH RHEL5.3] IOMMU Performance Enhancements
Bugzilla: 439469
RH-Acked-by: David Howells <dhowells@redhat.com>

RHBZ#:
======
https://bugzilla.redhat.com/show_bug.cgi?id=439469

Description:
===========
New Feature / Power arch only

IOMMU support for the Cell-based QS-22 was integrated in RHEL5.2, but
those patches hurt networking performance. The modifications here improve
IOMMU performance. The biggest change is an update to the IOMMU mapping,
limiting the size of the IOMMU's DMA window to 1G. This satisfies the
IOMMU requirements and also means the hash table will be on node 0.

Also included are two new fix patches:

* Fix large hash table allocation on Cell blades
* Fix cell IOMMU code to cope with empty dma-ranges, and non-PCI devices
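
For illustration only (this sketch is not part of the patch): from a
driver's point of view, the fixed mapping is selected purely by the DMA
mask the driver requests. With this change, asking for a full 64-bit mask
on a device whose firmware exposes a usable "dma-ranges" property switches
the device to pci_fixed_ops, so DMA setup becomes a constant offset add;
anything narrower keeps the existing dynamic IOMMU window. The probe
function below is hypothetical.

#include <linux/pci.h>
#include <linux/dma-mapping.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	if (pci_set_dma_mask(pdev, DMA_64BIT_MASK) == 0 &&
	    pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK) == 0) {
		/* On Cell with this patch: the device is marked
		 * use_iommu_fixed and DMAs through the 1:1 fixed window,
		 * so no per-page IOMMU mapping is needed at runtime. */
	} else if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) == 0) {
		/* 32-bit devices cannot use the fixed mapping and fall
		 * back to the normal dynamic IOMMU window. */
	} else {
		return -EIO;
	}
	return 0;
}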

RHEL Version Found:
================
RHEL 5.2

kABI Status:
============
No symbols were harmed.

Brew:
=====
Built on all platforms.
http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1412765

Upstream Status:
================
It is a backport of Michael Ellerman's Cell IOMMU fixed mapping support
from mainline (first patch at 41d824bf61b507c001868861cddda25eaab23cd7)
and the corresponding fixes
(first patch at ccd05d086f82dba2ab117dcaf4a38cbb2863a439)

The following infrastructure patches were also brought in from mainline:
* 837c54db21fc7047af75f3fe4295e32106449e34
  ([POWERPC] Add of_translate_dma_address)
* 84631f37cc405dd6dcd566f9fa4e8a3ca2f03f76
  ([POWERPC] Implement pci_set_dma_mask() in terms of the dma_ops)

Test Status:
============
I did not confirm the performance impact, but I did verify that the patch
does not affect kernel stability.

7-29-08 Brad Peters <bpeters@redhat.com>

===============================================================

Brad Peters 1-978-392-1000 x 23183
IBM on-site partner.

Proposed Patch:
===============
This patch is based on 2.6.18-99.el5
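
For reference: the patch also introduces an "iommu_fixed=" early boot
parameter; setup_iommu_fixed() below recognises only the value "off",
which disables the fixed mapping so the existing dynamic IOMMU setup is
used instead, e.g. by appending to the kernel command line:

	iommu_fixed=off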

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 4be8677..dc959f4 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -71,7 +71,8 @@ obj-$(CONFIG_PCI_MSI)		+= msi.o msi-rtas.o
 endif
 
 pci64-$(CONFIG_PPC64)		+= pci_64.o pci_dn.o pci_iommu.o \
-				   pci_direct_iommu.o ofdev_iommu.o iomap.o
+				   pci_direct_iommu.o pci_fixed_iommu.o \
+				   ofdev_iommu.o iomap.o
 pci32-$(CONFIG_PPC32)		:= pci_32.o
 obj-$(CONFIG_PCI)		+= $(pci64-y) $(pci32-y)
 kexec-$(CONFIG_PPC64)		:= machine_kexec_64.o
diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c
index 11dff1c..24336ec 100644
--- a/arch/powerpc/kernel/dma_64.c
+++ b/arch/powerpc/kernel/dma_64.c
@@ -13,12 +13,19 @@
 #include <asm/ibmebus.h>
 #include <asm/scatterlist.h>
 #include <asm/bug.h>
+#include <asm/iommu.h>
 
 static struct dma_mapping_ops *get_dma_ops(struct device *dev)
 {
 #ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type)
+	struct pci_dn *pdn;
+
+	if (dev->bus == &pci_bus_type) {
+		pdn = get_pdn(to_pci_dev(dev));
+		if (cell_use_iommu_fixed && pdn && pdn->use_iommu_fixed)
+			return &pci_fixed_ops;
 		return &pci_dma_ops;
+	}
 #endif
 #ifdef CONFIG_IBMVIO
 	if (dev->bus == &vio_bus_type)
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 6a56b45..8c34e52 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -30,6 +30,7 @@
 #include <asm/byteorder.h>
 #include <asm/machdep.h>
 #include <asm/ppc-pci.h>
+#include <asm/iommu.h>
 
 #ifdef DEBUG
 #include <asm/udbg.h>
@@ -71,6 +72,45 @@ int global_phb_number;		/* Global phb counter */
 struct pci_dev *ppc64_isabridge_dev = NULL;
 EXPORT_SYMBOL_GPL(ppc64_isabridge_dev);
 
+int pci_set_dma_mask(struct pci_dev *dev, u64 mask)
+{
+	struct pci_dn *pdn = get_pdn(dev);
+
+	if (!pci_dma_supported(dev, mask))
+		return -EIO;
+
+	dev->dma_mask = mask;
+
+	if (!cell_use_iommu_fixed)
+		return 0;
+
+	if (mask == DMA_64BIT_MASK &&
+		cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR)
+	{
+		dev_dbg(&dev->dev, "iommu: 64-bit OK, using fixed ops\n");
+		pdn->use_iommu_fixed = 1;
+	} else {
+		dev_dbg(&dev->dev, "iommu: not 64-bit, using default ops\n");
+		pdn->use_iommu_fixed = 0;
+	}
+
+	cell_pci_dma_dev_setup(dev);
+
+	return 0;
+}
+
+int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask)
+{
+	int rc = pci_set_dma_mask(dev, mask);
+
+	if (!rc)
+		return rc;
+
+	dev->dev.coherent_dma_mask = dev->dma_mask;
+
+	return 0;
+}
+
 static void fixup_broken_pcnet32(struct pci_dev* dev)
 {
 	if ((dev->class>>8 == PCI_CLASS_NETWORK_ETHERNET)) {
diff --git a/arch/powerpc/kernel/pci_fixed_iommu.c b/arch/powerpc/kernel/pci_fixed_iommu.c
new file mode 100644
index 0000000..f267574
--- /dev/null
+++ b/arch/powerpc/kernel/pci_fixed_iommu.c
@@ -0,0 +1,115 @@
+/*
+ * Essentially a modified version of Benjamin Herrenschmidt's PCI Direct
+ * IOMMU work that takes the DMA offset from the pci_dn.  Used to allow
+ * support for fixed IOMMU mapping on certain cell machines.  For 64-bit
+ * devices this avoids the performance overhead of mapping and unmapping
+ * pages at runtime.  32-bit devices are unable to use the fixed mapping.
+ *
+ * Copyright 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ * Copyright 2008 IBM Corporation, Mark Nelson
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+#include <asm/abs_addr.h>
+
+static u64 get_pci_fixed_dma_offset(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = get_pdn(pdev);
+
+	if (pdn->use_iommu_fixed)
+		return pdn->addr;
+
+	return 0;
+}
+
+static void *pci_fixed_alloc_coherent(struct device *hwdev, size_t size,
+				   dma_addr_t *dma_handle, gfp_t flag)
+{
+	void *ret;
+
+	ret = (void *)__get_free_pages(flag, get_order(size));
+	if (ret != NULL) {
+		memset(ret, 0, size);
+		*dma_handle = virt_to_abs(ret) +
+				get_pci_fixed_dma_offset(to_pci_dev(hwdev));
+	}
+	return ret;
+}
+
+static void pci_fixed_free_coherent(struct device *hwdev, size_t size,
+				 void *vaddr, dma_addr_t dma_handle)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static dma_addr_t pci_fixed_map_single(struct device *hwdev, void *ptr,
+		size_t size, enum dma_data_direction direction)
+{
+	return virt_to_abs(ptr) + get_pci_fixed_dma_offset(to_pci_dev(hwdev));
+}
+
+static void pci_fixed_unmap_single(struct device *hwdev, dma_addr_t dma_addr,
+		size_t size, enum dma_data_direction direction)
+{
+}
+
+static int pci_fixed_map_sg(struct device *hwdev, struct scatterlist *sg,
+		int nents, enum dma_data_direction direction)
+{
+	int i;
+
+	for (i = 0; i < nents; i++, sg++) {
+		sg->dma_address = page_to_phys(sg->page) + sg->offset +
+			get_pci_fixed_dma_offset(to_pci_dev(hwdev));
+		sg->dma_length = sg->length;
+	}
+
+	return nents;
+}
+
+static void pci_fixed_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+		int nents, enum dma_data_direction direction)
+{
+}
+
+static int pci_fixed_dma_supported(struct device *dev, u64 mask)
+{
+	return mask == DMA_64BIT_MASK;
+}
+
+struct dma_mapping_ops pci_fixed_ops = {
+	.alloc_coherent = pci_fixed_alloc_coherent,
+	.free_coherent = pci_fixed_free_coherent,
+	.map_single = pci_fixed_map_single,
+	.unmap_single = pci_fixed_unmap_single,
+	.map_sg = pci_fixed_map_sg,
+	.unmap_sg = pci_fixed_unmap_sg,
+	.dma_supported = pci_fixed_dma_supported,
+};
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index 2305202..b664bd4 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -266,7 +266,7 @@ static struct of_bus *of_match_bus(struct device_node *np)
 
 static int of_translate_one(struct device_node *parent, struct of_bus *bus,
 			    struct of_bus *pbus, u32 *addr,
-			    int na, int ns, int pna)
+			    int na, int ns, int pna, const char *rprop)
 {
 	u32 *ranges;
 	unsigned int rlen;
@@ -285,7 +285,7 @@ static int of_translate_one(struct device_node *parent, struct of_bus *bus,
 	 * to translate addresses that aren't supposed to be translated in
 	 * the first place. --BenH.
 	 */
-	ranges = (u32 *)get_property(parent, "ranges", &rlen);
+	ranges = (u32 *)get_property(parent, rprop, &rlen);
 	if (ranges == NULL || rlen == 0) {
 		offset = of_read_number(addr, na);
 		memset(addr, 0, pna * 4);
@@ -328,7 +328,8 @@ static int of_translate_one(struct device_node *parent, struct of_bus *bus,
  * that can be mapped to a cpu physical address). This is not really specified
  * that way, but this is traditionally the way IBM at least do things
  */
-u64 of_translate_address(struct device_node *dev, u32 *in_addr)
+u64 __of_translate_address(struct device_node *dev, u32 *in_addr,
+			   const char *rprop)
 {
 	struct device_node *parent = NULL;
 	struct of_bus *bus, *pbus;
@@ -387,7 +388,7 @@ u64 of_translate_address(struct device_node *dev, u32 *in_addr)
 		    pbus->name, pna, pns, parent->full_name);
 
 		/* Apply bus translation */
-		if (of_translate_one(dev, bus, pbus, addr, na, ns, pna))
+		if (of_translate_one(dev, bus, pbus, addr, na, ns, pna, rprop))
 			break;
 
 		/* Complete the move up one level */
@@ -403,8 +404,19 @@ u64 of_translate_address(struct device_node *dev, u32 *in_addr)
 
 	return result;
 }
+
+u64 of_translate_address(struct device_node *dev, u32 *in_addr)
+{
+	return __of_translate_address(dev, in_addr, "ranges");
+}
 EXPORT_SYMBOL(of_translate_address);
 
+u64 of_translate_dma_address(struct device_node *dev, u32 *in_addr)
+{
+	return __of_translate_address(dev, in_addr, "dma-ranges");
+}
+EXPORT_SYMBOL(of_translate_dma_address);
+
 u32 *of_get_address(struct device_node *dev, int index, u64 *size,
 		    unsigned int *flags)
 {
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c541388..6d73ae3 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -432,7 +432,7 @@ void __init htab_initialize(void)
 	unsigned long table;
 	unsigned long pteg_count;
 	unsigned long mode_rw;
-	unsigned long base = 0, size = 0;
+	unsigned long base = 0, size = 0, limit;
 	int i;
 
 	extern unsigned long tce_alloc_start, tce_alloc_end;
@@ -457,9 +457,15 @@ void __init htab_initialize(void)
 		_SDR1 = 0; 
 	} else {
 		/* Find storage for the HPT.  Must be contiguous in
-		 * the absolute address space.
+		 * the absolute address space. On cell we want it to be
+		 * in the first 2 Gig so we can use it for IOMMU hacks.
 		 */
-		table = lmb_alloc(htab_size_bytes, htab_size_bytes);
+		if (machine_is(cell))
+			limit = 0x80000000;
+		else
+			limit = 0;
+
+		table = lmb_alloc_base(htab_size_bytes, htab_size_bytes, limit);
 
 		DBG("Hash table allocated at %lx, size: %lx\n", table,
 		    htab_size_bytes);
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 5d8db84..2062560 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -110,7 +110,7 @@
 
 /* IOMMU sizing */
 #define IO_SEGMENT_SHIFT	28
-#define IO_PAGENO_BITS		(IO_SEGMENT_SHIFT - IOMMU_PAGE_SHIFT)
+#define IO_PAGENO_BITS(shift)	(IO_SEGMENT_SHIFT - (shift))
 
 /* The high bit needs to be set on every DMA address */
 #define SPIDER_DMA_OFFSET	0x80000000ul
@@ -120,7 +120,6 @@ struct iommu_window {
 	struct cbe_iommu *iommu;
 	unsigned long offset;
 	unsigned long size;
-	unsigned long pte_offset;
 	unsigned int ioid;
 	struct iommu_table table;
 };
@@ -200,7 +199,7 @@ static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
 		(window->ioid & IOPTE_IOID_Mask);
 #endif
 
-	io_pte = (unsigned long *)tbl->it_base + (index - window->pte_offset);
+	io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset);
 
 	for (i = 0; i < npages; i++, uaddr += IOMMU_PAGE_SIZE)
 		io_pte[i] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask);
@@ -232,7 +231,7 @@ static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
 		| (window->ioid & IOPTE_IOID_Mask);
 #endif
 
-	io_pte = (unsigned long *)tbl->it_base + (index - window->pte_offset);
+	io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset);
 
 	for (i = 0; i < npages; i++)
 		io_pte[i] = pte;
@@ -308,78 +307,99 @@ static int cell_iommu_find_ioc(int nid, unsigned long *base)
 	return -ENODEV;
 }
 
-static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long size)
+static void cell_iommu_setup_stab(struct cbe_iommu *iommu,
+				unsigned long dbase, unsigned long dsize,
+				unsigned long fbase, unsigned long fsize)
 {
 	struct page *page;
-	int ret, i;
-	unsigned long reg, segments, pages_per_segment, ptab_size, n_pte_pages;
-	unsigned long xlate_base;
-	unsigned int virq;
+	unsigned long segments, stab_size;
 
-	if (cell_iommu_find_ioc(iommu->nid, &xlate_base))
-		panic("%s: missing IOC register mappings for node %d\n",
-		      __FUNCTION__, iommu->nid);
+	segments = max(dbase + dsize, fbase + fsize) >> IO_SEGMENT_SHIFT;
 
-	iommu->xlate_regs = ioremap(xlate_base, IOC_Reg_Size);
-	iommu->cmd_regs = iommu->xlate_regs + IOC_IOCmd_Offset;
-
-	segments = size >> IO_SEGMENT_SHIFT;
-	pages_per_segment = 1ull << IO_PAGENO_BITS;
-
-	pr_debug("%s: iommu[%d]: segments: %lu, pages per segment: %lu\n",
-			__FUNCTION__, iommu->nid, segments, pages_per_segment);
+	pr_debug("%s: iommu[%d]: segments: %lu\n",
+			__FUNCTION__, iommu->nid, segments);
 
 	/* set up the segment table */
-	page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0);
+	stab_size = segments * sizeof(unsigned long);
+	page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(stab_size));
 	BUG_ON(!page);
 	iommu->stab = page_address(page);
-	clear_page(iommu->stab);
+	memset(iommu->stab, 0, stab_size);
+}
+
+static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu,
+		unsigned long base, unsigned long size, unsigned long gap_base,
+		unsigned long gap_size, unsigned long page_shift)
+{
+	struct page *page;
+	int i;
+	unsigned long reg, segments, pages_per_segment, ptab_size,
+		      n_pte_pages, start_seg, *ptab;
+
+	start_seg = base >> IO_SEGMENT_SHIFT;
+	segments  = size >> IO_SEGMENT_SHIFT;
+	pages_per_segment = 1ull << IO_PAGENO_BITS(page_shift);
+	/* PTEs for each segment must start on a 4K bounday */
+	pages_per_segment = max(pages_per_segment,
+				(1 << 12) / sizeof(unsigned long));
 
-	/* ... and the page tables. Since these are contiguous, we can treat
-	 * the page tables as one array of ptes, like pSeries does.
-	 */
 	ptab_size = segments * pages_per_segment * sizeof(unsigned long);
 	pr_debug("%s: iommu[%d]: ptab_size: %lu, order: %d\n", __FUNCTION__,
 			iommu->nid, ptab_size, get_order(ptab_size));
 	page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(ptab_size));
 	BUG_ON(!page);
 
-	iommu->ptab = page_address(page);
-	memset(iommu->ptab, 0, ptab_size);
+	ptab = page_address(page);
+	memset(ptab, 0, ptab_size);
 
-	/* allocate a bogus page for the end of each mapping */
-	page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0);
-	BUG_ON(!page);
-	iommu->pad_page = page_address(page);
-	clear_page(iommu->pad_page);
-
-	/* number of pages needed for a page table */
-	n_pte_pages = (pages_per_segment *
-		       sizeof(unsigned long)) >> IOMMU_PAGE_SHIFT;
+	/* number of 4K pages needed for a page table */
+	n_pte_pages = (pages_per_segment * sizeof(unsigned long)) >> 12;
 
 	pr_debug("%s: iommu[%d]: stab at %p, ptab at %p, n_pte_pages: %lu\n",
-			__FUNCTION__, iommu->nid, iommu->stab, iommu->ptab,
+			__FUNCTION__, iommu->nid, iommu->stab, ptab,
 			n_pte_pages);
 
 	/* initialise the STEs */
 	reg = IOSTE_V | ((n_pte_pages - 1) << 5);
 
-	if (IOMMU_PAGE_SIZE == 0x1000)
-		reg |= IOSTE_PS_4K;
-	else if (IOMMU_PAGE_SIZE == 0x10000)
-		reg |= IOSTE_PS_64K;
-	else {
-		extern void __unknown_page_size_error(void);
-		__unknown_page_size_error();
+	switch (page_shift) {
+	case 12: reg |= IOSTE_PS_4K;  break;
+	case 16: reg |= IOSTE_PS_64K; break;
+	case 20: reg |= IOSTE_PS_1M;  break;
+	case 24: reg |= IOSTE_PS_16M; break;
+	default: BUG();
 	}
 
+	gap_base = gap_base >> IO_SEGMENT_SHIFT;
+	gap_size = gap_size >> IO_SEGMENT_SHIFT;
+
 	pr_debug("Setting up IOMMU stab:\n");
-	for (i = 0; i * (1ul << IO_SEGMENT_SHIFT) < size; i++) {
-		iommu->stab[i] = reg |
-			(__pa(iommu->ptab) + n_pte_pages * IOMMU_PAGE_SIZE * i);
+	for (i = start_seg; i < (start_seg + segments); i++) {
+		if (i >= gap_base && i < (gap_base + gap_size)) {
+			pr_debug("\toverlap at %d, skipping\n", i);
+			continue;
+		}
+		iommu->stab[i] = reg | (__pa(ptab) + (n_pte_pages << 12) *
+					(i - start_seg));
 		pr_debug("\t[%d] 0x%016lx\n", i, iommu->stab[i]);
 	}
 
+	return ptab;
+}
+
+static void cell_iommu_enable_hardware(struct cbe_iommu *iommu)
+{
+	int ret;
+	unsigned long reg, xlate_base;
+	unsigned int virq;
+
+	if (cell_iommu_find_ioc(iommu->nid, &xlate_base))
+		panic("%s: missing IOC register mappings for node %d\n",
+		      __FUNCTION__, iommu->nid);
+
+	iommu->xlate_regs = ioremap(xlate_base, IOC_Reg_Size);
+	iommu->cmd_regs = iommu->xlate_regs + IOC_IOCmd_Offset;
+
 	/* ensure that the STEs have updated */
 	mb();
 
@@ -408,6 +428,15 @@ static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long siz
 	out_be64(iommu->cmd_regs + IOC_IOCmd_Cfg, reg);
 }
 
+static void cell_iommu_setup_hardware(struct cbe_iommu *iommu,
+	unsigned long base, unsigned long size)
+{
+	cell_iommu_setup_stab(iommu, base, size, 0, 0);
+	iommu->ptab = cell_iommu_alloc_ptab(iommu, base, size, 0, 0,
+					    IOMMU_PAGE_SHIFT);
+	cell_iommu_enable_hardware(iommu);
+}
+
 #if 0/* Unused for now */
 static struct iommu_window *find_window(struct cbe_iommu *iommu,
 		unsigned long offset, unsigned long size)
@@ -425,33 +454,43 @@ static struct iommu_window *find_window(struct cbe_iommu *iommu,
 }
 #endif
 
+static inline u32 cell_iommu_get_ioid(struct device_node *np)
+{
+	const u32 *ioid;
+
+	ioid = (unsigned int *)get_property(np, "ioid", NULL);
+	if (ioid == NULL) {
+		printk(KERN_WARNING "iommu: missing ioid for %s using 0\n",
+		       np->full_name);
+		return 0;
+	}
+
+	return *ioid;
+}
+
 static struct iommu_window * __init
 cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 			unsigned long offset, unsigned long size,
 			unsigned long pte_offset)
 {
 	struct iommu_window *window;
-	const unsigned int *ioid;
+	struct page *page;
+	u32 ioid;
 
-	ioid = (unsigned int *)get_property(np, "ioid", NULL);
-	if (ioid == NULL)
-		printk(KERN_WARNING "iommu: missing ioid for %s using 0\n",
-		       np->full_name);
+	ioid = cell_iommu_get_ioid(np);
 
 	window = kmalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid);
 	BUG_ON(window == NULL);
 
 	window->offset = offset;
 	window->size = size;
-	window->ioid = ioid ? *ioid : 0;
+	window->ioid = ioid;
 	window->iommu = iommu;
-	window->pte_offset = pte_offset;
 
 	window->table.it_blocksize = 16;
 	window->table.it_base = (unsigned long)iommu->ptab;
 	window->table.it_index = iommu->nid;
-	window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT) +
-		window->pte_offset;
+	window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT) + pte_offset;
 	window->table.it_size = size >> IOMMU_PAGE_SHIFT;
 
 	iommu_init_table(&window->table, iommu->nid);
@@ -474,6 +513,11 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	 * This code also assumes that we have a window that starts at 0,
 	 * which is the case on all spider based blades.
 	 */
+	page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0);
+	BUG_ON(!page);
+	iommu->pad_page = page_address(page);
+	clear_page(iommu->pad_page);
+
 	__set_bit(0, window->table.it_map);
 	tce_build_cell(&window->table, window->table.it_offset, 1,
 		       (unsigned long)iommu->pad_page, DMA_TO_DEVICE);
@@ -493,7 +537,11 @@ static struct cbe_iommu *cell_iommu_for_node(int nid)
 	return NULL;
 }
 
-static void cell_pci_dma_dev_setup(struct pci_dev *dev)
+static unsigned long dma_iommu_fixed_base;
+
+int cell_use_iommu_fixed;
+
+static void cell_pci_dma_dev_setup_iommu(struct pci_dev *dev)
 {
 	struct device_node *dn, *mydn;
 
@@ -504,6 +552,18 @@ static void cell_pci_dma_dev_setup(struct pci_dev *dev)
 		PCI_DN(mydn)->iommu_table = PCI_DN(dn)->iommu_table;
 }
 
+static void cell_pci_dma_dev_setup_fixed(struct pci_dev *dev);
+
+void cell_pci_dma_dev_setup(struct pci_dev *dev)
+{
+	struct pci_dn *pdn = get_pdn(dev);
+
+	if (pdn && pdn->use_iommu_fixed)
+		cell_pci_dma_dev_setup_fixed(dev);
+	else
+		cell_pci_dma_dev_setup_iommu(dev);
+}
+
 static void cell_pci_dma_bus_setup(struct pci_bus *bus)
 {
 	struct iommu_window *window;
@@ -565,10 +625,9 @@ static int __init cell_iommu_get_window(struct device_node *np,
 	return 0;
 }
 
-static void __init cell_iommu_init_one(struct device_node *np, unsigned long offset)
+static struct cbe_iommu * __init cell_iommu_alloc(struct device_node *np)
 {
 	struct cbe_iommu *iommu;
-	unsigned long base, size;
 	int nid, i;
 
 	/* Get node ID */
@@ -576,7 +635,7 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 	if (nid < 0) {
 		printk(KERN_ERR "iommu: failed to get node for %s\n",
 		       np->full_name);
-		return;
+		return NULL;
 	}
 	pr_debug("iommu: setting up iommu for node %d (%s)\n",
 		 nid, np->full_name);
@@ -592,7 +651,7 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 	if (cbe_nr_iommus >= NR_IOMMUS) {
 		printk(KERN_ERR "iommu: too many IOMMUs detected ! (%s)\n",
 		       np->full_name);
-		return;
+		return NULL;
 	}
 
 	/* Init base fields */
@@ -603,6 +662,19 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 	snprintf(iommu->name, sizeof(iommu->name), "iommu%d", i);
 	INIT_LIST_HEAD(&iommu->windows);
 
+	return iommu;
+}
+
+static void __init cell_iommu_init_one(struct device_node *np,
+				       unsigned long offset)
+{
+	struct cbe_iommu *iommu;
+	unsigned long base, size;
+
+	iommu = cell_iommu_alloc(np);
+	if (!iommu)
+		return;
+
 	/* Obtain a window for it */
 	cell_iommu_get_window(np, &base, &size);
 
@@ -610,7 +682,7 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 		 base, base + size - 1);
 
 	/* Initialize the hardware */
-	cell_iommu_setup_hardware(iommu, size);
+	cell_iommu_setup_hardware(iommu, base, size);
 
 	/* Setup the iommu_table */
 	cell_iommu_setup_window(iommu, np, base, size,
@@ -706,6 +778,264 @@ static int __init cell_iommu_init_disabled(void)
 	return 0;
 }
 
+/*
+ *  Fixed IOMMU mapping support
+ *
+ *  This code adds support for setting up a fixed IOMMU mapping on certain
+ *  cell machines. For 64-bit devices this avoids the performance overhead of
+ *  mapping and unmapping pages at runtime. 32-bit devices are unable to use
+ *  the fixed mapping.
+ *
+ *  The fixed mapping is established at boot, and maps all of physical memory
+ *  1:1 into device space at some offset. On machines with < 30 GB of memory
+ *  we setup the fixed mapping immediately above the normal IOMMU window.
+ *
+ *  For example a machine with 4GB of memory would end up with the normal
+ *  IOMMU window from 0-2GB and the fixed mapping window from 2GB to 6GB. In
+ *  this case a 64-bit device wishing to DMA to 1GB would be told to DMA to
+ *  3GB, plus any offset required by firmware. The firmware offset is encoded
+ *  in the "dma-ranges" property.
+ *
+ *  On machines with 30GB or more of memory, we are unable to place the fixed
+ *  mapping above the normal IOMMU window as we would run out of address space.
+ *  Instead we move the normal IOMMU window to coincide with the hash page
+ *  table, this region does not need to be part of the fixed mapping as no
+ *  device should ever be DMA'ing to it. We then setup the fixed mapping
+ *  from 0 to 32GB.
+ */
+
+u64 cell_iommu_get_fixed_address(struct pci_dev *dev)
+{
+	u64 cpu_addr, size, best_size, dev_addr = OF_BAD_ADDR;
+	struct device_node *np, *tmp;
+	u32 *ranges = NULL;
+	int i, len, best, naddr, nsize, pna, range_size;
+
+	np = of_node_get(pci_device_to_OF_node(dev));
+	while (1) {
+		naddr = prom_n_addr_cells(np);
+		nsize = prom_n_size_cells(np);
+
+		tmp = of_get_parent(np);
+		of_node_put(np);
+		np = tmp;
+
+		if (!np)
+			break;
+
+		ranges = (u32 *)get_property(np, "dma-ranges", &len);
+
+		/* Ignore empty ranges, they imply no translation required */
+		if (ranges && len > 0)
+			break;
+	}
+
+	if (!ranges) {
+		dev_dbg(&dev->dev, "iommu: no dma-ranges found\n");
+		goto out;
+	}
+
+	len /= sizeof(u32);
+
+	pna = prom_n_addr_cells(np);
+	range_size = naddr + nsize + pna;
+
+	/* dma-ranges format:
+	 * child addr	: naddr cells
+	 * parent addr	: pna cells
+	 * size		: nsize cells
+	 */
+	for (i = 0, best = -1, best_size = 0; i < len; i += range_size) {
+		cpu_addr = of_translate_dma_address(np, ranges + i + naddr);
+		size = of_read_number(ranges + i + naddr + pna, nsize);
+
+		if (cpu_addr == 0 && size > best_size) {
+			best = i;
+			best_size = size;
+		}
+	}
+
+	if (best >= 0) {
+		dev_addr = of_read_number(ranges + best, naddr);
+	} else
+		dev_dbg(&dev->dev, "iommu: no suitable range found!\n");
+
+out:
+	of_node_put(np);
+
+	return dev_addr;
+}
+
+static void cell_pci_dma_dev_setup_fixed(struct pci_dev *dev)
+{
+	struct pci_dn *pdn = get_pdn(dev);
+
+	pdn->addr = cell_iommu_get_fixed_address(dev) + dma_iommu_fixed_base;
+
+	dev_dbg(&dev->dev, "iommu: fixed addr = %lx\n", pdn->addr);
+}
+
+static void insert_16M_pte(unsigned long addr, unsigned long *ptab,
+			   unsigned long base_pte)
+{
+	unsigned long segment, offset;
+
+	segment = addr >> IO_SEGMENT_SHIFT;
+	offset = (addr >> 24) - (segment << IO_PAGENO_BITS(24));
+	ptab = ptab + (segment * (1 << 12) / sizeof(unsigned long));
+
+	pr_debug("iommu: addr %lx ptab %p segment %lx offset %lx\n",
+		  addr, ptab, segment, offset);
+
+	ptab[offset] = base_pte | (__pa(addr) & IOPTE_RPN_Mask);
+}
+
+static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu,
+	struct device_node *np, unsigned long dbase, unsigned long dsize,
+	unsigned long fbase, unsigned long fsize)
+{
+	unsigned long base_pte, uaddr, ioaddr, *ptab;
+
+	ptab = cell_iommu_alloc_ptab(iommu, fbase, fsize, dbase, dsize, 24);
+
+	dma_iommu_fixed_base = fbase;
+
+	pr_debug("iommu: mapping 0x%lx pages from 0x%lx\n", fsize, fbase);
+
+	base_pte = IOPTE_PP_W | IOPTE_PP_R | IOPTE_M | IOPTE_SO_RW
+		    | (cell_iommu_get_ioid(np) & IOPTE_IOID_Mask);
+
+	for (uaddr = 0; uaddr < fsize; uaddr += (1 << 24)) {
+		/* Don't touch the dynamic region */
+		ioaddr = uaddr + fbase;
+		if (ioaddr >= dbase && ioaddr < (dbase + dsize)) {
+			pr_debug("iommu: fixed/dynamic overlap, skipping\n");
+			continue;
+		}
+
+		insert_16M_pte(uaddr, ptab, base_pte);
+	}
+
+	mb();
+}
+
+static int __init cell_iommu_fixed_mapping_init(void)
+{
+	unsigned long dbase, dsize, fbase, fsize, hbase, hend;
+	struct cbe_iommu *iommu;
+	struct device_node *np;
+
+	/* The fixed mapping is only supported on axon machines */
+	np = of_find_node_by_name(NULL, "axon");
+	if (!np) {
+		pr_debug("iommu: fixed mapping disabled, no axons found\n");
+		return -1;
+	}
+
+	/* We must have dma-ranges properties for fixed mapping to work */
+	for (np = NULL; (np = of_find_all_nodes(np));) {
+		if (of_find_property(np, "dma-ranges", NULL))
+			break;
+	}
+	of_node_put(np);
+
+	if (!np) {
+		pr_debug("iommu: no dma-ranges found, no fixed mapping\n");
+		return -1;
+	}
+
+	/* The default setup is to have the fixed mapping sit after the
+	 * dynamic region, so find the top of the largest IOMMU window
+	 * on any axon, then add the size of RAM and that's our max value.
+	 * If that is > 32GB we have to do other shennanigans.
+	 */
+	fbase = 0;
+	for_each_node_by_name(np, "axon") {
+		cell_iommu_get_window(np, &dbase, &dsize);
+		fbase = max(fbase, dbase + dsize);
+	}
+
+	fbase = _ALIGN_UP(fbase, 1 << IO_SEGMENT_SHIFT);
+	fsize = lmb_phys_mem_size();
+
+	if ((fbase + fsize) <= 0x800000000)
+		hbase = 0; /* use the device tree window */
+	else {
+		/* If we're over 32 GB we need to cheat. We can't map all of
+		 * RAM with the fixed mapping, and also fit the dynamic
+		 * region. So try to place the dynamic region where the hash
+		 * table sits, drivers never need to DMA to it, we don't
+		 * need a fixed mapping for that area.
+		 */
+		if (!htab_address) {
+			pr_debug("iommu: htab is NULL, on LPAR? Huh?\n");
+			return -1;
+		}
+		hbase = __pa(htab_address);
+		hend  = hbase + htab_size_bytes;
+
+		/* The window must start and end on a segment boundary */
+		if ((hbase != _ALIGN_UP(hbase, 1 << IO_SEGMENT_SHIFT)) ||
+		    (hend != _ALIGN_UP(hend, 1 << IO_SEGMENT_SHIFT))) {
+			pr_debug("iommu: hash window not segment aligned\n");
+			return -1;
+		}
+
+		/* Check the hash window fits inside the real DMA window */
+		for_each_node_by_name(np, "axon") {
+			cell_iommu_get_window(np, &dbase, &dsize);
+
+			if (hbase < dbase || (hend > (dbase + dsize))) {
+				pr_debug("iommu: hash window doesn't fit in"
+					 "real DMA window\n");
+				return -1;
+			}
+		}
+
+		fbase = 0;
+	}
+
+	/* Setup the dynamic regions */
+	for_each_node_by_name(np, "axon") {
+		iommu = cell_iommu_alloc(np);
+		BUG_ON(!iommu);
+
+		if (hbase == 0)
+			cell_iommu_get_window(np, &dbase, &dsize);
+		else {
+			dbase = hbase;
+			dsize = htab_size_bytes;
+		}
+
+		printk(KERN_DEBUG "iommu: node %d, dynamic window 0x%lx-0x%lx "
+			"fixed window 0x%lx-0x%lx\n", iommu->nid, dbase,
+			 dbase + dsize, fbase, fbase + fsize);
+
+		cell_iommu_setup_stab(iommu, dbase, dsize, fbase, fsize);
+		iommu->ptab = cell_iommu_alloc_ptab(iommu, dbase, dsize, 0, 0,
+						    IOMMU_PAGE_SHIFT);
+		cell_iommu_setup_fixed_ptab(iommu, np, dbase, dsize,
+					     fbase, fsize);
+		cell_iommu_enable_hardware(iommu);
+		cell_iommu_setup_window(iommu, np, dbase, dsize, 0);
+	}
+
+	cell_use_iommu_fixed = 1;
+
+	return 0;
+}
+
+static int iommu_fixed_disabled;
+
+static int __init setup_iommu_fixed(char *str)
+{
+	if (strcmp(str, "off") == 0)
+		iommu_fixed_disabled = 1;
+
+	return 1;
+}
+__setup("iommu_fixed=", setup_iommu_fixed);
+
 static int __init cell_iommu_init(void)
 {
 	struct device_node *np;
@@ -730,6 +1060,9 @@ static int __init cell_iommu_init(void)
 	ppc_md.iommu_bus_setup = cell_pci_dma_bus_setup;
 	iommu_setup_of_dev = cell_of_dma_dev_setup;
 
+	if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
+		goto out;
+
 	/* Create an iommu for each /axon node.  */
 	for_each_node_by_name(np, "axon") {
 		if (np->parent == NULL || np->parent->parent != NULL)
@@ -746,6 +1079,7 @@ static int __init cell_iommu_init(void)
 		cell_iommu_init_one(np, SPIDER_DMA_OFFSET);
 	}
 
+out:
 	/* Setup default PCI iommu ops */
 	pci_iommu_init();
 	ofdev_iommu_init();
diff --git a/include/asm-powerpc/dma.h b/include/asm-powerpc/dma.h
index 7a4374b..42ed29a 100644
--- a/include/asm-powerpc/dma.h
+++ b/include/asm-powerpc/dma.h
@@ -372,6 +372,9 @@ static __inline__ int get_dma_residue(unsigned int dmanr)
 	    ? count : (count << 1);
 }
 
+/* We have our own implementation of pci_set_dma_mask() */
+#define HAVE_ARCH_PCI_SET_DMA_MASK
+
 /* These are in kernel/dma.c: */
 
 /* reserve a DMA channel */
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index 7707763..13fa619 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -111,7 +111,11 @@ extern void iommu_init_early_dart(void);
 #ifdef CONFIG_PCI
 extern void pci_iommu_init(void);
 extern void pci_direct_iommu_init(void);
+extern struct dma_mapping_ops pci_fixed_ops;
 extern unsigned long pci_direct_dma_offset;
+extern int cell_use_iommu_fixed;
+extern u64 cell_iommu_get_fixed_address(struct pci_dev *dev);
+extern void cell_pci_dma_dev_setup(struct pci_dev *dev);
 #else
 static inline void pci_iommu_init(void) { }
 #endif
diff --git a/include/asm-powerpc/msi.h b/include/asm-powerpc/msi.h
index 0da8155..7426cac 100644
--- a/include/asm-powerpc/msi.h
+++ b/include/asm-powerpc/msi.h
@@ -58,6 +58,4 @@ static inline int irq_has_action(unsigned int irq)
 	return desc->action != NULL;
 }
 
-extern struct pci_dn *get_pdn(struct pci_dev *pdev);
-
 #endif /* _POWERPC_MSI_H */
diff --git a/include/asm-powerpc/pci-bridge.h b/include/asm-powerpc/pci-bridge.h
index 5df62d0..e75a3c7 100644
--- a/include/asm-powerpc/pci-bridge.h
+++ b/include/asm-powerpc/pci-bridge.h
@@ -83,6 +83,11 @@ struct pci_dn {
 #ifdef CONFIG_PCI_MSI
 	struct list_head msi_list;
 #endif
+#ifdef CONFIG_PPC_CELL_NATIVE
+	int	use_iommu_fixed;
+	u64	addr;			/* address to DMA to when using the
+					   Cell's IOMMU fixed mapping */
+#endif
 };
 
 /* Get the pointer to a device_node's pci_dn */
diff --git a/include/asm-powerpc/pci.h b/include/asm-powerpc/pci.h
index 46afd29..03969e2 100644
--- a/include/asm-powerpc/pci.h
+++ b/include/asm-powerpc/pci.h
@@ -238,6 +238,8 @@ extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 					 unsigned long size,
 					 pgprot_t prot);
 
+extern struct pci_dn *get_pdn(struct pci_dev *pdev);
+
 #if defined(CONFIG_PPC_MULTIPLATFORM) || defined(CONFIG_PPC32)
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
 extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
diff --git a/include/asm-powerpc/prom.h b/include/asm-powerpc/prom.h
index e26feb2..ff4d156 100644
--- a/include/asm-powerpc/prom.h
+++ b/include/asm-powerpc/prom.h
@@ -214,6 +214,9 @@ static inline u64 of_read_number(u32 *cell, int size)
 #define OF_BAD_ADDR	((u64)-1)
 extern u64 of_translate_address(struct device_node *np, u32 *addr);
 
+/* Translate a DMA address from device space to CPU space */
+extern u64 of_translate_dma_address(struct device_node *dev, u32 *in_addr);
+
 /* Extract an address from a device, returns the region size and
  * the address space flags too. The PCI version uses a BAR number
  * instead of an absolute index