kernel-2.6.18-194.11.1.el5.src.rpm

From: Doug Ledford <dledford@redhat.com>
Date: Tue, 14 Apr 2009 15:23:33 -0400
Subject: [openib] ipath: update driver to OFED 1.4.1-rc3
Message-id: 1239737023-31222-7-git-send-email-dledford@redhat.com
O-Subject: [Patch RHEL5.4 06/16] [ipath] update driver to OFED 1.4.1-rc3 version
Bugzilla: 230035 480696

Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig
index 2556762..3c7968f 100644
--- a/drivers/infiniband/hw/ipath/Kconfig
+++ b/drivers/infiniband/hw/ipath/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_IPATH
 	tristate "QLogic InfiniPath Driver"
-	depends on PCI_MSI && 64BIT && NET
+	depends on 64BIT && NET
 	---help---
 	This is a driver for QLogic InfiniPath host channel adapters,
 	including InfiniBand verbs support.  This driver allows these
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile
index f19ef92..e3109fb 100644
--- a/drivers/infiniband/hw/ipath/Makefile
+++ b/drivers/infiniband/hw/ipath/Makefile
@@ -26,15 +26,19 @@ ib_ipath-y := \
 	ipath_sysfs.o \
 	ipath_uc.o \
 	ipath_ud.o \
+	ipath_wc_pat.o \
 	ipath_user_pages.o \
 	ipath_user_sdma.o \
 	ipath_verbs_mcast.o \
-	ipath_verbs.o
+	ipath_verbs.o \
+	ipath_iba7220.o \
+	ipath_sd7220.o \
+	ipath_sd7220_img.o
 
 ib_ipath-y += ipath_iba6110.o
 ib_ipath-$(CONFIG_PCI_MSI) += ipath_iba6120.o
-ib_ipath-$(CONFIG_PCI_MSI) += ipath_iba7220.o ipath_sd7220.o ipath_sd7220_img.o
 
-ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
+ib_ipath-$(CONFIG_X86_64) += iowrite32_copy_x86_64.o
 ib_ipath-$(CONFIG_X86_64) += memcpy_cachebypass_x86_64.o
+ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
 ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
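
Two things happen in these build hunks: the driver as a whole no longer
requires CONFIG_PCI_MSI (the 6120 objects stay gated on it, but the 7220
objects and the new WC-PAT helper now build unconditionally), and MSI
becomes a runtime preference rather than a compile-time one. That works
because on kernels built without CONFIG_PCI_MSI, pci_enable_msi() is a
stub that simply fails, so code shaped like the following sketch
(illustrative helper name, not the driver's) degrades cleanly to INTx:

	#include <linux/pci.h>

	/* MSI as a runtime preference with INTx fallback (sketch). */
	static int setup_interrupt(struct pci_dev *pdev)
	{
		if (pci_enable_msi(pdev))	/* stub fails without MSI */
			dev_info(&pdev->dev, "MSI unavailable, using INTx\n");
		return pdev->irq;		/* valid in either mode */
	}
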
diff --git a/drivers/infiniband/hw/ipath/iowrite32_copy_x86_64.S b/drivers/infiniband/hw/ipath/iowrite32_copy_x86_64.S
new file mode 100644
index 0000000..6c659cf
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/iowrite32_copy_x86_64.S
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * __iowrite32_copy - copy a memory block using dword multiple writes
+ *
+ * This is primarily for writing to the InfiniPath PIO buffers, which
+ * only support dword multiple writes, and thus can not use memcpy().
+ * For this reason, we use nothing smaller than dword writes.
+ * It is also used as a fast copy routine in some places that have been
+ * measured to win over memcpy, and the performance delta matters.
+ *
+ * Count is number of dwords; might not be a qword multiple.
+ */
+
+	.globl __iowrite32_copy
+	.p2align 4
+/* rdi	destination, rsi source, rdx count */
+__iowrite32_copy:
+	movl %edx,%ecx
+	shrl $1,%ecx
+	andl $1,%edx
+	rep
+	movsq
+	movl %edx,%ecx
+	rep
+	movsd
+	ret
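
For readers who do not speak AT&T assembly: the routine halves the dword
count, copies that many qwords with rep movsq, then copies one trailing
dword with movsd (the string instruction, not the SSE one) when the
count was odd. A user-space C sketch of the same copy:

	#include <stddef.h>
	#include <stdint.h>

	/* Dword-granular copy: pairs move as 64-bit stores, plus one
	 * trailing 32-bit store if count is odd. */
	static void iowrite32_copy_sketch(uint32_t *dst,
					  const uint32_t *src, size_t count)
	{
		uint64_t *d64 = (uint64_t *) dst;
		const uint64_t *s64 = (const uint64_t *) src;
		size_t i, qwords = count >> 1;	/* shrl $1,%ecx */

		for (i = 0; i < qwords; i++)	/* rep movsq */
			d64[i] = s64[i];
		if (count & 1)			/* andl $1,%edx; movsd */
			dst[count - 1] = src[count - 1];
	}
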
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h
index 2fa012d..28cfe97 100644
--- a/drivers/infiniband/hw/ipath/ipath_common.h
+++ b/drivers/infiniband/hw/ipath/ipath_common.h
@@ -201,7 +201,6 @@ typedef enum _ipath_ureg {
 #define IPATH_RUNTIME_RCVHDR_COPY	0x8
 #define IPATH_RUNTIME_MASTER	0x10
 #define IPATH_RUNTIME_NODMA_RTAIL 0x80
-#define IPATH_RUNTIME_SPECIAL_TRIGGER 0x100
 #define IPATH_RUNTIME_SDMA	      0x200
 #define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
 #define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
@@ -452,8 +451,6 @@ struct ipath_user_info {
 #define IPATH_CMD_SDMA_INFLIGHT 31	/* sdma inflight counter request */
 #define IPATH_CMD_SDMA_COMPLETE 32	/* sdma completion counter request */
 
-#define IPATH_CMD_MAX		31
-
 /*
  * Poll types
  */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
index 29d1a82..261bf85 100644
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -82,7 +82,7 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
 		wc->uqueue[head].opcode = entry->opcode;
 		wc->uqueue[head].vendor_err = entry->vendor_err;
 		wc->uqueue[head].byte_len = entry->byte_len;
-		wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data;
+		wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
 		wc->uqueue[head].qp_num = entry->qp->qp_num;
 		wc->uqueue[head].src_qp = entry->src_qp;
 		wc->uqueue[head].wc_flags = entry->wc_flags;
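
The rename here tracks an upstream verbs change rather than anything
ipath-specific: struct ib_wc moved its immediate data into a union so
the same slot can also carry an invalidate rkey, added with the memory
management extensions. An abridged sketch of the affected member (not
the full mainline struct):

	struct ib_wc {
		/* ... */
		union {
			__be32	imm_data;	/* formerly a plain member */
			u32	invalidate_rkey;
		} ex;
		/* ... */
	};
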
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
index f4eaf13..6d49d2f 100644
--- a/drivers/infiniband/hw/ipath/ipath_diag.c
+++ b/drivers/infiniband/hw/ipath/ipath_diag.c
@@ -403,7 +403,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp,
 		goto bail;
 	}
 	/*
-	 * - Want to skip check for l_state if using custom PBC,
+	 * Want to skip check for l_state if using custom PBC,
 	 * because we might be trying to force an SM packet out.
 	 * first-cut, skip _all_ state checking in that case.
 	 */
@@ -476,13 +476,6 @@ static ssize_t ipath_diagpkt_write(struct file *fp,
 	} else
 		__iowrite32_copy(piobuf + 2, tmpbuf, clen);
 
-	if (dd->ipath_flags & IPATH_USE_SPCL_TRIG) {
-		u32 spcl_off = (pbufn > dd->ipath_piobcnt2k) ?
-			2047 : 1023;
-		ipath_flush_wc();
-		__raw_writel(0xaebecede, piobuf + spcl_off);
-	}
-
 	ipath_flush_wc();
 
 	ret = sizeof(dp);
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
index b91d67e..39c3123 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
  * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -39,6 +39,7 @@
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
 
+#include "ipath_wc_pat.h"
 #include "ipath_kernel.h"
 #include "ipath_verbs.h"
 
@@ -83,11 +84,6 @@ module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
 MODULE_PARM_DESC(hol_timeout_ms,
 	"duration of user app suspension after link failure");
 
-unsigned ipath_sdma_fetch_arb = 1;
-EXPORT_SYMBOL_GPL(ipath_sdma_fetch_arb);
-module_param_named(fetch_arb, ipath_sdma_fetch_arb, uint, S_IRUGO);
-MODULE_PARM_DESC(fetch_arb, "IBA7220: change SDMA descriptor arbitration");
-
 unsigned ipath_linkrecovery = 1;
 module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
 MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
@@ -160,23 +156,6 @@ static struct pci_driver ipath_driver = {
 	.id_table = ipath_pci_tbl,
 };
 
-static void ipath_check_status(struct work_struct *work)
-{
-	struct ipath_devdata *dd = container_of(work, struct ipath_devdata,
-						status_work.work);
-
-	/*
-	 * If we're in the NOCABLE state, try again in another minute.
-	 */
-	if (*dd->ipath_statusp & IPATH_STATUS_IB_NOCABLE) {
-		schedule_delayed_work(&dd->status_work, HZ * STATUS_TIMEOUT);
-		return;
-	}
-
-	if (!(*dd->ipath_statusp & IPATH_STATUS_IB_READY))
-		dev_info(&dd->pcidev->dev, "IB link is not ACTIVE\n");
-}
-
 static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
 			     u32 *bar0, u32 *bar1)
 {
@@ -244,8 +223,6 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
 	dd->pcidev = pdev;
 	pci_set_drvdata(pdev, dd);
 
-	INIT_DELAYED_WORK(&dd->status_work, ipath_check_status);
-
 	list_add(&dd->ipath_list, &ipath_dev_list);
 
 bail_unlock:
@@ -378,8 +355,8 @@ static void ipath_verify_pioperf(struct ipath_devdata *dd)
 	 * length 0, no dwords actually sent, and mark as VL15
 	 * on chips where that may matter (due to IB flowcontrol)
 	 */
-	if ((dd->ipath_flags&IPATH_HAS_PBC_CNT))
-		writeq(0x80000000UL<<32, piobuf);
+	if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
+		writeq(1UL << 63, piobuf);
 	else
 		writeq(0, piobuf);
 	ipath_flush_wc();
@@ -560,7 +537,8 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 #endif
 	case PCI_DEVICE_ID_INFINIPATH_7220:
 #ifndef CONFIG_PCI_MSI
-		ipath_dbg("CONFIG_PCI_MSI is not enabled, using IntX for unit %u\n", dd->ipath_unit);
+		ipath_dbg("CONFIG_PCI_MSI is not enabled, "
+			  "using INTx for unit %u\n", dd->ipath_unit);
 #endif
 		ipath_init_iba7220_funcs(dd);
 		break;
@@ -600,19 +578,19 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 #else
 	dd->ipath_kregbase = ioremap_nocache(addr, len);
 #endif
-
 	if (!dd->ipath_kregbase) {
 		ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
 			  addr);
 		ret = -ENOMEM;
-		goto bail_iounmap;
+		goto bail_regions;
 	}
 	dd->ipath_kregend = (u64 __iomem *)
 		((void __iomem *)dd->ipath_kregbase + len);
 	dd->ipath_physaddr = addr;	/* used for io_remap, etc. */
 	/* for user mmap */
-	ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
-		   addr, dd->ipath_kregbase);
+	ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p, "
+		   "length %lu bytes\n",
+		   addr, dd->ipath_kregbase, (unsigned long) len);
 
 	if (dd->ipath_f_bus(dd, pdev))
 		ipath_dev_err(dd, "Failed to setup config space; "
@@ -624,15 +602,15 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 	 * check 0 irq after we return from chip-specific bus setup, since
 	 * that can affect this due to setup
 	 */
-	if (!pdev->irq)
+	if (!dd->ipath_irq)
 		ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
 			      "work\n");
 	else {
-		ret = request_irq(pdev->irq, ipath_intr, IRQF_SHARED,
+		ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
 				  IPATH_DRV_NAME, dd);
 		if (ret) {
 			ipath_dev_err(dd, "Couldn't setup irq handler, "
-				      "irq=%d: %d\n", pdev->irq, ret);
+				      "irq=%d: %d\n", dd->ipath_irq, ret);
 			goto bail_iounmap;
 		}
 	}
@@ -641,13 +619,15 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 	if (ret)
 		goto bail_irqsetup;
 
-	ret = ipath_enable_wc(dd);
+	if (!ipath_wc_pat) {
+		ret = ipath_enable_wc(dd);
 
-	if (ret) {
-		ipath_dev_err(dd, "Write combining not enabled "
-			      "(err %d): performance may be poor\n",
-			      -ret);
-		ret = 0;
+		if (ret) {
+			ipath_dev_err(dd, "Write combining not enabled "
+				      "(err %d): performance may be poor\n",
+				      -ret);
+			ret = 0;
+		}
 	}
 
 	ipath_verify_pioperf(dd);
@@ -658,9 +638,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 	ipath_diag_add(dd);
 	ipath_register_ib_device(dd);
 
-	/* Check that card status in STATUS_TIMEOUT seconds. */
-	schedule_delayed_work(&dd->status_work, HZ * STATUS_TIMEOUT);
-
 	goto bail;
 
 bail_irqsetup:
@@ -686,6 +663,8 @@ bail:
 static void __devexit cleanup_device(struct ipath_devdata *dd)
 {
 	int port;
+	struct ipath_portdata **tmp;
+	unsigned long flags;
 
 	if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
 		/* can't do anything more with chip; needs re-init */
@@ -697,12 +676,15 @@ static void __devexit cleanup_device(struct ipath_devdata *dd)
 			 * re-init
 			 */
 			dd->ipath_kregbase = NULL;
+			dd->ipath_piobase = NULL;
+			dd->ipath_userbase = NULL;
 			dd->ipath_uregbase = 0;
 			dd->ipath_sregbase = 0;
 			dd->ipath_cregbase = 0;
 			dd->ipath_kregsize = 0;
 		}
-		ipath_disable_wc(dd);
+		if (!ipath_wc_pat)
+			ipath_disable_wc(dd);
 	}
 
 	if (dd->ipath_spectriggerhit)
@@ -767,20 +749,21 @@ static void __devexit cleanup_device(struct ipath_devdata *dd)
 
 	/*
 	 * free any resources still in use (usually just kernel ports)
-	 * at unload; we do for portcnt, not cfgports, because cfgports
-	 * could have changed while we were loaded.
+	 * at unload; we do for portcnt, because that's what we allocate.
+	 * We acquire lock to be really paranoid that ipath_pd isn't being
+	 * accessed from some interrupt-related code (that should not happen,
+	 * but best to be sure).
 	 */
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	tmp = dd->ipath_pd;
+	dd->ipath_pd = NULL;
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
 	for (port = 0; port < dd->ipath_portcnt; port++) {
-		struct ipath_portdata *pd = dd->ipath_pd[port];
-		dd->ipath_pd[port] = NULL;
+		struct ipath_portdata *pd = tmp[port];
+		tmp[port] = NULL; /* debugging paranoia */
 		ipath_free_pddata(dd, pd);
 	}
-	kfree(dd->ipath_pd);
-	/*
-	 * debuggability, in case some cleanup path tries to use it
-	 * after this
-	 */
-	dd->ipath_pd = NULL;
+	kfree(tmp);
 }
 
 static void __devexit ipath_remove_one(struct pci_dev *pdev)
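
The reworked teardown is the classic detach-then-free idiom: unhook the
array from the device structure while holding the lock, so a concurrent
reader sees either the intact array or NULL, then free it at leisure
outside the lock. In miniature (user-space sketch, a pthread mutex
standing in for the irq-safe spinlock):

	#include <pthread.h>
	#include <stdlib.h>

	struct dev {
		void **ports;
		size_t nports;
		pthread_mutex_t lock;
	};

	static void teardown_ports(struct dev *d)
	{
		void **tmp;
		size_t i;

		pthread_mutex_lock(&d->lock);
		tmp = d->ports;		/* detach under the lock ... */
		d->ports = NULL;	/* ... readers now see NULL */
		pthread_mutex_unlock(&d->lock);

		if (!tmp)
			return;
		for (i = 0; i < d->nports; i++)
			free(tmp[i]);	/* free outside the lock */
		free(tmp);
	}
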
@@ -795,7 +778,6 @@ static void __devexit ipath_remove_one(struct pci_dev *pdev)
 	 */
 	ipath_shutdown_device(dd);
 
-	cancel_delayed_work(&dd->status_work);
 	flush_scheduled_work();
 
 	if (dd->verbs_dev)
@@ -817,11 +799,10 @@ static void __devexit ipath_remove_one(struct pci_dev *pdev)
 	 * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
 	 * for all versions of the driver, if they were allocated
 	 */
-	if (pdev->irq) {
-		ipath_cdbg(VERBOSE,
-			   "unit %u free_irq of irq %x\n",
-			   dd->ipath_unit, pdev->irq);
-		free_irq(pdev->irq, dd);
+	if (dd->ipath_irq) {
+		ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
+			   dd->ipath_unit, dd->ipath_irq);
+		dd->ipath_f_free_irq(dd);
 	} else
 		ipath_dbg("irq is 0, not doing free_irq "
 			  "for unit %u\n", dd->ipath_unit);
@@ -837,6 +818,17 @@ static void __devexit ipath_remove_one(struct pci_dev *pdev)
 
 	ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
 	iounmap((volatile void __iomem *) dd->ipath_kregbase);
+	if (dd->ipath_piobase) {
+		ipath_cdbg(VERBOSE, "Unmapping piobase %p\n",
+			   dd->ipath_piobase);
+		iounmap((volatile void __iomem *) dd->ipath_piobase);
+	}
+	if (dd->ipath_userbase) {
+		ipath_cdbg(VERBOSE, "Unmapping userbase %p\n",
+			   dd->ipath_userbase);
+		iounmap((volatile void __iomem *) dd->ipath_userbase);
+	}
+
 	pci_release_regions(pdev);
 	ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
 	pci_disable_device(pdev);
@@ -864,10 +856,10 @@ void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
 			  unsigned cnt)
 {
 	unsigned i, last = first + cnt;
+	unsigned long flags;
 
 	ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
 	for (i = first; i < last; i++) {
-		unsigned long flags;
 		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
 		/*
 		 * The disarm-related bits are write-only, so it
@@ -1129,7 +1121,7 @@ struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
 	 * key header.  In order to keep everything dword aligned,
 	 * we'll reserve 4 bytes.
 	 */
-	len = dd->ipath_ibmaxlen + 4;
+	len = dd->ipath_init_ibmaxlen + 4;
 
 	if (dd->ipath_flags & IPATH_4BYTE_TID) {
 		/* We need a 2KB multiple alignment, and there is no way
@@ -1286,7 +1278,7 @@ reloop:
 			 */
 			ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
 				  " %x, len %x hdrq+%x rhf: %Lx\n",
-				  etail, tlen, l,
+				  etail, tlen, l, (unsigned long long)
 				  le64_to_cpu(*(__le64 *) rhf_addr));
 			if (ipath_debug & __IPATH_ERRPKTDBG) {
 				u32 j, *d, dw = rsize-2;
@@ -1455,7 +1447,6 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd)
 	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
 }
 
-
 /*
  * used to force update of pioavailshadow if we can't get a pio buffer.
  * Needed primarily due to exitting freeze mode after recovering
@@ -1485,7 +1476,8 @@ static void ipath_reset_availshadow(struct ipath_devdata *dd)
 			0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
 		if (oldval != dd->ipath_pioavailshadow[i])
 			ipath_dbg("shadow[%d] was %Lx, now %lx\n",
-				i, oldval, dd->ipath_pioavailshadow[i]);
+				i, (unsigned long long) oldval,
+				dd->ipath_pioavailshadow[i]);
 	}
 	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
 }
@@ -1575,7 +1567,6 @@ static noinline void no_pio_bufs(struct ipath_devdata *dd)
 	}
 }
 
-
 /*
  * common code for normal driver pio buffer allocation, and reserved
  * allocation.
@@ -1666,13 +1657,11 @@ rescan:
 	return buf;
 }
 
-
 /**
  * ipath_getpiobuf - find an available pio buffer
  * @dd: the infinipath device
  * @plen: the size of the PIO buffer needed in 32-bit words
  * @pbufnum: the buffer number is placed here
- * Searches the allocated driver range.
  */
 u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
 {
@@ -1925,7 +1914,7 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
 	 */
 	if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
 		int skip_cancel;
-		u64 *statp = &dd->ipath_sdma_status;
+		unsigned long *statp = &dd->ipath_sdma_status;
 
 		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
 		skip_cancel =
@@ -1980,7 +1969,7 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
 	    !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
 	    test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
 		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-	    	/* only wait so long for intr */
+		/* only wait so long for intr */
 		dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
 		dd->ipath_sdma_reset_wait = 200;
 		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
@@ -2014,13 +2003,6 @@ void ipath_force_pio_avail_update(struct ipath_devdata *dd)
 	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 }
 
-
-/*
- * Formerly took parameter <which> in pre-shifted,
- * pre-merged form with LinkCmd and LinkInitCmd
- * together, and assuming the zero was NOP.
- * This is problematic for IBA7220.
- */
 static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
 				int linitcmd)
 {
@@ -2031,6 +2013,7 @@ static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
 		[INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
 		[INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
 	};
+
 	if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
 		/*
 		 * If we are told to disable, note that so link-recovery
@@ -2041,9 +2024,9 @@ static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
 		preempt_enable();
 	} else if (linitcmd) {
 		/*
-		 * Any other linkinitcmd will lead to LINKDOWN< and then
-		 * to INIT (if all is well), so clear flag to let ink-recovery
-		 * code attempt to bring us back up.
+		 * Any other linkinitcmd will lead to LINKDOWN and then
+		 * to INIT (if all is well), so clear flag to let
+		 * link-recovery code attempt to bring us back up.
 		 */
 		preempt_disable();
 		dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
@@ -2134,6 +2117,7 @@ int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
 		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
 				 dd->ipath_ibcctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 
 		/* turn heartbeat off, as it causes loopback to fail */
 		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
@@ -2150,6 +2134,7 @@ int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
 		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
 				 dd->ipath_ibcctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 		/* don't wait */
 		ret = 0;
 		goto bail;
@@ -2251,6 +2236,7 @@ int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
 		dd->ipath_ibcctrl = ibc;
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
 				 dd->ipath_ibcctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 		dd->ipath_f_tidtemplate(dd);
 	}
 
@@ -2458,10 +2444,6 @@ void ipath_shutdown_device(struct ipath_devdata *dd)
 		del_timer_sync(&dd->ipath_stats_timer);
 		dd->ipath_stats_timer_active = 0;
 	}
-	if (dd->ipath_link_timer_active) {
-		del_timer_sync(&dd->ipath_link_timer);
-		dd->ipath_link_timer_active = 0;
-	}
 	if (dd->ipath_intrchk_timer.data) {
 		del_timer_sync(&dd->ipath_intrchk_timer);
 		dd->ipath_intrchk_timer.data = 0;
@@ -2545,9 +2527,9 @@ void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
 			   skbinfo);
 		for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
 			if (skbinfo[e].skb) {
-				pci_unmap_single(dd->pcidev,
-					skbinfo[e].phys, dd->ipath_ibmaxlen,
-					PCI_DMA_FROMDEVICE);
+				pci_unmap_single(dd->pcidev, skbinfo[e].phys,
+						 dd->ipath_init_ibmaxlen,
+						 PCI_DMA_FROMDEVICE);
 				dev_kfree_skb(skbinfo[e].skb);
 			}
 		vfree(skbinfo);
@@ -2566,6 +2548,15 @@ static int __init infinipath_init(void)
 	if (ipath_debug & __IPATH_DBG)
 		printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
 
+	if (ipath_wc_pat) {
+		if (ipath_enable_wc_pat() || !ipath_wc_pat_enabled()) {
+			printk(KERN_ERR IPATH_DRV_NAME
+			       ": WC PAT unavailable, fall-back to MTRR\n");
+			ipath_wc_pat = 0;
+		} else
+			ipath_dbg("WC PAT mechanism is enabled\n");
+	}
+
 	/*
 	 * These must be called before the driver is registered with
 	 * the PCI subsystem.
@@ -2574,7 +2565,7 @@ static int __init infinipath_init(void)
 	if (!idr_pre_get(&unit_table, GFP_KERNEL)) {
 		printk(KERN_ERR IPATH_DRV_NAME ": idr_pre_get() failed\n");
 		ret = -ENOMEM;
-		goto bail;
+		goto bail_wc_pat;
 	}
 
 	ret = pci_register_driver(&ipath_driver);
@@ -2609,6 +2600,10 @@ bail_pci:
 bail_unit:
 	idr_destroy(&unit_table);
 
+bail_wc_pat:
+	if (ipath_wc_pat)
+		ipath_disable_wc_pat();
+
 bail:
 	return ret;
 }
@@ -2623,6 +2618,11 @@ static void __exit infinipath_cleanup(void)
 	pci_unregister_driver(&ipath_driver);
 
 	idr_destroy(&unit_table);
+
+	if (ipath_wc_pat) {
+		ipath_disable_wc_pat();
+		ipath_dbg("WC PAT mechanism is disabled\n");
+	}
 }
 
 /**
@@ -2638,6 +2638,7 @@ int ipath_reset_device(int unit)
 {
 	int ret, i;
 	struct ipath_devdata *dd = ipath_lookup(unit);
+	unsigned long flags;
 
 	if (!dd) {
 		ret = -ENODEV;
@@ -2663,6 +2664,7 @@ int ipath_reset_device(int unit)
 		goto bail;
 	}
 
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
 	if (dd->ipath_pd)
 		for (i = 1; i < dd->ipath_cfgports; i++) {
 			if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) {
@@ -2675,6 +2677,7 @@ int ipath_reset_device(int unit)
 				goto bail;
 			}
 		}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
 
 	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
 		teardown_sdma(dd);
@@ -2704,18 +2707,24 @@ bail:
  * through the normal interfaces (i.e., everything other than diags
  * interface).  Returns number of signalled processes.
  */
-int ipath_signal_procs(struct ipath_devdata *dd, int sig)
+static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
 {
 	int i, sub, any = 0;
 	pid_t pid;
-
+	unsigned long flags;
+
 	if (!dd->ipath_pd)
 		return 0;
+
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
 	for (i = 1; i < dd->ipath_cfgports; i++) {
-		if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt
-			 || !dd->ipath_pd[i]->port_pid)
+		if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt ||
+		    !dd->ipath_pd[i]->port_pid)
 			continue;
 		pid = dd->ipath_pd[i]->port_pid;
+		if (!pid)
+			continue;
+
 		dev_info(&dd->pcidev->dev, "context %d in use "
 			  "(PID %u), sending signal %d\n",
 			  i, pid, sig);
@@ -2732,6 +2741,7 @@ int ipath_signal_procs(struct ipath_devdata *dd, int sig)
 			any++;
 		}
 	}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
 	return any;
 }
 
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
index 899f156..dc37277 100644
--- a/drivers/infiniband/hw/ipath/ipath_eeprom.c
+++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c
@@ -482,7 +482,6 @@ done:
 	return (idx >= 0) ? i2c_chains + idx : NULL;
 }
 
-
 static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
 					u8 eeprom_offset, void *buffer, int len)
 {
@@ -561,7 +560,8 @@ static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offse
 
 	while (len > 0) {
 		if (icd->eeprom_dev == IPATH_NO_DEV) {
-			if (i2c_startcmd(dd, (eeprom_offset << 1) | WRITE_CMD)) {
+			if (i2c_startcmd(dd,
+					 (eeprom_offset << 1) | WRITE_CMD)) {
 				ipath_dbg("Failed to start cmd offset %u\n",
 					eeprom_offset);
 				goto failed_write;
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index dc61e15..09b41e2 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -39,11 +39,13 @@
 #include <linux/highmem.h>
 #include <linux/io.h>
 #include <linux/jiffies.h>
+#include <linux/smp_lock.h>
 #include <asm/pgtable.h>
 
 #include "ipath_kernel.h"
 #include "ipath_common.h"
 #include "ipath_user_sdma.h"
+#include "ipath_wc_pat.h"
 
 static int ipath_open(struct inode *, struct file *);
 static int ipath_close(struct inode *, struct file *);
@@ -222,8 +224,13 @@ static int ipath_get_base_info(struct file *fp,
 			(unsigned long long) kinfo->spi_subport_rcvhdr_base);
 	}
 
-	kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->ipath_piobufbase) /
-		dd->ipath_palign;
+	/*
+	 * All user buffers are 2KB buffers.  If we ever support
+	 * giving 4KB buffers to user processes, this will need some
+	 * work.
+	 */
+	kinfo->spi_pioindex = (kinfo->spi_piobufbase -
+		(dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
 	kinfo->spi_pioalign = dd->ipath_palign;
 
 	kinfo->spi_qpair = IPATH_KD_QP;
@@ -903,7 +910,7 @@ static int ipath_create_user_egr(struct ipath_portdata *pd)
 	chunk = pd->port_rcvegrbuf_chunks;
 	egrperchunk = pd->port_rcvegrbufs_perchunk;
 	size = pd->port_rcvegrbuf_size;
-	pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
+	pd->port_rcvegrbuf = kzalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
 				     GFP_KERNEL);
 	if (!pd->port_rcvegrbuf) {
 		ret = -ENOMEM;
@@ -1077,6 +1084,9 @@ static int mmap_piobufs(struct vm_area_struct *vma,
 	vma->vm_flags &= ~VM_MAYREAD;
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
 
+	if (ipath_wc_pat)
+		vma->vm_page_prot = pgprot_wc(vma->vm_page_prot);
+
 	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
 				 vma->vm_end - vma->vm_start,
 				 vma->vm_page_prot);
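
pgprot_wc() comes from the new ipath_wc_pat.h (referenced but not shown
in this excerpt): it marks the user mapping write-combining through the
page-table PAT bits, instead of depending on an MTRR that covers the PIO
BAR. Mainline later standardized the same idea as pgprot_writecombine();
on kernels that provide it, the equivalent mmap step looks roughly like:

	#include <linux/mm.h>

	/* Map a device region write-combining for user space (sketch;
	 * assumes pgprot_writecombine() exists on this kernel). */
	static int map_pio_wc(struct vm_area_struct *vma, unsigned long phys)
	{
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
		return io_remap_pfn_range(vma, vma->vm_start,
					  phys >> PAGE_SHIFT,
					  vma->vm_end - vma->vm_start,
					  vma->vm_page_prot);
	}
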
@@ -1747,8 +1757,8 @@ recheck:
 			ipath_dbg("No ports available (none initialized "
 				  "and ready)\n");
 		} else {
-			if (prefunit > 0) {
-				/* if started above 0, retry from 0 */
+			if (prefunit != -1) {
+				/* if had prefunit, retry from 0 */
 				ipath_cdbg(PROC,
 					   "%s[%u] no ports on prefunit "
 					   "%d, clear and re-check\n",
@@ -1823,6 +1833,7 @@ done:
 static int ipath_open(struct inode *in, struct file *fp)
 {
 	/* The real work is performed later in ipath_assign_port() */
+	cycle_kernel_lock();
 	fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
 	return fp->private_data ? 0 : -ENOMEM;
 }
@@ -1981,7 +1992,12 @@ static int ipath_do_user_init(struct file *fp,
 	 * explictly set the in-memory tail copy to 0 beforehand, so we
 	 * don't have to wait to be sure the DMA update has happened
 	 * (chip resets head/tail to 0 on transition to enable).
+	 * The mutex ensures that the read value of dd->ipath_rcvctrl
+	 * after the atomic set_bit is not stale, and avoids a race
+	 * hazard with 2 processes attempting to enable (distinct)
+	 * ports simultaneously.
 	 */
+	mutex_lock(&ipath_mutex);
 	set_bit(dd->ipath_r_portenable_shift + pd->port_port,
 		&dd->ipath_rcvctrl);
 	if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
@@ -1993,6 +2009,7 @@ static int ipath_do_user_init(struct file *fp,
 	}
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
 			 dd->ipath_rcvctrl);
+	mutex_unlock(&ipath_mutex);
 	/* Notify any waiting slaves */
 	if (pd->port_subport_cnt) {
 		clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
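
The race the new comment describes is a lost update on the shared
register image: set_bit() is atomic per bit, but the whole-word read
that follows and the chip-register write are a separate, unserialized
sequence. One bad interleaving of two processes enabling distinct
ports, which the mutex now excludes:

	A: set_bit(portA, &dd->ipath_rcvctrl)	memory = {A}
	A: val = dd->ipath_rcvctrl		val    = {A}
	B: set_bit(portB, &dd->ipath_rcvctrl)	memory = {A,B}
	B: write_kreg(kr_rcvctrl, {A,B})	chip   = {A,B}
	A: write_kreg(kr_rcvctrl, val)		chip   = {A}  <- B's enable lost
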
@@ -2047,7 +2064,9 @@ static int ipath_close(struct inode *in, struct file *fp)
 	struct ipath_filedata *fd;
 	struct ipath_portdata *pd;
 	struct ipath_devdata *dd;
+	unsigned long flags;
 	unsigned port;
+	pid_t pid;
 
 	ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
 		   (long)in->i_rdev, fp->private_data);
@@ -2079,14 +2098,13 @@ static int ipath_close(struct inode *in, struct file *fp)
 		mutex_unlock(&ipath_mutex);
 		goto bail;
 	}
+	/* early; no interrupt users after this */
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
 	port = pd->port_port;
-
-	if (pd->port_hdrqfull) {
-		ipath_cdbg(PROC, "%s[%u] had %u rcvhdrqfull errors "
-			   "during run\n", pd->port_comm, pd->port_pid,
-			   pd->port_hdrqfull);
-		pd->port_hdrqfull = 0;
-	}
+	dd->ipath_pd[port] = NULL;
+	pid = pd->port_pid;
+	pd->port_pid = 0;
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
 
 	if (pd->port_rcvwait_to || pd->port_piowait_to
 	    || pd->port_rcvnowait || pd->port_pionowait) {
@@ -2143,12 +2161,10 @@ static int ipath_close(struct inode *in, struct file *fp)
 			unlock_expected_tids(pd);
 		ipath_stats.sps_ports--;
 		ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
-			   pd->port_comm, pd->port_pid,
+			   pd->port_comm, pid,
 			   dd->ipath_unit, port);
 	}
 
-	pd->port_pid = 0;
-	dd->ipath_pd[pd->port_port] = NULL; /* before releasing mutex */
 	mutex_unlock(&ipath_mutex);
 	ipath_free_pddata(dd, pd); /* after releasing the mutex */
 
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index fe852e3..d32ec71 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -31,7 +31,6 @@
  * SOFTWARE.
  */
 
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -238,8 +237,7 @@ static int create_device_files(struct super_block *sb,
 
 	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
 	ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
-			  (struct file_operations *) &simple_dir_operations,
-			  dd);
+			  &simple_dir_operations, dd);
 	if (ret) {
 		printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
 		goto bail;
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
index 6559c92..5b91705 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6110.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c
@@ -39,11 +39,11 @@
 #include <linux/vmalloc.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
-#include <linux/swap.h>
 #include <rdma/ib_verbs.h>
 
 #include "ipath_kernel.h"
 #include "ipath_registers.h"
+#include "ipath_wc_pat.h"
 
 static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
 
@@ -473,7 +473,6 @@ static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
 	INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
 };
 
-
 #define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
 		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
 		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
@@ -963,11 +962,28 @@ static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
 		}
 		dd->ipath_lbus_speed = speed;
 	}
+
 	snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
 		"HyperTransport,%uMHz,x%u\n",
 		dd->ipath_lbus_speed,
 		dd->ipath_lbus_width);
+}
+
+static int ipath_ht_intconfig(struct ipath_devdata *dd)
+{
+	int ret;
+
+	if (dd->ipath_intconfig) {
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
+				 dd->ipath_intconfig);	/* interrupt address */
+		ret = 0;
+	} else {
+		ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
+			      "interrupt address\n");
+		ret = -EINVAL;
+	}
 
+	return ret;
 }
 
 static int set_int_handler(struct ipath_devdata *dd, struct pci_dev *pdev,
@@ -1010,6 +1026,7 @@ static int set_int_handler(struct ipath_devdata *dd, struct pci_dev *pdev,
 
 	/* can't program yet, so save for interrupt setup */
 	dd->ipath_intconfig = ihandler;
+	dd->ipath_irq = intvec;
 	/* keep going, so we find link control stuff also */
 
 	return ihandler != 0;
@@ -1482,25 +1499,6 @@ static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
 }
 
-static int ipath_ht_intconfig(struct ipath_devdata *dd)
-{
-	int ret;
-
-	if (!dd->ipath_intconfig) {
-		ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
-			      "interrupt address\n");
-		ret = 1;
-		goto bail;
-	}
-
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
-			 dd->ipath_intconfig);	/* interrupt address */
-	ret = 0;
-
-bail:
-	return ret;
-}
-
 /**
  * ipath_pe_put_tid - write a TID in chip
  * @dd: the infinipath device
@@ -1654,8 +1652,12 @@ static int ipath_ht_early_init(struct ipath_devdata *dd)
 	 * these out on the wire.
 	 * Chip Errata bug 6610
 	 */
-	piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
-				  dd->ipath_piobufbase);
+	if (ipath_wc_pat)
+		piobuf = (u32 __iomem *) dd->ipath_piobase;
+	else
+		piobuf = (u32 __iomem *)
+			(((char __iomem *)(dd->ipath_kregbase)) +
+			 dd->ipath_piobufbase);
 	pioincr = dd->ipath_palign / sizeof(*piobuf);
 	for (i = 0; i < dd->ipath_piobcnt2k; i++) {
 		/*
@@ -1717,6 +1719,13 @@ static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
 	return 0;
 }
 
+static void ipath_ht_free_irq(struct ipath_devdata *dd)
+{
+	free_irq(dd->ipath_irq, dd);
+	dd->ipath_irq = 0;
+	dd->ipath_intconfig = 0;
+}
+
 static struct ipath_message_header *
 ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
 {
@@ -1944,6 +1953,7 @@ void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
 	dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
 	dd->ipath_f_setextled = ipath_setup_ht_setextled;
 	dd->ipath_f_get_base_info = ipath_ht_get_base_info;
+	dd->ipath_f_free_irq = ipath_ht_free_irq;
 	dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
 	dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
 	dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c
index 790b8f7..302e412 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6120.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c
@@ -36,10 +36,8 @@
  */
 
 #include <linux/interrupt.h>
-#include <linux/vmalloc.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
-#include <linux/swap.h>
 #include <rdma/ib_verbs.h>
 
 #include "ipath_kernel.h"
@@ -389,7 +387,6 @@ static const struct ipath_hwerror_msgs ipath_6120_hwerror_msgs[] = {
 	INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
 };
 
-
 #define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
 		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
 		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
@@ -702,6 +699,10 @@ static void ipath_pe_init_hwerrors(struct ipath_devdata *dd)
 		 */
 		val &= ~INFINIPATH_HWE_PCIEBUSPARITYRADM;
 	}
+
+	/* avoid some intel cpu's speculative read freeze mode issue */
+	val &= ~(INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF
+	   << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
 	dd->ipath_hwerrmask = val;
 }
 
@@ -724,6 +725,12 @@ static int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
 				 INFINIPATH_HWE_SERDESPLLFAILED);
 	}
 
+	dd->ibdeltainprog = 1;
+	dd->ibsymsnap =
+	     ipath_read_creg32(dd, dd->ipath_cregs->cr_ibsymbolerrcnt);
+	dd->iblnkerrsnap =
+	     ipath_read_creg32(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt);
+
 	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
 	config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
 
@@ -813,6 +820,36 @@ static void ipath_pe_quiet_serdes(struct ipath_devdata *dd)
 {
 	u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
 
+	if (dd->ibsymdelta || dd->iblnkerrdelta ||
+	    dd->ibdeltainprog) {
+		u64 diagc;
+		/* enable counter writes */
+		diagc = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwdiagctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl,
+				 diagc | INFINIPATH_DC_COUNTERWREN);
+
+		if (dd->ibsymdelta || dd->ibdeltainprog) {
+			val = ipath_read_creg32(dd,
+					dd->ipath_cregs->cr_ibsymbolerrcnt);
+			if (dd->ibdeltainprog)
+				val -= val - dd->ibsymsnap;
+			val -= dd->ibsymdelta;
+			ipath_write_creg(dd,
+				  dd->ipath_cregs->cr_ibsymbolerrcnt, val);
+		}
+		if (dd->iblnkerrdelta || dd->ibdeltainprog) {
+			val = ipath_read_creg32(dd,
+					dd->ipath_cregs->cr_iblinkerrrecovcnt);
+			if (dd->ibdeltainprog)
+				val -= val - dd->iblnkerrsnap;
+			val -= dd->iblnkerrdelta;
+			ipath_write_creg(dd,
+				   dd->ipath_cregs->cr_iblinkerrrecovcnt, val);
+		}
+
+		/* and disable counter writes */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, diagc);
+	}
 	val |= INFINIPATH_SERDC0_TXIDLE;
 	ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
 		  (unsigned long long) val);
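
The snapshot/delta bookkeeping added here (and mirrored in the 7220 code
below) hides symbol and link-recovery errors that accrue while the link
is training: a snapshot is taken when a (re)train starts, the accrued
difference is folded into a delta when it finishes, and the delta is
subtracted back out of the hardware counter at shutdown. Note that
"val -= val - dd->ibsymsnap" is just "val = dd->ibsymsnap" written as an
adjustment. The arithmetic, as a sketch:

	#include <stdint.h>

	/* Counter value written back at shutdown (sketch). */
	static uint32_t masked_errcnt(uint32_t hw, uint32_t snap,
				      uint32_t delta, int train_in_progress)
	{
		uint32_t val = hw;

		if (train_in_progress)
			val -= val - snap;	/* i.e. val = snap */
		return val - delta;		/* drop in-train errors */
	}
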
@@ -911,7 +948,6 @@ static void ipath_setup_pe_cleanup(struct ipath_devdata *dd)
 	pci_disable_msi(dd->pcidev);
 }
 
-
 static void ipath_6120_pcie_params(struct ipath_devdata *dd)
 {
 	u16 linkstat, speed;
@@ -968,7 +1004,6 @@ bail:
 	return;
 }
 
-
 /**
  * ipath_setup_pe_config - setup PCIe config related stuff
  * @dd: the infinipath device
@@ -1000,6 +1035,7 @@ static int ipath_setup_pe_config(struct ipath_devdata *dd,
 		ipath_dev_err(dd, "pci_enable_msi failed: %d, "
 			      "interrupts may not work\n", ret);
 	/* continue even if it fails, we may still be OK... */
+	dd->ipath_irq = pdev->irq;
 
 	if ((pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI))) {
 		u16 control;
@@ -1484,13 +1520,17 @@ static int ipath_pe_early_init(struct ipath_devdata *dd)
 
 	/*
 	 * For openfabrics, we need to be able to handle an IB header of
-	 * 24 dwords.  HT chip has arbitrary sized receive buffers, so we
-	 * made them the same size as the PIO buffers.  This chip does not
-	 * handle arbitrary size buffers, so we need the header large enough
-	 * to handle largest IB header, but still have room for a 2KB MTU
-	 * standard IB packet.
+	 * at least 24 dwords.  This chip does not handle arbitrary size
+	 * buffers, so we need the header large enough to handle largest
+	 * IB header, but still have room for a 2KB MTU standard IB packet.
+	 * Additionally, some processor/memory controller combinations
+	 * benefit quite strongly from having the DMA'ed data be cacheline
+	 * aligned and a cacheline multiple, so we set the size to 32 dwords
+	 * (2 64-byte primary cachelines for pretty much all processors of
+	 * interest).  The alignment hurts nothing, other than using somewhat
+	 * more memory.
 	 */
-	dd->ipath_rcvhdrentsize = 24;
+	dd->ipath_rcvhdrentsize = 32;
 	dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
 	dd->ipath_rhf_offset = 0;
 	dd->ipath_egrtidbase = (u64 __iomem *)
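
The arithmetic behind the new value: 32 dwords × 4 bytes = 128 bytes,
exactly two 64-byte cachelines per receive-header entry, where the old
24-dword entry (96 bytes) straddled one and a half. The same change and
rationale recur in the 7220 early-init hunk further down.
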
@@ -1555,6 +1595,12 @@ done:
 	return 0;
 }
 
+static void ipath_pe_free_irq(struct ipath_devdata *dd)
+{
+	free_irq(dd->ipath_irq, dd);
+	dd->ipath_irq = 0;
+}
+
 
 static struct ipath_message_header *
 ipath_pe_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
@@ -1747,6 +1793,31 @@ static void ipath_pe_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
 
 static int ipath_pe_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
 {
+	if (ibup) {
+		if (dd->ibdeltainprog) {
+			dd->ibdeltainprog = 0;
+			dd->ibsymdelta +=
+				ipath_read_creg32(dd,
+				  dd->ipath_cregs->cr_ibsymbolerrcnt) -
+				dd->ibsymsnap;
+			dd->iblnkerrdelta +=
+				ipath_read_creg32(dd,
+				  dd->ipath_cregs->cr_iblinkerrrecovcnt) -
+				dd->iblnkerrsnap;
+		}
+	} else {
+		dd->ipath_lli_counter = 0;
+		if (!dd->ibdeltainprog) {
+			dd->ibdeltainprog = 1;
+			dd->ibsymsnap =
+				ipath_read_creg32(dd,
+				  dd->ipath_cregs->cr_ibsymbolerrcnt);
+			dd->iblnkerrsnap =
+				ipath_read_creg32(dd,
+				  dd->ipath_cregs->cr_iblinkerrrecovcnt);
+		}
+	}
+
 	ipath_setup_pe_setextled(dd, ipath_ib_linkstate(dd, ibcs),
 		ipath_ib_linktrstate(dd, ibcs));
 	return 0;
@@ -1780,6 +1851,7 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *dd)
 	dd->ipath_f_cleanup = ipath_setup_pe_cleanup;
 	dd->ipath_f_setextled = ipath_setup_pe_setextled;
 	dd->ipath_f_get_base_info = ipath_pe_get_base_info;
+	dd->ipath_f_free_irq = ipath_pe_free_irq;
 	dd->ipath_f_tidtemplate = ipath_pe_tidtemplate;
 	dd->ipath_f_intr_fallback = ipath_pe_nointr_fallback;
 	dd->ipath_f_xgxs_reset = ipath_pe_xgxs_reset;
@@ -1795,3 +1867,4 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *dd)
 	/* initialize chip-specific variables */
 	ipath_init_pe_variables(dd);
 }
+
diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c
index f16cf9c..521c51e 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba7220.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
  * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -54,6 +54,15 @@ module_param_named(compat_ddr_negotiate, ipath_compat_ddr_negotiate, uint,
 MODULE_PARM_DESC(compat_ddr_negotiate,
 		"Attempt pre-IBTA 1.2 DDR speed negotiation");
 
+static unsigned ipath_sdma_fetch_arb = 1;
+module_param_named(fetch_arb, ipath_sdma_fetch_arb, uint, S_IRUGO);
+MODULE_PARM_DESC(fetch_arb, "IBA7220: change SDMA descriptor arbitration");
+
+static int ipath_pcie_coalesce;
+module_param_named(pcie_coalesce, ipath_pcie_coalesce, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_coalesce, "tune PCIe coalescing on some Intel chipsets");
+
+
 /*
  * This file contains almost all the chip-specific register information and
  * access functions for the QLogic InfiniPath 7220 PCI-Express chip, with the
@@ -407,10 +416,6 @@ static const struct ipath_cregs ipath_7220_cregs = {
 	.cr_psxmitwaitcount = IPATH_CREG_OFFSET(PSXmitWaitCount),
 };
 
-/* kr_revision bits */
-#define INFINIPATH_R_EMULATORREV_MASK ((1ULL<<22) - 1)
-#define INFINIPATH_R_EMULATORREV_SHIFT 40
-
 /* kr_control bits */
 #define INFINIPATH_C_RESET (1U<<7)
 
@@ -528,9 +533,7 @@ static const struct ipath_cregs ipath_7220_cregs = {
 
 static char int_type[16] = "auto";
 module_param_string(interrupt_type, int_type, sizeof(int_type), 0444);
-MODULE_PARM_DESC(int_type, " interrupt_type=auto|force_msi|force_intx\n");
-
-static int ipath_special_trigger;
+MODULE_PARM_DESC(int_type, " interrupt_type=auto|force_msi|force_intx");
 
 /* packet rate matching delay; chip has support */
 static u8 rate_to_delay[2][2] = {
@@ -539,9 +542,6 @@ static u8 rate_to_delay[2][2] = {
 	{   4, 1 }  /* DDR */
 };
 
-module_param_named(special_trigger, ipath_special_trigger, int, S_IRUGO);
-MODULE_PARM_DESC(special_trigger, "Enable SpecialTrigger arm/launch");
-
 /* 7220 specific hardware errors... */
 static const struct ipath_hwerror_msgs ipath_7220_hwerror_msgs[] = {
 	INFINIPATH_HWE_MSG(PCIEPOISONEDTLP, "PCIe Poisoned TLP"),
@@ -857,17 +857,8 @@ static int ipath_7220_boardname(struct ipath_devdata *dd, char *name,
 			 boardrev);
 		break;
 	}
-	if (n) {
-		if (dd->ipath_revision & INFINIPATH_R_EMULATOR_MASK) {
-			unsigned rev =
-				(unsigned) ((dd->ipath_revision >>
-					INFINIPATH_R_EMULATORREV_SHIFT) &
-					INFINIPATH_R_EMULATORREV_MASK);
-
-			snprintf(name, namelen, "%s(%u)", n, rev);
-		} else
-			snprintf(name, namelen, "%s", n);
-	}
+	if (n)
+		snprintf(name, namelen, "%s", n);
 
 	if (dd->ipath_majrev != 5 || !dd->ipath_minrev ||
 		dd->ipath_minrev > 2) {
@@ -965,6 +956,12 @@ static int ipath_7220_bringup_serdes(struct ipath_devdata *dd)
 				 INFINIPATH_HWE_SERDESPLLFAILED);
 	}
 
+	dd->ibdeltainprog = 1;
+	dd->ibsymsnap =
+	     ipath_read_creg32(dd, dd->ipath_cregs->cr_ibsymbolerrcnt);
+	dd->iblnkerrsnap =
+	     ipath_read_creg32(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt);
+
 	if (!dd->ipath_ibcddrctrl) {
 		/* not on re-init after reset */
 		dd->ipath_ibcddrctrl =
@@ -1010,8 +1007,10 @@ static int ipath_7220_bringup_serdes(struct ipath_devdata *dd)
 
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcddrctrl,
 			dd->ipath_ibcddrctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 
 	ipath_write_kreg(dd, IPATH_KREG_OFFSET(IBNCModeCtrl), 0Ull);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 
 	/* IBA7220 has SERDES MPU reset in D0 of what _was_ IBPLLCfg */
 	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibserdesctrl);
@@ -1046,7 +1045,7 @@ static int ipath_7220_bringup_serdes(struct ipath_devdata *dd)
 	ipath_cdbg(VERBOSE, "done: xgxs=%llx from %llx\n",
 		   (unsigned long long)
 		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig),
-		   prev_val);
+		   (unsigned long long) prev_val);
 
 	guid = be64_to_cpu(dd->ipath_guid);
 
@@ -1056,8 +1055,10 @@ static int ipath_7220_bringup_serdes(struct ipath_devdata *dd)
 		ipath_dbg("No GUID for heartbeat, faking %llx\n",
 			(unsigned long long)guid);
 	} else
-		ipath_cdbg(VERBOSE, "Wrote %llX to HRTBT_GUID\n", guid);
+		ipath_cdbg(VERBOSE, "Wrote %llX to HRTBT_GUID\n",
+			(unsigned long long) guid);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_hrtbt_guid, guid);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 	return ret;
 }
 
@@ -1097,6 +1098,37 @@ static void ipath_7220_config_jint(struct ipath_devdata *dd,
 static void ipath_7220_quiet_serdes(struct ipath_devdata *dd)
 {
 	u64 val;
+	if (dd->ibsymdelta || dd->iblnkerrdelta ||
+	    dd->ibdeltainprog) {
+		u64 diagc;
+		/* enable counter writes */
+		diagc = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwdiagctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl,
+				 diagc | INFINIPATH_DC_COUNTERWREN);
+
+		if (dd->ibsymdelta || dd->ibdeltainprog) {
+			val = ipath_read_creg32(dd,
+					dd->ipath_cregs->cr_ibsymbolerrcnt);
+			if (dd->ibdeltainprog)
+				val -= val - dd->ibsymsnap;
+			val -= dd->ibsymdelta;
+			ipath_write_creg(dd,
+				  dd->ipath_cregs->cr_ibsymbolerrcnt, val);
+		}
+		if (dd->iblnkerrdelta || dd->ibdeltainprog) {
+			val = ipath_read_creg32(dd,
+					dd->ipath_cregs->cr_iblinkerrrecovcnt);
+			if (dd->ibdeltainprog)
+				val -= val - dd->iblnkerrsnap;
+			val -= dd->iblnkerrdelta;
+			ipath_write_creg(dd,
+				   dd->ipath_cregs->cr_iblinkerrrecovcnt, val);
+		}
+
+		/* and disable counter writes */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, diagc);
+	}
+
 	dd->ipath_flags &= ~IPATH_IB_AUTONEG_INPROG;
 	wake_up(&dd->ipath_autoneg_wait);
 	cancel_delayed_work(&dd->ipath_autoneg_work);
@@ -1221,16 +1253,23 @@ static int ipath_msi_enabled(struct pci_dev *pdev)
 
 /*
  * disable msi interrupt if enabled, and clear the flag.
- * flag is used primarily for the fallback to IntX, but
+ * flag is used primarily for the fallback to INTx, but
  * is also used in reinit after reset as a flag.
  */
 static void ipath_7220_nomsi(struct ipath_devdata *dd)
 {
 	dd->ipath_msi_lo = 0;
-#ifdef CONFIG_PCI_MSI
-	if (ipath_msi_enabled(dd->pcidev))
+
+	if (ipath_msi_enabled(dd->pcidev)) {
+		/*
+		 * free, but don't zero; later kernels require
+		 * it be freed before disable_msi, so the intx
+		 * setup has to request it again.
+		 */
+		if (dd->ipath_irq)
+			free_irq(dd->ipath_irq, dd);
 		pci_disable_msi(dd->pcidev);
-#endif
+	}
 }
 
 /*
@@ -1246,6 +1285,90 @@ static void ipath_setup_7220_cleanup(struct ipath_devdata *dd)
 	ipath_7220_nomsi(dd);
 }
 
+/*
+ * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300
+ * chipsets.   This is known to be unsafe for some revisions of some
+ * of these chipsets, with some BIOS settings, and enabling it on those
+ * systems may result in the system crashing, and/or data corruption.
+ */
+static void ipath_7220_tune_pcie_coalesce(struct ipath_devdata *dd)
+{
+	int r;
+	struct pci_dev *parent;
+	int ppos;
+	u16 devid;
+	u32 mask, bits, val;
+
+	if (!ipath_pcie_coalesce)
+		return;
+
+	/* Find out supported and configured values for parent (root) */
+	parent = dd->pcidev->bus->self;
+	if (parent->bus->parent) {
+		dev_info(&dd->pcidev->dev, "Parent not root\n");
+		return;
+	}
+	ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
+	if (!ppos) {
+		ipath_dbg("parent not PCIe root complex!?\n");
+		return;
+	}
+	if (parent->vendor != 0x8086) {
+		ipath_dbg("VendorID 0x%x isn't Intel, skip\n", parent->vendor);
+		return;
+	}
+
+	/*
+	 *  - bit 12: Max_rdcmp_Imt_EN: need to set to 1
+	 *  - bit 11: COALESCE_FORCE: need to set to 0
+	 *  - bit 10: COALESCE_EN: need to set to 1
+	 *  (but with limitations on some chipsets)
+	 *
+	 *  On the Intel 5000, 5100, and 7300 chipsets, there is also:
+	 *  - bit 25:24: COALESCE_MODE, need to set to 0
+	 *  TODO (Olson): bits 10,11,12 may need to be gated by maxpayload
+	 */
+	devid = parent->device;
+	if (devid >= 0x25e2 && devid <= 0x25fa) {
+		/* 5000 P/V/X/Z */
+		u8 rev;
+		pci_read_config_byte(parent, PCI_REVISION_ID, &rev);
+		if (rev <= 0xb2) {
+			bits = 1U << 10;
+			ipath_dbg("Old rev 5000* (0x%x), enable-only\n", rev);
+		} else
+			bits = 7U << 10;
+		mask = (3U << 24) | (7U << 10);
+	} else if (devid >= 0x65e2 && devid <= 0x65fa) {
+		/* 5100 */
+		bits = 1U << 10;
+		mask = (3U << 24) | (7U << 10);
+	} else if (devid >= 0x4021 && devid <= 0x402e) {
+		/* 5400 */
+		bits = 7U << 10;
+		mask = 7U << 10;
+	} else if (devid >= 0x3604 && devid <= 0x360a) {
+		/* 7300 */
+		bits = 7U << 10;
+		mask = (3U << 24) | (7U << 10);
+	} else {
+		/* not one of the chipsets that we know about */
+		ipath_dbg("DeviceID 0x%x isn't one we know, skip\n", devid);
+		return;
+	}
+	pci_read_config_dword(parent, 0x48, &val);
+	ipath_dbg("Read initial value 0x%x at 0x48, deviceid 0x%x\n",
+		val, devid);
+	val &= ~mask;
+	val |= bits;
+	r = pci_write_config_dword(parent, 0x48, val);
+	if (r)
+		ipath_dev_err(dd, "Unable to update deviceid 0x%x to val 0x%x"
+				" for PCIe coalescing\n", devid, val);
+	else
+		dev_info(&dd->pcidev->dev, "Updated deviceid 0x%x to val 0x%x"
+				" for PCIe coalescing\n", devid, val);
+}
 
 static void ipath_7220_pcie_params(struct ipath_devdata *dd, u32 boardrev)
 {
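
The bits poked at config offset 0x48, collected from the comment in
ipath_7220_tune_pcie_coalesce() above (names illustrative, not taken
from a chipset header):

	#define COALESCE_EN		(1U << 10)
	#define COALESCE_FORCE		(1U << 11)
	#define MAX_RDCMP_IMT_EN	(1U << 12)
	#define COALESCE_MODE		(3U << 24)	/* 5000/5100/7300 only */

Because of the stability caveat noted above, the feature defaults to off
and must be requested explicitly, e.g. "modprobe ib_ipath
pcie_coalesce=1" (illustrative invocation).
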
@@ -1305,6 +1428,8 @@ static void ipath_7220_pcie_params(struct ipath_devdata *dd, u32 boardrev)
 			"PCIe linkspeed %u is incorrect; "
 			"should be 1 (2500)!\n", speed);
 
+	ipath_7220_tune_pcie_coalesce(dd);
+
 bail:
 	/* fill in string, even on errors */
 	snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
@@ -1342,7 +1467,8 @@ static int ipath_setup_7220_config(struct ipath_devdata *dd,
 	u32 boardrev;
 
 	dd->ipath_msi_lo = 0;	/* used as a flag during reset processing */
-#ifdef CONFIG_PCI_MSI
+
+	pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
 	if (!strcmp(int_type, "force_msi") || !strcmp(int_type, "auto"))
 		ret = pci_enable_msi(pdev);
 	if (ret) {
@@ -1357,7 +1483,7 @@ static int ipath_setup_7220_config(struct ipath_devdata *dd,
 		if (!strcmp(int_type, "auto"))
 			ipath_dev_err(dd, "pci_enable_msi failed: %d, "
 				      "falling back to INTx\n", ret);
-	} else if ((pos = pci_find_capability(pdev, PCI_CAP_ID_MSI))) {
+	} else if (pos) {
 		u16 control;
 		pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO,
 				      &dd->ipath_msi_lo);
@@ -1374,10 +1500,8 @@ static int ipath_setup_7220_config(struct ipath_devdata *dd,
 	} else
 		ipath_dev_err(dd, "Can't find MSI capability, "
 			      "can't save MSI settings for reset\n");
-#else
-	ipath_dbg("PCI_MSI not configured, using IntX interrupts\n");
-	ipath_enable_intx(pdev);
-#endif
+
+	dd->ipath_irq = pdev->irq;
 
 	/*
 	 * We save the cachelinesize also, although it doesn't
@@ -1397,7 +1521,7 @@ static int ipath_setup_7220_config(struct ipath_devdata *dd,
 
 	dd->ipath_flags |= IPATH_NODMA_RTAIL | IPATH_HAS_SEND_DMA |
 		IPATH_HAS_PBC_CNT | IPATH_HAS_THRESH_UPDATE;
-	dd->ipath_pioupd_thresh = 4U; /* set default update threshold */
+	dd->ipath_pioupd_thresh = 8U; /* set default update threshold */
 	return 0;
 }
 
@@ -1578,7 +1702,7 @@ static void ipath_init_7220_variables(struct ipath_devdata *dd)
 static int ipath_reinit_msi(struct ipath_devdata *dd)
 {
 	int ret = 0;
-#ifdef CONFIG_PCI_MSI
+
 	int pos;
 	u16 control;
 	if (!dd->ipath_msi_lo) /* Using intX, or init problem */
@@ -1612,10 +1736,10 @@ static int ipath_reinit_msi(struct ipath_devdata *dd)
 			      ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8),
 			      dd->ipath_msi_data);
 	ret = 1;
+
 bail:
-#endif
 	if (!ret) {
-		ipath_dbg("Using IntX, MSI disabled or not configured\n");
+		ipath_dbg("Using INTx, MSI disabled or not configured\n");
 		ipath_enable_intx(dd->pcidev);
 		ret = 1;
 	}
@@ -1727,7 +1851,7 @@ static void ipath_7220_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr,
 				 "not 2KB aligned!\n", pa);
 			return;
 		}
-		if (pa >= (1UL << IBA7220_TID_SZ_SHIFT)) {
+		if (chippa >= (1UL << IBA7220_TID_SZ_SHIFT)) {
 			ipath_dev_err(dd,
 				      "BUG: Physical page address 0x%lx "
 				      "larger than supported\n", pa);
@@ -1835,18 +1959,20 @@ static int ipath_7220_early_init(struct ipath_devdata *dd)
 		dd->ipath_control |= 1<<4;
 
 	dd->ipath_flags |= IPATH_4BYTE_TID;
-	if (ipath_special_trigger)
-		dd->ipath_flags |= IPATH_USE_SPCL_TRIG;
 
 	/*
 	 * For openfabrics, we need to be able to handle an IB header of
-	 * 24 dwords.  HT chip has arbitrary sized receive buffers, so we
-	 * made them the same size as the PIO buffers.  This chip does not
-	 * handle arbitrary size buffers, so we need the header large enough
-	 * to handle largest IB header, but still have room for a 2KB MTU
-	 * standard IB packet.
+	 * at least 24 dwords.  This chip does not handle arbitrary size
+	 * buffers, so we need the header large enough to handle largest
+	 * IB header, but still have room for a 2KB MTU standard IB packet.
+	 * Additionally, some processor/memory controller combinations
+	 * benefit quite strongly from having the DMA'ed data be cacheline
+	 * aligned and a cacheline multiple, so we set the size to 32 dwords
+	 * (2 64-byte primary cachelines for pretty much all processors of
+	 * interest).  The alignment hurts nothing, other than using somewhat
+	 * more memory.
 	 */
-	dd->ipath_rcvhdrentsize = 24;
+	dd->ipath_rcvhdrentsize = 32;
 	dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
 	dd->ipath_rhf_offset =
 		dd->ipath_rcvhdrentsize - sizeof(u64) / sizeof(u32);
@@ -1918,12 +2044,15 @@ static int ipath_7220_get_base_info(struct ipath_portdata *pd, void *kbase)
 		IPATH_RUNTIME_PCIE | IPATH_RUNTIME_NODMA_RTAIL |
 		IPATH_RUNTIME_SDMA;
 
-	if (ipath_special_trigger)
-		kinfo->spi_runtime_flags |= IPATH_RUNTIME_SPECIAL_TRIGGER;
-
 	return 0;
 }
 
+static void ipath_7220_free_irq(struct ipath_devdata *dd)
+{
+	free_irq(dd->ipath_irq, dd);
+	dd->ipath_irq = 0;
+}
+
 static struct ipath_message_header *
 ipath_7220_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
 {
@@ -1965,7 +2094,7 @@ static void ipath_7220_config_ports(struct ipath_devdata *dd, ushort cfgports)
 			 dd->ipath_rcvctrl);
 	dd->ipath_p0_rcvegrcnt = 2048; /* always */
 	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-		dd->ipath_pioreserved = 3; /* kpiobufs used for PIO */
+		dd->ipath_pioreserved = 8; /* kpiobufs used for PIO */
 }
 
 
@@ -2127,6 +2256,7 @@ static int ipath_7220_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
 	dd->ipath_ibcddrctrl |= (((u64) val & maskr) << lsb);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcddrctrl,
 			 dd->ipath_ibcddrctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 	if (setforce)
 		dd->ipath_flags |= IPATH_IB_FORCE_NOTIFY;
 bail:
@@ -2143,14 +2273,25 @@ static void ipath_7220_read_counters(struct ipath_devdata *dd,
 		counters[i] = ipath_snap_cntr(dd, i);
 }
 
-/* if we are using MSI, try to fallback to IntX */
+/* if we are using MSI, try to fallback to INTx */
 static int ipath_7220_intr_fallback(struct ipath_devdata *dd)
 {
 	if (dd->ipath_msi_lo) {
 		dev_info(&dd->pcidev->dev, "MSI interrupt not detected,"
-			" trying IntX interrupts\n");
+			" trying INTx interrupts\n");
 		ipath_7220_nomsi(dd);
 		ipath_enable_intx(dd->pcidev);
+		/*
+		 * Some newer kernels require free_irq before disable_msi,
+		 * and the irq can change during MSI disable and INTx
+		 * enable, so we must use the current pcidev->irq value,
+		 * not our saved MSI value.
+		 */
+		dd->ipath_irq = dd->pcidev->irq;
+		if (request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
+			IPATH_DRV_NAME, dd))
+			ipath_dev_err(dd,
+				"Could not re-request_irq for INTx\n");
 		return 1;
 	}
 	return 0;
@@ -2203,12 +2344,6 @@ static void autoneg_send(struct ipath_devdata *dd,
 	ipath_flush_wc();
 	__iowrite32_copy(piobuf + 2, hdr, 7);
 	__iowrite32_copy(piobuf + 9, data, dcnt);
-	if (dd->ipath_flags & IPATH_USE_SPCL_TRIG) {
-		u32 spcl_off = (pnum > dd->ipath_piobcnt2k) ?
-			2047 : 1023;
-		ipath_flush_wc();
-		__raw_writel(0xaebecede, piobuf + spcl_off);
-	}
 	ipath_flush_wc();
 }
 
@@ -2230,18 +2365,18 @@ static void ipath_autoneg_send(struct ipath_devdata *dd, int which)
 		0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
 		0x40000001, 0x1388, 0x15e, /* rest 0's */
 		};
-	dcnt = sizeof(madpayload_start)/sizeof(madpayload_start[0]);
-	hcnt = sizeof(hdr)/sizeof(hdr[0]);
+	dcnt = ARRAY_SIZE(madpayload_start);
+	hcnt = ARRAY_SIZE(hdr);
 	if (!swapped) {
 		/* for maintainability, do it at runtime */
 		for (i = 0; i < hcnt; i++) {
-			dw = cpu_to_be32(hdr[i]);
+			dw = (__force u32) cpu_to_be32(hdr[i]);
 			hdr[i] = dw;
 		}
 		for (i = 0; i < dcnt; i++) {
-			dw = cpu_to_be32(madpayload_start[i]);
+			dw = (__force u32) cpu_to_be32(madpayload_start[i]);
 			madpayload_start[i] = dw;
-			dw = cpu_to_be32(madpayload_done[i]);
+			dw = (__force u32) cpu_to_be32(madpayload_done[i]);
 			madpayload_done[i] = dw;
 		}
 		swapped = 1;
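
The ARRAY_SIZE() conversion above replaces the open-coded sizeof division with the kernel's idiomatic macro; a minimal userspace equivalent (without the kernel's extra type checking):

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    int main(void)
    {
            unsigned int hdr[7] = { 0 };

            /* compile-time element count, immune to element-type changes */
            printf("hcnt = %zu\n", ARRAY_SIZE(hdr));  /* prints 7 */
            return 0;
    }
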
@@ -2295,6 +2430,7 @@ static void set_speed_fast(struct ipath_devdata *dd, u32 speed)
 		IBA7220_IBC_WIDTH_SHIFT;
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcddrctrl,
 			dd->ipath_ibcddrctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 	ipath_cdbg(VERBOSE, "setup for IB speed (%x) done\n", speed);
 }
 
@@ -2314,6 +2450,7 @@ static void try_auto_neg(struct ipath_devdata *dd)
 	 */
 	ipath_write_kreg(dd, IPATH_KREG_OFFSET(IBNCModeCtrl),
 		0x3b9dc07);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 	dd->ipath_flags |= IPATH_IB_AUTONEG_INPROG;
 	ipath_autoneg_send(dd, 0);
 	set_speed_fast(dd, IPATH_IB_DDR);
@@ -2326,7 +2463,7 @@ static void try_auto_neg(struct ipath_devdata *dd)
 
 static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
 {
-	int ret = 0;
+	int ret = 0, symadj = 0;
 	u32 ltstate = ipath_ib_linkstate(dd, ibcs);
 
 	dd->ipath_link_width_active =
@@ -2369,6 +2506,13 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
 			ipath_dbg("DDR negotiation try, %u/%u\n",
 				dd->ipath_autoneg_tries,
 				IPATH_AUTONEG_TRIES);
+			if (!dd->ibdeltainprog) {
+				dd->ibdeltainprog = 1;
+				dd->ibsymsnap = ipath_read_creg32(dd,
+					dd->ipath_cregs->cr_ibsymbolerrcnt);
+				dd->iblnkerrsnap = ipath_read_creg32(dd,
+					dd->ipath_cregs->cr_iblinkerrrecovcnt);
+			}
 			try_auto_neg(dd);
 			ret = 1; /* no other IB status change processing */
 		} else if ((dd->ipath_flags & IPATH_IB_AUTONEG_INPROG)
@@ -2389,6 +2533,7 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
 				set_speed_fast(dd,
 					dd->ipath_link_speed_enabled);
 				wake_up(&dd->ipath_autoneg_wait);
+				symadj = 1;
 			} else if (dd->ipath_flags & IPATH_IB_AUTONEG_FAILED) {
 				/*
 				 * clear autoneg failure flag, and do setup
@@ -2404,22 +2549,31 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
 					IBA7220_IBC_IBTA_1_2_MASK;
 				ipath_write_kreg(dd,
 					IPATH_KREG_OFFSET(IBNCModeCtrl), 0);
+				ipath_write_kreg(dd,
+					dd->ipath_kregs->kr_scratch,
+					0xfeedbeef);
+				symadj = 1;
 			}
 		}
 		/*
-		 * if we are in 1X, and are in autoneg width, it
-		 * could be due to an xgxs problem, so if we haven't
+		 * if we are in 1X on rev1 only, and are in autoneg width,
+		 * it could be due to an xgxs problem, so if we haven't
 		 * already tried, try twice to get to 4X; if we
 		 * tried, and couldn't, report it, since it will
 		 * probably not be what is desired.
 		 */
-		if ((dd->ipath_link_width_enabled & (IB_WIDTH_1X |
+		if (dd->ipath_minrev == 1 &&
+		    (dd->ipath_link_width_enabled & (IB_WIDTH_1X |
 			IB_WIDTH_4X)) == (IB_WIDTH_1X | IB_WIDTH_4X)
 			&& dd->ipath_link_width_active == IB_WIDTH_1X
 			&& dd->ipath_x1_fix_tries < 3) {
-			if (++dd->ipath_x1_fix_tries == 3)
+			if (++dd->ipath_x1_fix_tries == 3) {
 				dev_info(&dd->pcidev->dev,
 					"IB link is in 1X mode\n");
+				if (!(dd->ipath_flags &
+				      IPATH_IB_AUTONEG_INPROG))
+					symadj = 1;
+			}
 			else {
 				ipath_cdbg(VERBOSE, "IB 1X in "
 					"auto-width, try %u to be "
@@ -2430,7 +2584,8 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
 				dd->ipath_f_xgxs_reset(dd);
 				ret = 1; /* skip other processing */
 			}
-		}
+		} else if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG))
+			symadj = 1;
 
 		if (!ret) {
 			dd->delay_mult = rate_to_delay
@@ -2441,6 +2596,25 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
 		}
 	}
 
+	if (symadj) {
+		if (dd->ibdeltainprog) {
+			dd->ibdeltainprog = 0;
+			dd->ibsymdelta += ipath_read_creg32(dd,
+				dd->ipath_cregs->cr_ibsymbolerrcnt) -
+				dd->ibsymsnap;
+			dd->iblnkerrdelta += ipath_read_creg32(dd,
+				dd->ipath_cregs->cr_iblinkerrrecovcnt) -
+				dd->iblnkerrsnap;
+		}
+	} else if (!ibup && !dd->ibdeltainprog
+		   && !(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG)) {
+		dd->ibdeltainprog = 1;
+		dd->ibsymsnap =	ipath_read_creg32(dd,
+				     dd->ipath_cregs->cr_ibsymbolerrcnt);
+		dd->iblnkerrsnap = ipath_read_creg32(dd,
+				     dd->ipath_cregs->cr_iblinkerrrecovcnt);
+	}
+
 	if (!ret)
 		ipath_setup_7220_setextled(dd, ipath_ib_linkstate(dd, ibcs),
 			ltstate);
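
The ibdeltainprog/ibsymsnap bookkeeping added in this hunk is a snapshot-then-delta pattern: snapshot the hardware counter when entering a window where errors are expected (autonegotiation, link down), and accumulate only the difference on exit so the noise can be subtracted later. A minimal model of the pattern with a fake counter (names here are illustrative, not the driver's):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t hw_counter;      /* stands in for cr_ibsymbolerrcnt */

    struct delta_state {
            int      inprog;
            uint32_t snap;
            uint64_t delta;
    };

    static void window_enter(struct delta_state *s)
    {
            if (!s->inprog) {
                    s->inprog = 1;
                    s->snap = hw_counter;   /* snapshot at window start */
            }
    }

    static void window_exit(struct delta_state *s)
    {
            if (s->inprog) {
                    s->inprog = 0;
                    s->delta += hw_counter - s->snap; /* errors in window */
            }
    }

    int main(void)
    {
            struct delta_state s = { 0 };

            hw_counter = 5;
            window_enter(&s);
            hw_counter = 9;          /* 4 errors during negotiation */
            window_exit(&s);
            printf("delta to subtract at unload: %llu\n",
                   (unsigned long long)s.delta);
            return 0;
    }
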
@@ -2507,7 +2681,7 @@ done:
 	if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
 		ipath_dbg("Did not get to DDR INIT (%x) after %Lu msecs\n",
 			ipath_ib_state(dd, dd->ipath_lastibcstat),
-			jiffies_to_msecs(jiffies)-startms);
+			(unsigned long long) jiffies_to_msecs(jiffies)-startms);
 		dd->ipath_flags &= ~IPATH_IB_AUTONEG_INPROG;
 		if (dd->ipath_autoneg_tries == IPATH_AUTONEG_TRIES) {
 			dd->ipath_flags |= IPATH_IB_AUTONEG_FAILED;
@@ -2543,6 +2717,7 @@ void ipath_init_iba7220_funcs(struct ipath_devdata *dd)
 	dd->ipath_f_cleanup = ipath_setup_7220_cleanup;
 	dd->ipath_f_setextled = ipath_setup_7220_setextled;
 	dd->ipath_f_get_base_info = ipath_7220_get_base_info;
+	dd->ipath_f_free_irq = ipath_7220_free_irq;
 	dd->ipath_f_tidtemplate = ipath_7220_tidtemplate;
 	dd->ipath_f_intr_fallback = ipath_7220_intr_fallback;
 	dd->ipath_f_xgxs_reset = ipath_7220_xgxs_reset;
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
index edd5c92..c01ff2a 100644
--- a/drivers/infiniband/hw/ipath/ipath_init_chip.c
+++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c
@@ -37,6 +37,7 @@
 
 #include "ipath_kernel.h"
 #include "ipath_common.h"
+#include "ipath_wc_pat.h"
 
 /*
  * min buffers we want to have per port, after driver
@@ -126,7 +127,7 @@ static int create_port0_egr(struct ipath_devdata *dd)
 		dd->ipath_port0_skbinfo[e].phys =
 		  ipath_map_single(dd->pcidev,
 				   dd->ipath_port0_skbinfo[e].skb->data,
-				   dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
+				   dd->ipath_init_ibmaxlen, PCI_DMA_FROMDEVICE);
 		dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
 				    ((char __iomem *) dd->ipath_kregbase +
 				     dd->ipath_rcvegrbase),
@@ -220,6 +221,131 @@ static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
 	return pd;
 }
 
+static int init_chip_wc_pat(struct ipath_devdata *dd)
+{
+	int ret = 0;
+	u64 __iomem *ipath_kregbase = NULL;
+	void __iomem *ipath_piobase = NULL;
+	u64 __iomem *ipath_userbase = NULL;
+	u64 ipath_kreglen;
+	u64 ipath_pio2koffset = dd->ipath_piobufbase & 0xffffffff;
+	u64 ipath_pio4koffset = dd->ipath_piobufbase >> 32;
+	u64 ipath_pio2klen = dd->ipath_piobcnt2k * dd->ipath_palign;
+	u64 ipath_pio4klen = dd->ipath_piobcnt4k * dd->ipath_4kalign;
+	u64 ipath_physaddr = dd->ipath_physaddr;
+	u64 ipath_piolen;
+	u64 ipath_userlen = 0;
+
+	/* Assumes chip address space looks like:
+	 *	- kregs + sregs + cregs + uregs (in any order)
+	 *	- piobufs (2K and 4K bufs in either order)
+	 * or:
+	 *	- kregs + sregs + cregs (in any order)
+	 *	- piobufs (2K and 4K bufs in either order)
+	 *	- uregs
+	 */
+	if (dd->ipath_piobcnt4k == 0) {
+		ipath_kreglen = ipath_pio2koffset;
+		ipath_piolen = ipath_pio2klen;
+	} else if (ipath_pio2koffset < ipath_pio4koffset) {
+		ipath_kreglen = ipath_pio2koffset;
+		ipath_piolen = ipath_pio4koffset + ipath_pio4klen -
+			ipath_kreglen;
+	} else {
+		ipath_kreglen = ipath_pio4koffset;
+		ipath_piolen = ipath_pio2koffset + ipath_pio2klen -
+			ipath_kreglen;
+	}
+	if (dd->ipath_sregbase > ipath_kreglen) {
+		ipath_dbg("Unexpected sregbase layout\n");
+		ret = -EINVAL;
+		goto done;
+	}
+	if (dd->ipath_cregbase > ipath_kreglen) {
+		ipath_dbg("Unexpected cregbase layout\n");
+		ret = -EINVAL;
+		goto done;
+	}
+	if (dd->ipath_uregbase > ipath_kreglen)
+		/* Map just the configured ports (not all hw ports) */
+		ipath_userlen = dd->ipath_ureg_align *
+				dd->ipath_cfgports;
+
+	/* Sanity checks passed, now create the new mappings */
+	ipath_kregbase = ioremap_nocache(ipath_physaddr,
+					 ipath_kreglen);
+	if (!ipath_kregbase) {
+		ipath_dbg("Unable to remap io addr %llx to kvirt\n",
+			  ipath_physaddr);
+		ret = -ENOMEM;
+		goto done;
+	}
+	ipath_cdbg(VERBOSE, "WC PAT remapped io addr %llx"
+		   " to kregbase %p for %llu bytes\n",
+		   ipath_physaddr, ipath_kregbase, ipath_kreglen);
+
+	ipath_piobase = (void __iomem *) ioremap_wc(
+				ipath_physaddr + ipath_kreglen,
+				ipath_piolen);
+	if (!ipath_piobase) {
+		ipath_dbg("Unable to remap io addr %llx to kvirt\n",
+			  ipath_physaddr + ipath_kreglen);
+		ret = -ENOMEM;
+		goto done_kregbase;
+	}
+	ipath_cdbg(VERBOSE, "WC PAT remapped io addr %llx"
+		   " to piobase %p for %llu bytes\n",
+		   ipath_physaddr + ipath_kreglen,
+		   ipath_piobase, ipath_piolen);
+
+	if (ipath_userlen) {
+		ipath_userbase = (void __iomem *) ioremap_nocache(
+					ipath_physaddr +
+					dd->ipath_uregbase,
+					ipath_userlen);
+		if (!ipath_userbase) {
+			ipath_dbg("Unable to remap io addr %llx "
+				  "to kvirt\n",
+				  ipath_physaddr + dd->ipath_uregbase);
+			ret = -ENOMEM;
+			goto done_piobase;
+		}
+		ipath_cdbg(VERBOSE, "WC PAT remapped io addr %llx"
+			   " to userbase %p for %llu bytes\n",
+			   ipath_physaddr + dd->ipath_uregbase,
+			   ipath_userbase, ipath_userlen);
+	}
+
+	/* All remapping successful, get rid of old mapping */
+	iounmap((volatile void __iomem *) dd->ipath_kregbase);
+
+	/* Finally update dd with the changes */
+	dd->ipath_kregbase = ipath_kregbase;
+	dd->ipath_kregend = (u64 __iomem *)
+		((char __iomem *) ipath_kregbase + ipath_kreglen);
+	dd->ipath_piobase = ipath_piobase;
+	dd->ipath_pio2kbase = (void __iomem *)
+		(((char __iomem *) dd->ipath_piobase) +
+		 ipath_pio2koffset - ipath_kreglen);
+	if (dd->ipath_piobcnt4k)
+		dd->ipath_pio4kbase = (void __iomem *)
+			(((char __iomem *) dd->ipath_piobase) +
+			 ipath_pio4koffset - ipath_kreglen);
+	if (ipath_userlen)
+		/* ureg will now be accessed relative to dd->ipath_userbase */
+		dd->ipath_userbase = ipath_userbase;
+	goto done;
+
+done_piobase:
+	iounmap((volatile void __iomem *) ipath_piobase);
+
+done_kregbase:
+	iounmap((volatile void __iomem *) ipath_kregbase);
+
+done:
+	return ret;
+}
+
 static int init_chip_first(struct ipath_devdata *dd)
 {
 	struct ipath_portdata *pd;
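
The offset arithmetic at the top of init_chip_wc_pat() above splits the BAR into a register region (to be mapped uncached) and a PIO-buffer region (to be mapped write-combining), keyed on which buffer group comes first. A standalone sketch of that split with made-up offsets (the piobcnt4k == 0 case is elided):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* hypothetical chip layout: regs, then 4K bufs, then 2K bufs */
            uint64_t pio2koff = 0x30000, pio2klen = 0x10000;
            uint64_t pio4koff = 0x20000, pio4klen = 0x10000;
            uint64_t kreglen, piolen;

            if (pio2koff < pio4koff) {
                    kreglen = pio2koff;
                    piolen  = pio4koff + pio4klen - kreglen;
            } else {
                    kreglen = pio4koff;
                    piolen  = pio2koff + pio2klen - kreglen;
            }
            /* regs: [0, kreglen) uncached; PIO: [kreglen, kreglen+piolen) WC */
            printf("kregs %#llx bytes, pio %#llx bytes\n",
                   (unsigned long long)kreglen, (unsigned long long)piolen);
            return 0;
    }
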
@@ -229,6 +355,7 @@ static int init_chip_first(struct ipath_devdata *dd)
 	spin_lock_init(&dd->ipath_kernel_tid_lock);
 	spin_lock_init(&dd->ipath_user_tid_lock);
 	spin_lock_init(&dd->ipath_sendctrl_lock);
+	spin_lock_init(&dd->ipath_uctxt_lock);
 	spin_lock_init(&dd->ipath_sdma_lock);
 	spin_lock_init(&dd->ipath_gpio_lock);
 	spin_lock_init(&dd->ipath_eep_st_lock);
@@ -314,6 +441,15 @@ static int init_chip_first(struct ipath_devdata *dd)
 		 */
 		dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
 					  dd->ipath_palign);
+	}
+
+	if (ipath_wc_pat) {
+		ret = init_chip_wc_pat(dd);
+		if (ret)
+			goto done;
+	}
+
+	if (dd->ipath_piobcnt4k) {
 		ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
 			  "(%x aligned)\n",
 			  dd->ipath_piobcnt2k, dd->ipath_piosize2k,
@@ -483,8 +619,6 @@ static void enable_chip(struct ipath_devdata *dd, int reinit)
 	/* Enable PIO send, and update of PIOavail regs to memory. */
 	dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
 		INFINIPATH_S_PIOBUFAVAILUPD;
-	if (dd->ipath_flags & IPATH_USE_SPCL_TRIG)
-		dd->ipath_sendctrl |= INFINIPATH_S_SPECIALTRIGGER;
 
 	/*
 	 * Set the PIO avail update threshold to host memory
@@ -535,21 +669,21 @@ static void enable_chip(struct ipath_devdata *dd, int reinit)
 	 * initial values of the generation bit correct.
 	 */
 	for (i = 0; i < dd->ipath_pioavregs; i++) {
-		__le64 tmp;
+		__le64 pioavail;
 
 		/*
 		 * Chip Errata bug 6641; even and odd qwords>3 are swapped.
 		 */
 		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
-			tmp = dd->ipath_pioavailregs_dma[i ^ 1];
+			pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
 		else
-			tmp = dd->ipath_pioavailregs_dma[i];
+			pioavail = dd->ipath_pioavailregs_dma[i];
 		/*
 		 * don't need to worry about ipath_pioavailkernel here
 		 * because we will call ipath_chg_pioavailkernel() later
 		 * in initialization, to busy out buffers as needed
 		 */
-		dd->ipath_pioavailshadow[i] = le64_to_cpu(tmp);
+		dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
 	}
 	/* can get counters, stats, etc. */
 	dd->ipath_flags |= IPATH_PRESENT;
@@ -655,10 +789,7 @@ static int init_housekeeping(struct ipath_devdata *dd, int reinit)
 			    INFINIPATH_R_SOFTWARE_SHIFT) &
 		 INFINIPATH_R_SOFTWARE_MASK);
 
-	if (dd->ipath_revision & INFINIPATH_R_EMULATOR_MASK)
-		dev_info(&dd->pcidev->dev, "%s", dd->ipath_boardversion);
-	else
-		ipath_dbg("%s", dd->ipath_boardversion);
+	ipath_dbg("%s", dd->ipath_boardversion);
 
 	if (ret)
 		goto done;
@@ -672,7 +803,6 @@ done:
 	return ret;
 }
 
-
 static void verify_interrupt(unsigned long opaque)
 {
 	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
@@ -681,7 +811,7 @@ static void verify_interrupt(unsigned long opaque)
 		return; /* being torn down */
 
 	/*
-	 * If we don't have a lid or any interrupts, let the user know and
+	 * If we don't have any interrupts, let the user know and
 	 * don't bother checking again.
 	 */
 	if (dd->ipath_int_counter == 0) {
@@ -695,7 +825,6 @@ static void verify_interrupt(unsigned long opaque)
 			dd->ipath_int_counter);
 }
 
-
 /**
  * ipath_init_chip - do the actual initialization sequence on the chip
  * @dd: the infinipath device
@@ -793,11 +922,17 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit)
 			"ports <= %u\n", dd->ipath_pbufsport,
 			dd->ipath_ports_extrabuf);
 	dd->ipath_lastpioindex = 0;
-	dd->ipath_lastpioindexl = dd->ipath_lastport_piobuf;
+	dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
 	/* ipath_pioavailshadow initialized earlier */
 	ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
 		   "each for %u user ports\n", kpiobufs,
 		   piobufs, dd->ipath_pbufsport, uports);
+	if (dd->ipath_pioupd_thresh &&
+		(dd->ipath_pioupd_thresh > dd->ipath_pbufsport - 2)) {
+		dd->ipath_pioupd_thresh = dd->ipath_pbufsport - 2;
+		ipath_cdbg(VERBOSE, "Drop pioupd_thresh to %u\n",
+			dd->ipath_pioupd_thresh);
+	}
 	ret = dd->ipath_f_early_init(dd);
 	if (ret) {
 		ipath_dev_err(dd, "Early initialization failure\n");
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
index b3445e9..e73afd9 100644
--- a/drivers/infiniband/hw/ipath/ipath_intr.c
+++ b/drivers/infiniband/hw/ipath/ipath_intr.c
@@ -243,7 +243,6 @@ static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
 	return ret;
 }
 
-
 void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
 {
 	struct ib_event event;
@@ -269,7 +268,8 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
 	lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
 	ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */
 
-	/* Since going into a recovery state causes the link state to go
+	/*
+	 * Since going into a recovery state causes the link state to go
 	 * down and since recovery is transitory, it is better if we "miss"
 	 * ever seeing the link training state go into recovery (i.e.,
 	 * ignore this transition for link state special handling purposes)
@@ -328,12 +328,12 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
 		 * Ignore cycling back and forth from Polling.Active to
 		 * Polling.Quiet while waiting for the other end of the link
 		 * to come up, except to try and decide if we are connected
-		 * to a live IB device or not.  We will * cycle back and
+		 * to a live IB device or not.  We will cycle back and
 		 * forth between them if no cable is plugged in, the other
 		 * device is powered off or disabled, etc.
 		 */
-		if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE
-		    || lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+		if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+		    lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
 			if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
 			     (++dd->ipath_ibpollcnt == 40)) {
 				dd->ipath_flags |= IPATH_NOCABLE;
@@ -350,16 +350,16 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
 	dd->ipath_ibpollcnt = 0; /* not poll*, now */
 	ipath_stats.sps_iblink++;
 
-	if (ibstate != init && dd->ipath_lastlinkrecov &&
-		ipath_linkrecovery) {
+	if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
 		u64 linkrecov;
 		linkrecov = ipath_snap_cntr(dd,
 			dd->ipath_cregs->cr_iblinkerrrecovcnt);
 		if (linkrecov != dd->ipath_lastlinkrecov) {
 			ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
-				ibcs, ib_linkstate(dd, ibcs),
+				(unsigned long long) ibcs,
+				ib_linkstate(dd, ibcs),
 				ipath_ibcstatus_str[ltstate],
-				linkrecov);
+				(unsigned long long) linkrecov);
 			/* and no more until active again */
 			dd->ipath_lastlinkrecov = 0;
 			ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
@@ -449,9 +449,8 @@ done:
 	return;
 }
 
-
 static void handle_supp_msgs(struct ipath_devdata *dd,
-	     unsigned supp_msgs, char *msg, u32 msgsz)
+			     unsigned supp_msgs, char *msg, u32 msgsz)
 {
 	/*
 	 * Print the message unless it's ibc status change only, which
@@ -461,8 +460,8 @@ static void handle_supp_msgs(struct ipath_devdata *dd,
 		int iserr;
 		ipath_err_t mask;
 		iserr = ipath_decode_err(dd, msg, msgsz,
-				dd->ipath_lasterror &
-				~INFINIPATH_E_IBSTATUSCHANGED);
+					 dd->ipath_lasterror &
+					 ~INFINIPATH_E_IBSTATUSCHANGED);
 
 		mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
 			INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
@@ -550,13 +549,20 @@ static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
 		ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
 			"lengen 0x%lx\n", tl, hd, status, lengen);
 	}
-
+	expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+	ipath_dbg("%sxpected sdma error, sdma_status 0x%lx\n",
+		expected ? "e" : "une", dd->ipath_sdma_status);
+	/*
+	 * we are in interrupt context (and only one interrupt vector),
+	 * so we won't get another interrupt and process the sdma state
+	 * change before the set_bit of SDMA_DISABLED.  We set DISABLED
+	 * here because there are cases where abort_task will not.
+	 */
+	if (!expected) /* must be prior to setting SDMA_DISABLED */
+		ipath_cancel_sends(dd, 1);
 	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
 	__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-	expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
 	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-	if (!expected)
-		ipath_cancel_sends(dd, 1);
 }
 
 static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
@@ -571,13 +577,19 @@ static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
 	if (istat & INFINIPATH_I_SDMADISABLED) {
 		expected = test_bit(IPATH_SDMA_ABORTING,
 			&dd->ipath_sdma_status);
-		ipath_dbg("%s SDmaDisabled intr\n",
-			expected ? "expected" : "unexpected");
+		ipath_dbg("%sxpected sdma disabled intr, sdma_status 0x%lx\n",
+			expected ? "e" : "une", dd->ipath_sdma_status);
+		/*
+		 * we are in interrupt context (and only one interrupt vector),
+		 * so we won't get another interrupt and process the sdma state
+		 * change before the set_bit of SDMA_DISABLED.  We set DISABLED
+		 * here because there are cases where abort_task will not.
+		 */
+		if (!expected) /* must be prior to setting SDMA_DISABLED */
+			ipath_cancel_sends(dd, 1);
 		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
 		__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
 		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-		if (!expected)
-			ipath_cancel_sends(dd, 1);
 		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
 			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
 	}
@@ -588,8 +600,10 @@ static int handle_hdrq_full(struct ipath_devdata *dd)
 	int chkerrpkts = 0;
 	u32 hd, tl;
 	u32 i;
+	unsigned long flags;
 
 	ipath_stats.sps_hdrqfull++;
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
 	for (i = 0; i < dd->ipath_cfgports; i++) {
 		struct ipath_portdata *pd = dd->ipath_pd[i];
 
@@ -625,6 +639,7 @@ static int handle_hdrq_full(struct ipath_devdata *dd)
 			wake_up_interruptible(&pd->port_wait);
 		}
 	}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
 
 	return chkerrpkts;
 }
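
The new ipath_uctxt_lock taken here (and in handle_urcv() below) closes a race between the interrupt path walking ipath_pd[] and port teardown freeing entries. A userspace analogue of the pattern, with a pthread mutex standing in for the irq-safe spinlock:

    #include <pthread.h>
    #include <stdlib.h>
    #include <stdio.h>

    #define NPORTS 4

    static pthread_mutex_t uctxt_lock = PTHREAD_MUTEX_INITIALIZER;
    static int *ports[NPORTS];           /* stands in for dd->ipath_pd[] */

    static void irq_path(void)           /* e.g. handle_hdrq_full() */
    {
            pthread_mutex_lock(&uctxt_lock);
            for (int i = 0; i < NPORTS; i++)
                    if (ports[i])
                            (*ports[i])++;  /* safe: entry can't vanish */
            pthread_mutex_unlock(&uctxt_lock);
    }

    static void teardown(int i)          /* e.g. port close */
    {
            pthread_mutex_lock(&uctxt_lock);
            free(ports[i]);
            ports[i] = NULL;             /* clear under the same lock */
            pthread_mutex_unlock(&uctxt_lock);
    }

    int main(void)
    {
            ports[0] = calloc(1, sizeof(int));
            irq_path();
            teardown(0);
            irq_path();                  /* sees NULL, skips freed port */
            printf("ok\n");
            return 0;
    }
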
@@ -939,14 +954,14 @@ static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
 			 * linuxbios development work, and it may happen in
 			 * the future again.
 			 */
-			if (dd->pcidev && dd->pcidev->irq) {
+			if (dd->pcidev && dd->ipath_irq) {
 				ipath_dev_err(dd, "Now %u unexpected "
 					      "interrupts, unregistering "
 					      "interrupt handler\n",
 					      *unexpectp);
-				ipath_dbg("free_irq of irq %x\n",
-					  dd->pcidev->irq);
-				free_irq(dd->pcidev->irq, dd);
+				ipath_dbg("free_irq of irq %d\n",
+					  dd->ipath_irq);
+				dd->ipath_f_free_irq(dd);
 			}
 		}
 		if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
@@ -982,7 +997,7 @@ static noinline void ipath_bad_regread(struct ipath_devdata *dd)
 		if (allbits == 2) {
 			ipath_dev_err(dd, "Still bad interrupt status, "
 				      "unregistering interrupt\n");
-			free_irq(dd->pcidev->irq, dd);
+			dd->ipath_f_free_irq(dd);
 		} else if (allbits > 2) {
 			if ((allbits % 10000) == 0)
 				printk(".");
@@ -1011,7 +1026,6 @@ set:
 	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 }
 
-
 /*
  * Handle receive interrupts for user ports; this means a user
  * process was waiting for a packet to arrive, and didn't want
@@ -1022,6 +1036,7 @@ static void handle_urcv(struct ipath_devdata *dd, u64 istat)
 	u64 portr;
 	int i;
 	int rcvdint = 0;
+	unsigned long flags;
 
 	/*
 	 * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
@@ -1037,6 +1052,7 @@ static void handle_urcv(struct ipath_devdata *dd, u64 istat)
 		 dd->ipath_i_rcvavail_mask) |
 		((istat >> dd->ipath_i_rcvurg_shift) &
 		 dd->ipath_i_rcvurg_mask);
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
 	for (i = 1; i < dd->ipath_cfgports; i++) {
 		struct ipath_portdata *pd = dd->ipath_pd[i];
 
@@ -1054,6 +1070,8 @@ static void handle_urcv(struct ipath_devdata *dd, u64 istat)
 			}
 		}
 	}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
 	if (rcvdint) {
 		/* only want to take one interrupt, so turn off the rcv
 		 * interrupt for all the ports that we set the rcv_waiting
@@ -1121,9 +1139,11 @@ irqreturn_t ipath_intr(int irq, void *data)
 	if (unlikely(istat & ~dd->ipath_i_bitsextant))
 		ipath_dev_err(dd,
 			      "interrupt with unknown interrupts %Lx set\n",
+			      (unsigned long long)
 			      istat & ~dd->ipath_i_bitsextant);
 	else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
-		ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n", istat);
+		ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
+			(unsigned long long) istat);
 
 	if (istat & INFINIPATH_I_ERROR) {
 		ipath_stats.sps_errints++;
@@ -1131,7 +1151,8 @@ irqreturn_t ipath_intr(int irq, void *data)
 					  dd->ipath_kregs->kr_errorstatus);
 		if (!estat)
 			dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
-				 "but no error bits set!\n", istat);
+				 "but no error bits set!\n",
+				 (unsigned long long) istat);
 		else if (estat == -1LL)
 			/*
 			 * should we try clearing all, or hope next read
@@ -1238,17 +1259,14 @@ irqreturn_t ipath_intr(int irq, void *data)
 	 * waiting for receive are at the bottom.
 	 */
 	kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
-		(1ULL << dd->ipath_i_rcvurg_shift) |
-		INFINIPATH_I_JINT;
+		(1ULL << dd->ipath_i_rcvurg_shift);
 	if (chk0rcv || (istat & kportrbits)) {
 		istat &= ~kportrbits;
 		ipath_kreceive(dd->ipath_pd[0]);
 	}
 
-	if (istat & ((dd->ipath_i_rcvavail_mask <<
-		      dd->ipath_i_rcvavail_shift)
-		     | (dd->ipath_i_rcvurg_mask <<
-			dd->ipath_i_rcvurg_shift)))
+	if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
+		     (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
 		handle_urcv(dd, istat);
 
 	if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
index 7754f5e..65f9234 100644
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -43,7 +43,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/io.h>
 #include <rdma/ib_verbs.h>
 
@@ -102,7 +102,6 @@ struct ipath_portdata {
 	/* mmap of hdrq, must fit in 44 bits */
 	dma_addr_t port_rcvhdrq_phys;
 	dma_addr_t port_rcvhdrqtailaddr_phys;
-
 	/*
 	 * number of opens (including slave subports) on this instance
 	 * (ignoring forks, dup, etc. for now)
@@ -211,7 +210,6 @@ struct ipath_sdma_txreq {
 	};
 	void              (*callback)(void *, int);
 	void               *callback_cookie;
-	int                 callback_status;
 	u16                 start_idx;  /* sdma private */
 	u16                 next_descq_idx;  /* sdma private */
 	struct list_head    list;       /* sdma private */
@@ -227,13 +225,17 @@ struct ipath_sdma_desc {
 #define IPATH_SDMA_TXREQ_F_FREEBUF      0x8
 #define IPATH_SDMA_TXREQ_F_FREEDESC     0x10
 #define IPATH_SDMA_TXREQ_F_VL15         0x20
-#define IPATH_SDMA_TXREQ_F_FREECNT_HACK 0x40 /*XXX*/
 
 #define IPATH_SDMA_TXREQ_S_OK        0
 #define IPATH_SDMA_TXREQ_S_SENDERROR 1
 #define IPATH_SDMA_TXREQ_S_ABORTED   2
 #define IPATH_SDMA_TXREQ_S_SHUTDOWN  3
 
+#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG	(1ull << 63)
+#define IPATH_SDMA_STATUS_ABORT_IN_PROG			(1ull << 62)
+#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE		(1ull << 61)
+#define IPATH_SDMA_STATUS_SCB_EMPTY			(1ull << 30)
+
 /* max dwords in small buffer packet */
 #define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
 
@@ -275,6 +277,10 @@ struct ipath_devdata {
 	void __iomem *ipath_pio2kbase;
 	/* kvirt address of 1st 4k pio buffer */
 	void __iomem *ipath_pio4kbase;
+	/* mem-mapped pointer to base of PIO buffers (if using WC PAT) */
+	void __iomem *ipath_piobase;
+	/* mem-mapped pointer to base of user chip regs (if using WC PAT) */
+	u64 __iomem *ipath_userbase;
 	/*
 	 * points to area where PIOavail registers will be DMA'ed.
 	 * Has to be on a page of it's own, because the page will be
@@ -310,6 +316,8 @@ struct ipath_devdata {
 	void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
 	/* fill out chip-specific fields */
 	int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
+	/* free irq */
+	void (*ipath_f_free_irq)(struct ipath_devdata *);
 	struct ipath_message_header *(*ipath_f_get_msgheader)
 					(struct ipath_devdata *, __le32 *);
 	void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
@@ -350,6 +358,19 @@ struct ipath_devdata {
 	/* errors masked because they occur too fast */
 	ipath_err_t ipath_maskederrs;
 	u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
+	/* these 5 fields are used to establish deltas for IB Symbol
+	 * errors and linkrecovery errors. They can be reported on
+	 * some chips during link negotiation prior to INIT, and with
+	 * DDR when faking DDR negotiations with non-IBTA switches.
+	 * The chip counters are adjusted at driver unload if there is
+	 * a non-zero delta.
+	 */
+	u64 ibdeltainprog;
+	u64 ibsymdelta;
+	u64 ibsymsnap;
+	u64 iblnkerrdelta;
+	u64 iblnkerrsnap;
+
 	/* time in jiffies at which to re-enable maskederrs */
 	unsigned long ipath_unmasktime;
 	/* count of egrfull errors, combined for all ports */
@@ -376,7 +397,6 @@ struct ipath_devdata {
 	u32 ipath_lastport_piobuf;
 	/* is a stats timer active */
 	u32 ipath_stats_timer_active;
-	u32 ipath_link_timer_active;
 	/* number of interrupts for this device -- saturates... */
 	u32 ipath_int_counter;
 	/* dwords sent read from counter */
@@ -460,6 +480,8 @@ struct ipath_devdata {
 	spinlock_t ipath_kernel_tid_lock;
 	spinlock_t ipath_user_tid_lock;
 	spinlock_t ipath_sendctrl_lock;
+	/* around ipath_pd and (user ports) port_cnt use (intr vs free) */
+	spinlock_t ipath_uctxt_lock;
 
 	/*
 	 * IPATH_STATUS_*,
@@ -477,7 +499,6 @@ struct ipath_devdata {
 	struct class_device *diag_class_dev;
 	/* timer used to prevent stats overflow, error throttling, etc. */
 	struct timer_list ipath_stats_timer;
-	struct timer_list ipath_link_timer;
 	/* timer to verify interrupts work, and fallback if possible */
 	struct timer_list ipath_intrchk_timer;
 	void *ipath_dummy_hdrq;	/* used after port close */
@@ -485,7 +506,7 @@ struct ipath_devdata {
 
 	/* SendDMA related entries */
 	spinlock_t            ipath_sdma_lock;
-	u64                   ipath_sdma_status;
+	unsigned long         ipath_sdma_status;
 	unsigned long         ipath_sdma_abort_jiffies;
 	unsigned long         ipath_sdma_abort_intr_timeout;
 	unsigned long         ipath_sdma_buf_jiffies;
@@ -500,9 +521,7 @@ struct ipath_devdata {
 	u16                   ipath_sdma_reset_wait;
 	u8                    ipath_sdma_generation;
 	struct tasklet_struct ipath_sdma_abort_task;
-	struct tasklet_struct ipath_sdma_notify_task;
 	struct list_head      ipath_sdma_activelist;
-	struct list_head      ipath_sdma_notifylist;
 	atomic_t              ipath_sdma_vl15_count;
 	struct timer_list     ipath_sdma_vl15_timer;
 
@@ -719,9 +738,6 @@ struct ipath_devdata {
 	u32 ipath_overrun_thresh_errs;
 	u32 ipath_lli_errs;
 
-	/* status check work */
-	struct delayed_work status_work;
-
 	/*
 	 * Not all devices managed by a driver instance are the same
 	 * type, so these fields must be per-device.
@@ -815,7 +831,7 @@ struct ipath_devdata {
 	u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
 };
 
-/* ipath_hol_state values (stopping/starting user proc, send flushing */
+/* ipath_hol_state values (stopping/starting user proc, send flushing) */
 #define IPATH_HOL_UP       0
 #define IPATH_HOL_DOWN     1
 /* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
@@ -827,8 +843,8 @@ struct ipath_devdata {
 #define IPATH_SDMA_DISARMED  1
 #define IPATH_SDMA_DISABLED  2
 #define IPATH_SDMA_LAYERBUF  3
-#define IPATH_SDMA_RUNNING  62
-#define IPATH_SDMA_SHUTDOWN 63
+#define IPATH_SDMA_RUNNING  30
+#define IPATH_SDMA_SHUTDOWN 31
 
 /* bit combinations that correspond to abort states */
 #define IPATH_SDMA_ABORT_NONE 0
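
Note the pairing with the ipath_sdma_status type change above: set_bit()/test_bit() operate on unsigned long, so once the field shrank from u64 the RUNNING/SHUTDOWN bits had to move from 62/63 down to 30/31 to keep every index in range for any unsigned long width. A compile-time guard for that invariant (assuming C11):

    #include <limits.h>

    #define IPATH_SDMA_RUNNING  30
    #define IPATH_SDMA_SHUTDOWN 31

    /* set_bit()/test_bit() take an unsigned long *; every bit index
     * must fit even where unsigned long is 32 bits */
    _Static_assert(IPATH_SDMA_SHUTDOWN < sizeof(unsigned long) * CHAR_BIT,
                   "sdma status bit out of range for unsigned long");

    int main(void) { return 0; }
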
@@ -862,7 +878,6 @@ void ipath_disable_wc(struct ipath_devdata *dd);
 int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
 void ipath_shutdown_device(struct ipath_devdata *);
 void ipath_clear_freeze(struct ipath_devdata *);
-int ipath_signal_procs(struct ipath_devdata *, int);
 
 struct file_operations;
 int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
@@ -884,7 +899,7 @@ extern int ipath_diag_inuse;
 
 irqreturn_t ipath_intr(int irq, void *devid);
 int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
-                     ipath_err_t err);
+		     ipath_err_t err);
 #if __IPATH_INFO || __IPATH_DBG
 extern const char *ipath_ibcstatus_str[];
 #endif
@@ -988,8 +1003,6 @@ void ipath_shutdown_relock_poll(struct ipath_devdata *);
 #define IPATH_HAS_PBC_CNT   0x800000
 		/* Suppress heartbeat, even if turning off loopback */
 #define IPATH_NO_HRTBT      0x1000000
-		/* 7220 SpecialTrigger arm/launch mechanism */
-#define IPATH_USE_SPCL_TRIG 0x2000000
 #define IPATH_HAS_THRESH_UPDATE 0x4000000
 #define IPATH_HAS_MULT_IB_SPEED 0x8000000
 #define IPATH_IB_AUTONEG_INPROG 0x10000000
@@ -1025,7 +1038,6 @@ void ipath_get_eeprom_info(struct ipath_devdata *);
 int ipath_update_eeprom_log(struct ipath_devdata *dd);
 void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
 u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
-void ipath_dump_lookup_output_queue(struct ipath_devdata *);
 void ipath_disarm_senderrbufs(struct ipath_devdata *);
 void ipath_force_pio_avail_update(struct ipath_devdata *);
 void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
@@ -1114,10 +1126,15 @@ static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
 	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
 		return 0;
 
-	return readl(regno + (u64 __iomem *)
-		     (dd->ipath_uregbase +
-		      (char __iomem *)dd->ipath_kregbase +
-		      dd->ipath_ureg_align * port));
+	if (dd->ipath_userbase)
+		return readl(regno + (u64 __iomem *)
+			     ((char __iomem *)dd->ipath_userbase +
+			      dd->ipath_ureg_align * port));
+	else
+		return readl(regno + (u64 __iomem *)
+			     (dd->ipath_uregbase +
+			      (char __iomem *)dd->ipath_kregbase +
+			      dd->ipath_ureg_align * port));
 }
 
 /**
@@ -1132,9 +1149,17 @@ static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
 static inline void ipath_write_ureg(const struct ipath_devdata *dd,
 				    ipath_ureg regno, u64 value, int port)
 {
-	u64 __iomem *ubase = (u64 __iomem *)
-		(dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
-		 dd->ipath_ureg_align * port);
+	u64 __iomem *ubase;
+	if (dd->ipath_userbase)
+		ubase = (u64 __iomem *)
+			((char __iomem *) dd->ipath_userbase +
+			 dd->ipath_ureg_align * port);
+	else
+		ubase = (u64 __iomem *)
+			(dd->ipath_uregbase +
+			 (char __iomem *) dd->ipath_kregbase +
+			 dd->ipath_ureg_align * port);
+
 	if (dd->ipath_kregbase)
 		writeq(value, &ubase[regno]);
 }
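
Both accessors now derive the same per-port register address from one of two bases: the dedicated WC-PAT user mapping when present, otherwise kregbase plus the uregbase offset. A pointer-arithmetic sketch with fabricated bases showing the two paths agree:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t *ureg_addr(uint64_t *userbase, char *kregbase,
                               uint64_t uregbase, unsigned align,
                               int port, int regno)
    {
            char *base = userbase ? (char *)userbase
                                  : kregbase + uregbase;

            return (uint64_t *)(base + (uint64_t)align * port) + regno;
    }

    int main(void)
    {
            static char bar[1 << 16];        /* pretend chip BAR */
            /* legacy path: uregs live inside the kreg mapping */
            uint64_t *a = ureg_addr(NULL, bar, 0x8000, 0x1000, 2, 3);
            /* WC-PAT path: uregs have their own mapping */
            uint64_t *b = ureg_addr((uint64_t *)(bar + 0x8000), NULL,
                                    0, 0x1000, 2, 3);

            printf("same address: %d\n", a == b);  /* prints 1 */
            return 0;
    }
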
@@ -1308,11 +1333,8 @@ const char *ipath_get_unit_name(int unit);
 extern unsigned ipath_debug; /* debugging bit mask */
 extern unsigned ipath_linkrecovery;
 extern unsigned ipath_mtu4096;
-extern unsigned ipath_sdma_fetch_arb;
 extern struct mutex ipath_mutex;
 
-
-
 #define IPATH_DRV_NAME		"ib_ipath"
 #define IPATH_MAJOR		233
 #define IPATH_USER_MINOR_BASE	0
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c
index 8f32b17..6dfb578 100644
--- a/drivers/infiniband/hw/ipath/ipath_keys.c
+++ b/drivers/infiniband/hw/ipath/ipath_keys.c
@@ -93,17 +93,37 @@ bail:
  * @rkt: table from which to free the lkey
  * @lkey: lkey id to free
  */
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+int ipath_free_lkey(struct ipath_ibdev *dev, struct ipath_mregion *mr)
 {
 	unsigned long flags;
+	u32 lkey = mr->lkey;
 	u32 r;
+	int ret;
 
-	if (lkey == 0)
-		return;
-	r = lkey >> (32 - ib_ipath_lkey_table_size);
-	spin_lock_irqsave(&rkt->lock, flags);
-	rkt->table[r] = NULL;
-	spin_unlock_irqrestore(&rkt->lock, flags);
+	spin_lock_irqsave(&dev->lk_table.lock, flags);
+	if (lkey == 0) {
+		if (dev->dma_mr) {
+			ret = atomic_read(&dev->dma_mr->refcount);
+			if (dev->dma_mr == mr) {
+				if (!ret)
+					dev->dma_mr = NULL;
+			} else
+				ret = 0;
+		} else
+			ret = 0;
+	} else {
+		r = lkey >> (32 - ib_ipath_lkey_table_size);
+		ret = atomic_read(&dev->lk_table.table[r]->refcount);
+		if (!ret)
+			dev->lk_table.table[r] = NULL;
+	}
+	spin_unlock_irqrestore(&dev->lk_table.lock, flags);
+
+	if (ret) {
+		ipath_dbg("MR busy (LKEY %x cnt %u)\n", lkey, ret);
+		ret = -EBUSY;
+	}
+	return ret;
 }
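
ipath_free_lkey() now refuses to drop a memory region that is still referenced, returning -EBUSY instead of silently clearing the table slot. A compact userspace model of that guard (C11 atomics; the driver's spinlock around the check-and-clear is elided):

    #include <stdatomic.h>
    #include <errno.h>
    #include <stdio.h>

    struct mregion {
            atomic_int refcount;
    };

    static struct mregion *table[1];

    /* clear the slot only if nobody holds a reference */
    static int free_region(int slot)
    {
            int busy = atomic_load(&table[slot]->refcount);

            if (busy)
                    return -EBUSY;  /* caller must retry after refs drop */
            table[slot] = NULL;
            return 0;
    }

    int main(void)
    {
            struct mregion mr = { .refcount = 1 };

            table[0] = &mr;
            printf("%d\n", free_region(0));  /* -EBUSY (-16) */
            atomic_fetch_sub(&mr.refcount, 1);
            printf("%d\n", free_region(0));  /* 0 */
            return 0;
    }
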
 
 /**
@@ -125,40 +145,41 @@ int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
 	struct ipath_mregion *mr;
 	unsigned n, m;
 	size_t off;
-	int ret;
+	int ret = 0;
+	unsigned long flags;
 
 	/*
 	 * We use LKEY == zero for kernel virtual addresses
 	 * (see ipath_get_dma_mr and ipath_dma.c).
 	 */
+	spin_lock_irqsave(&rkt->lock, flags);
 	if (sge->lkey == 0) {
 		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+		struct ipath_ibdev *dev = to_idev(pd->ibpd.device);
 
-		if (pd->user) {
-			ret = 0;
+		if (pd->user)
 			goto bail;
-		}
-		isge->mr = NULL;
+		if (!dev->dma_mr)
+			goto bail;
+		atomic_inc(&dev->dma_mr->refcount);
+		isge->mr = dev->dma_mr;
 		isge->vaddr = (void *) sge->addr;
 		isge->length = sge->length;
 		isge->sge_length = sge->length;
-		ret = 1;
-		goto bail;
+		isge->m = 0;
+		isge->n = 0;
+		goto ok;
 	}
 	mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
 	if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
-		     qp->ibqp.pd != mr->pd)) {
-		ret = 0;
+		     qp->ibqp.pd != mr->pd))
 		goto bail;
-	}
 
 	off = sge->addr - mr->user_base;
 	if (unlikely(sge->addr < mr->user_base ||
 		     off + sge->length > mr->length ||
-		     (mr->access_flags & acc) != acc)) {
-		ret = 0;
+		     (mr->access_flags & acc) != acc))
 		goto bail;
-	}
 
 	off += mr->offset;
 	m = 0;
@@ -171,16 +192,17 @@ int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
 			n = 0;
 		}
 	}
+	atomic_inc(&mr->refcount);
 	isge->mr = mr;
 	isge->vaddr = mr->map[m]->segs[n].vaddr + off;
 	isge->length = mr->map[m]->segs[n].length - off;
 	isge->sge_length = sge->length;
 	isge->m = m;
 	isge->n = n;
-
+ok:
 	ret = 1;
-
 bail:
+	spin_unlock_irqrestore(&rkt->lock, flags);
 	return ret;
 }
 
@@ -195,51 +217,49 @@ bail:
  *
  * Return 1 if successful, otherwise 0.
  */
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge *sge,
 		  u32 len, u64 vaddr, u32 rkey, int acc)
 {
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
 	struct ipath_lkey_table *rkt = &dev->lk_table;
-	struct ipath_sge *sge = &ss->sge;
 	struct ipath_mregion *mr;
 	unsigned n, m;
 	size_t off;
-	int ret;
+	int ret = 0;
+	unsigned long flags;
 
 	/*
 	 * We use RKEY == zero for kernel virtual addresses
 	 * (see ipath_get_dma_mr and ipath_dma.c).
 	 */
+	spin_lock_irqsave(&rkt->lock, flags);
 	if (rkey == 0) {
 		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+		struct ipath_ibdev *dev = to_idev(pd->ibpd.device);
 
-		if (pd->user) {
-			ret = 0;
+		if (pd->user)
 			goto bail;
-		}
-		sge->mr = NULL;
+		if (!dev->dma_mr)
+			goto bail;
+		atomic_inc(&dev->dma_mr->refcount);
+		sge->mr = dev->dma_mr;
 		sge->vaddr = (void *) vaddr;
 		sge->length = len;
 		sge->sge_length = len;
-		ss->sg_list = NULL;
-		ss->num_sge = 1;
-		ret = 1;
-		goto bail;
+		sge->m = 0;
+		sge->n = 0;
+		goto ok;
 	}
 
 	mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
 	if (unlikely(mr == NULL || mr->lkey != rkey ||
-		     qp->ibqp.pd != mr->pd)) {
-		ret = 0;
+		     qp->ibqp.pd != mr->pd))
 		goto bail;
-	}
 
 	off = vaddr - mr->iova;
 	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
-		     (mr->access_flags & acc) == 0)) {
-		ret = 0;
+		     (mr->access_flags & acc) == 0))
 		goto bail;
-	}
 
 	off += mr->offset;
 	m = 0;
@@ -252,17 +272,16 @@ int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
 			n = 0;
 		}
 	}
+	atomic_inc(&mr->refcount);
 	sge->mr = mr;
 	sge->vaddr = mr->map[m]->segs[n].vaddr + off;
 	sge->length = mr->map[m]->segs[n].length - off;
 	sge->sge_length = len;
 	sge->m = m;
 	sge->n = n;
-	ss->sg_list = NULL;
-	ss->num_sge = 1;
-
+ok:
 	ret = 1;
-
 bail:
+	spin_unlock_irqrestore(&rkt->lock, flags);
 	return ret;
 }
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
index 361a73f..2a4d8a2 100644
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -111,9 +111,9 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp,
 	nip->revision = cpu_to_be32((majrev << 16) | minrev);
 	nip->local_port_num = port;
 	vendor = dd->ipath_vendorid;
-	nip->vendor_id[0] = 0;
-	nip->vendor_id[1] = vendor >> 8;
-	nip->vendor_id[2] = vendor;
+	nip->vendor_id[0] = IPATH_SRC_OUI_1;
+	nip->vendor_id[1] = IPATH_SRC_OUI_2;
+	nip->vendor_id[2] = IPATH_SRC_OUI_3;
 
 	return reply(smp);
 }
@@ -146,7 +146,6 @@ static int recv_subn_get_guidinfo(struct ib_smp *smp,
 	return reply(smp);
 }
 
-
 static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
 {
 	(void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
@@ -185,6 +184,7 @@ static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
 			(u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
 				 dd->ipath_ibcctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 	}
 	return 0;
 }
@@ -217,6 +217,7 @@ static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
 			(u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
 				 dd->ipath_ibcctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 	}
 	return 0;
 }
@@ -349,6 +350,7 @@ bail:
  */
 static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
 {
+	/* always a kernel port, no locking needed */
 	struct ipath_portdata *pd = dd->ipath_pd[0];
 
 	memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
@@ -402,6 +404,7 @@ static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
 		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
 			 dd->ipath_ibcctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeedbeef);
 	return 0;
 }
 
@@ -731,6 +734,7 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys)
 	int i;
 	int changed = 0;
 
+	/* always a kernel port, no locking needed */
 	pd = dd->ipath_pd[0];
 
 	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
@@ -756,6 +760,7 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys)
 		pd->port_pkeys[i] = key;
 	}
 	if (changed) {
+		struct ib_event event;
 		u64 pkey;
 
 		pkey = (u64) dd->ipath_pkeys[0] |
@@ -766,6 +771,11 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys)
 			   (unsigned long long) pkey);
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
 				 pkey);
+
+		event.event = IB_EVENT_PKEY_CHANGE;
+		event.device = &dd->verbs_dev->ibdev;
+		event.element.port_num = 1;
+		ib_dispatch_event(&event);
 	}
 	return 0;
 }
@@ -1398,7 +1408,8 @@ static int process_subn(struct ib_device *ibdev, int mad_flags,
 	}
 
 	/* Is the mkey in the process of expiring? */
-	if (dev->mkey_lease_timeout && jiffies >= dev->mkey_lease_timeout) {
+	if (dev->mkey_lease_timeout &&
+	    time_after_eq(jiffies, dev->mkey_lease_timeout)) {
 		/* Clear timeout and mkey protection field. */
 		dev->mkey_lease_timeout = 0;
 		dev->mkeyprot = 0;
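
The switch from a raw jiffies >= comparison to time_after_eq() matters once jiffies wraps: the macro compares a signed difference, which stays correct across the wrap. A standalone demonstration using the macro's core idiom (the in-tree version adds type checking):

    #include <stdio.h>

    /* core of the kernel's time_after_eq(), minus typecheck() */
    #define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

    int main(void)
    {
            unsigned long timeout = (unsigned long)-10; /* set before wrap */
            unsigned long jiffies = 5;                  /* after the wrap */

            printf("raw >=       : %d\n", jiffies >= timeout);  /* 0: wrong */
            printf("time_after_eq: %d\n",
                   time_after_eq(jiffies, timeout));            /* 1: right */
            return 0;
    }
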
@@ -1492,6 +1503,10 @@ static int process_subn(struct ib_device *ibdev, int mad_flags,
 			goto bail;
 		}
 
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_MGMT_METHOD_REPORT_RESP:
+	case IB_MGMT_METHOD_TRAP_REPRESS:
 	case IB_MGMT_METHOD_GET_RESP:
 		/*
 		 * The ib_mad module will call us to process responses
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
index db4ba92..0e164e6 100644
--- a/drivers/infiniband/hw/ipath/ipath_mr.c
+++ b/drivers/infiniband/hw/ipath/ipath_mr.c
@@ -35,6 +35,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_smi.h>
 
+#include "ipath_kernel.h"
 #include "ipath_verbs.h"
 
 /* Fast memory region */
@@ -60,8 +61,15 @@ static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
  */
 struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
 {
+	struct ipath_ibdev *dev = to_idev(pd->device);
 	struct ipath_mr *mr;
 	struct ib_mr *ret;
+	unsigned long flags;
+
+	if (to_ipd(pd)->user) {
+		ret = ERR_PTR(-EPERM);
+		goto bail;
+	}
 
 	mr = kzalloc(sizeof *mr, GFP_KERNEL);
 	if (!mr) {
@@ -70,6 +78,13 @@ struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
 	}
 
 	mr->mr.access_flags = acc;
+	atomic_set(&mr->mr.refcount, 0);
+
+	spin_lock_irqsave(&dev->lk_table.lock, flags);
+	if (!dev->dma_mr)
+		dev->dma_mr = &mr->mr;
+	spin_unlock_irqrestore(&dev->lk_table.lock, flags);
+
 	ret = &mr->ibmr;
 
 bail:
@@ -104,6 +119,7 @@ static struct ipath_mr *alloc_mr(int count,
 		goto bail;
 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
 
+	atomic_set(&mr->mr.refcount, 0);
 	goto done;
 
 bail:
@@ -195,7 +211,8 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		goto bail;
 	}
 
-	umem = ib_umem_get(pd->uobject->context, start, length, mr_access_flags);
+	umem = ib_umem_get(pd->uobject->context, start, length,
+			   mr_access_flags, 0);
 	if (IS_ERR(umem))
 		return (void *) umem;
 
@@ -257,9 +274,14 @@ bail:
 int ipath_dereg_mr(struct ib_mr *ibmr)
 {
 	struct ipath_mr *mr = to_imr(ibmr);
+	struct ipath_ibdev *dev = to_idev(ibmr->device);
+	int ret;
 	int i;
 
-	ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+	ret = ipath_free_lkey(dev, &mr->mr);
+	if (ret)
+		return ret;
+
 	i = mr->mr.mapsz;
 	while (i) {
 		i--;
@@ -323,6 +345,7 @@ struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
 	fmr->mr.max_segs = fmr_attr->max_pages;
 	fmr->page_shift = fmr_attr->page_shift;
 
+	atomic_set(&fmr->mr.refcount, 0);
 	ret = &fmr->ibfmr;
 	goto done;
 
@@ -356,6 +379,12 @@ int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
 	u32 ps;
 	int ret;
 
+	if (atomic_read(&fmr->mr.refcount)) {
+		ipath_dbg("FMR modified when busy (LKEY %x cnt %u)\n",
+			  fmr->mr.lkey, atomic_read(&fmr->mr.refcount));
+		return -EBUSY;
+	}
+
 	if (list_len > fmr->mr.max_segs) {
 		ret = -EINVAL;
 		goto bail;
@@ -399,6 +428,10 @@ int ipath_unmap_fmr(struct list_head *fmr_list)
 	list_for_each_entry(fmr, fmr_list, ibfmr.list) {
 		rkt = &to_idev(fmr->ibfmr.device)->lk_table;
 		spin_lock_irqsave(&rkt->lock, flags);
+		if (atomic_read(&fmr->mr.refcount))
+			ipath_dbg("FMR busy (LKEY %x cnt %u)\n",
+				  fmr->mr.lkey, atomic_read(&fmr->mr.refcount));
+
 		fmr->mr.user_base = 0;
 		fmr->mr.iova = 0;
 		fmr->mr.length = 0;
@@ -416,9 +449,13 @@ int ipath_unmap_fmr(struct list_head *fmr_list)
 int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
 {
 	struct ipath_fmr *fmr = to_ifmr(ibfmr);
+	int ret;
 	int i;
 
-	ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+	ret = ipath_free_lkey(to_idev(ibfmr->device), &fmr->mr);
+	if (ret)
+		return ret;
+
 	i = fmr->mr.mapsz;
 	while (i)
 		kfree(fmr->mr.map[--i]);
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
index e8498bb..aefd621 100644
--- a/drivers/infiniband/hw/ipath/ipath_qp.c
+++ b/drivers/infiniband/hw/ipath/ipath_qp.c
@@ -330,6 +330,10 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
 	qp->s_wqe = NULL;
 	qp->s_pkt_delay = 0;
 	qp->s_draining = 0;
+	qp->s_next_psn = 0;
+	qp->s_last_psn = 0;
+	qp->s_sending_psn = 0;
+	qp->s_sending_hpsn = 0;
 	qp->s_psn = 0;
 	qp->r_psn = 0;
 	qp->r_msn = 0;
@@ -348,6 +352,7 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
 	qp->s_head = 0;
 	qp->s_tail = 0;
 	qp->s_cur = 0;
+	qp->s_acked = 0;
 	qp->s_last = 0;
 	qp->s_ssn = 1;
 	qp->s_lsn = 0;
@@ -359,6 +364,50 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
 		qp->r_rq.wq->head = 0;
 		qp->r_rq.wq->tail = 0;
 	}
+	qp->r_sge.num_sge = 0;
+}
+
+static void clear_mr_refs(struct ipath_qp *qp, int clr_sends)
+{
+	unsigned n;
+
+	while (qp->r_sge.num_sge) {
+		atomic_dec(&qp->r_sge.sge.mr->refcount);
+		if (--qp->r_sge.num_sge)
+			qp->r_sge.sge = *qp->r_sge.sg_list++;
+	}
+
+	if (clr_sends) {
+		while (qp->s_last != qp->s_head) {
+			struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+			unsigned i;
+
+			for (i = 0; i < wqe->wr.num_sge; i++) {
+				struct ipath_sge *sge = &wqe->sg_list[i];
+
+				atomic_dec(&sge->mr->refcount);
+			}
+			if (++qp->s_last >= qp->s_size)
+				qp->s_last = 0;
+		}
+		if (qp->s_rdma_mr) {
+			atomic_dec(&qp->s_rdma_mr->refcount);
+			qp->s_rdma_mr = NULL;
+		}
+	}
+
+	if (qp->ibqp.qp_type != IB_QPT_RC)
+		return;
+
+	for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+		struct ipath_ack_entry *e = &qp->s_ack_queue[n];
+
+		if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
+		    e->rdma_sge.mr) {
+			atomic_dec(&e->rdma_sge.mr->refcount);
+			e->rdma_sge.mr = NULL;
+		}
+	}
 }
 
 /**
@@ -394,6 +443,8 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
 	if (qp->s_last != qp->s_head)
 		ipath_schedule_send(qp);
 
+	clear_mr_refs(qp, 0);
+
 	memset(&wc, 0, sizeof(wc));
 	wc.qp = &qp->ibqp;
 	wc.opcode = IB_WC_RECV;
@@ -521,8 +572,9 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 			tasklet_kill(&qp->s_task);
 			wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
 			spin_lock_irq(&qp->s_lock);
+			clear_mr_refs(qp, 1);
+			ipath_reset_qp(qp, ibqp->qp_type);
 		}
-		ipath_reset_qp(qp, ibqp->qp_type);
 		break;
 
 	case IB_QPS_SQD:
@@ -552,8 +604,8 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		qp->remote_qpn = attr->dest_qp_num;
 
 	if (attr_mask & IB_QP_SQ_PSN) {
-		qp->s_psn = qp->s_next_psn = attr->sq_psn;
-		qp->s_last_psn = qp->s_next_psn - 1;
+		qp->s_sending_psn = qp->s_psn = qp->s_next_psn = attr->sq_psn;
+		qp->s_sending_hpsn = qp->s_last_psn = qp->s_next_psn - 1;
 	}
 
 	if (attr_mask & IB_QP_RQ_PSN)
@@ -745,8 +797,14 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 	struct ipath_swqe *swq = NULL;
 	struct ipath_ibdev *dev;
 	size_t sz;
+	size_t sg_list_sz;
 	struct ib_qp *ret;
 
+	if (init_attr->create_flags) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
 	if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
 	    init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
 		ret = ERR_PTR(-EINVAL);
@@ -784,27 +842,34 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 			goto bail;
 		}
 		sz = sizeof(*qp);
+		sg_list_sz = 0;
 		if (init_attr->srq) {
 			struct ipath_srq *srq = to_isrq(init_attr->srq);
 
-			sz += sizeof(*qp->r_sg_list) *
-				srq->rq.max_sge;
-		} else
-			sz += sizeof(*qp->r_sg_list) *
-				init_attr->cap.max_recv_sge;
-		qp = kmalloc(sz, GFP_KERNEL);
+			if (srq->rq.max_sge > 1)
+				sg_list_sz = sizeof(*qp->r_sg_list) *
+					(srq->rq.max_sge - 1);
+		} else if (init_attr->cap.max_recv_sge > 1)
+			sg_list_sz = sizeof(*qp->r_sg_list) *
+				(init_attr->cap.max_recv_sge - 1);
+		qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
 		if (!qp) {
 			ret = ERR_PTR(-ENOMEM);
 			goto bail_swq;
 		}
-		if (init_attr->srq) {
+		if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
+		    init_attr->qp_type == IB_QPT_SMI ||
+		    init_attr->qp_type == IB_QPT_GSI)) {
+			qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
+			if (!qp->r_ud_sg_list) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_qp;
+			}
+		} else
+			qp->r_ud_sg_list = NULL;
+		if (init_attr->srq)
 			sz = 0;
-			qp->r_rq.size = 0;
-			qp->r_rq.max_sge = 0;
-			qp->r_rq.wq = NULL;
-			init_attr->cap.max_recv_wr = 0;
-			init_attr->cap.max_recv_sge = 0;
-		} else {
+		else {
 			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
 			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
 			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
@@ -813,7 +878,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 					      qp->r_rq.size * sz);
 			if (!qp->r_rq.wq) {
 				ret = ERR_PTR(-ENOMEM);
-				goto bail_qp;
+				goto bail_sg_list;
 			}
 			memset(qp->r_rq.wq, 0,
 			       sizeof(struct ipath_rwq) + qp->r_rq.size * sz);
@@ -837,18 +902,14 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 		qp->s_max_sge = init_attr->cap.max_send_sge;
 		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
 			qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
-		else
-			qp->s_flags = 0;
 		dev = to_idev(ibpd->device);
 		err = ipath_alloc_qpn(&dev->qp_table, qp,
 				      init_attr->qp_type);
 		if (err) {
 			ret = ERR_PTR(err);
 			vfree(qp->r_rq.wq);
-			goto bail_qp;
+			goto bail_sg_list;
 		}
-		qp->ip = NULL;
-		qp->s_tx = NULL;
 		ipath_reset_qp(qp, init_attr->qp_type);
 		break;
 
@@ -922,6 +983,8 @@ bail_ip:
 		vfree(qp->r_rq.wq);
 	ipath_free_qp(&dev->qp_table, qp);
 	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+bail_sg_list:
+	kfree(qp->r_ud_sg_list);
 bail_qp:
 	kfree(qp);
 bail_swq:
@@ -965,6 +1028,10 @@ int ipath_destroy_qp(struct ib_qp *ibqp)
 	ipath_free_qp(&dev->qp_table, qp);
 
 	if (qp->s_tx) {
+		if (qp->s_tx->mr) {
+			atomic_dec(&qp->s_tx->mr->refcount);
+			qp->s_tx->mr = NULL;
+		}
 		atomic_dec(&qp->refcount);
 		if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
 			kfree(qp->s_tx->txreq.map_addr);
@@ -976,6 +1043,8 @@ int ipath_destroy_qp(struct ib_qp *ibqp)
 
 	wait_event(qp->wait, !atomic_read(&qp->refcount));
 
+	clear_mr_refs(qp, 1);
+
 	/* all user's cleaned up, mark it available */
 	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
 	spin_lock(&dev->n_qps_lock);
@@ -986,6 +1055,7 @@ int ipath_destroy_qp(struct ib_qp *ibqp)
 		kref_put(&qp->ip->ref, ipath_release_mmap_info);
 	else
 		vfree(qp->r_rq.wq);
+	kfree(qp->r_ud_sg_list);
 	vfree(qp->s_wq);
 	kfree(qp);
 	return 0;
@@ -1048,12 +1118,4 @@ void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
 		if (ipath_cmp24(credit, qp->s_lsn) > 0)
 			qp->s_lsn = credit;
 	}
-
-	/* Restart sending if it was blocked due to lack of credits. */
-	if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
-	    qp->s_cur != qp->s_head &&
-	    (qp->s_lsn == (u32) -1 ||
-	     ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
-			 qp->s_lsn + 1) <= 0))
-		ipath_schedule_send(qp);
 }
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
index 9d1c0f8..801694f 100644
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -49,7 +49,7 @@ static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
 	ss->sg_list = wqe->sg_list + 1;
 	ss->num_sge = wqe->wr.num_sge;
 	ss->total_len = wqe->length;
-	ipath_skip_sge(ss, len);
+	ipath_skip_sge(ss, len, 0);
 	return wqe->length - len;
 }
 
@@ -103,6 +103,12 @@ static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
 	switch (qp->s_ack_state) {
 	case OP(RDMA_READ_RESPONSE_LAST):
 	case OP(RDMA_READ_RESPONSE_ONLY):
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->rdma_sge.mr) {
+			atomic_dec(&e->rdma_sge.mr->refcount);
+			e->rdma_sge.mr = NULL;
+		}
+		/* FALLTHROUGH */
 	case OP(ATOMIC_ACKNOWLEDGE):
 		/*
 		 * We can increment the tail pointer now that the last
@@ -124,10 +130,25 @@ static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
 
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 		if (e->opcode == OP(RDMA_READ_REQUEST)) {
+			/*
+			 * If a RDMA read response is being resent and
+			 * we haven't seen the duplicate request yet,
+			 * then stop sending the remaining responses the
+			 * responder has seen until the requester resends it.
+			 */
+			 * If an RDMA read response is being resent and
+				qp->s_tail_ack_queue = qp->r_head_ack_queue;
+				qp->s_ack_state = OP(ACKNOWLEDGE);
+				goto bail;
+			}
 			/* Copy SGE state in case we need to resend */
-			qp->s_ack_rdma_sge = e->rdma_sge;
+			qp->s_rdma_mr = e->rdma_sge.mr;
+			if (qp->s_rdma_mr)
+				atomic_inc(&qp->s_rdma_mr->refcount);
+			qp->s_ack_rdma_sge.sge = e->rdma_sge;
+			qp->s_ack_rdma_sge.num_sge = 1;
 			qp->s_cur_sge = &qp->s_ack_rdma_sge;
-			len = e->rdma_sge.sge.sge_length;
+			len = e->rdma_sge.sge_length;
 			if (len > pmtu) {
 				len = pmtu;
 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
@@ -160,6 +181,10 @@ static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 		/* FALLTHROUGH */
 	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		qp->s_cur_sge = &qp->s_ack_rdma_sge;
+		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
+		if (qp->s_rdma_mr)
+			atomic_inc(&qp->s_rdma_mr->refcount);
 		len = qp->s_ack_rdma_sge.sge.sge_length;
 		if (len > pmtu)
 			len = pmtu;
@@ -167,7 +192,8 @@ static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
 			ohdr->u.aeth = ipath_compute_aeth(qp);
 			hwords++;
 			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
-			qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
+			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+			e->sent = 1;
 		}
 		bth0 = qp->s_ack_state << 24;
 		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
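
The bookkeeping above pins the memory region backing an RDMA read response for as long as a resend might reference it: a reference is taken when the SGE state is copied for the response, and dropped when the ack queue entry is retired. The pattern, reduced to a sketch (the helper names are illustrative; the driver open-codes the atomics as the hunk shows):

/* Pin the MR so a resent read response can still walk its pages. */
static void sge_hold_mr(struct ipath_sge *sge)
{
	if (sge->mr)
		atomic_inc(&sge->mr->refcount);
}

/* Matching release once no outstanding response references it. */
static void sge_drop_mr(struct ipath_sge *sge)
{
	if (sge->mr) {
		atomic_dec(&sge->mr->refcount);
		sge->mr = NULL;
	}
}
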
@@ -196,6 +222,7 @@ static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
 		bth0 = OP(ACKNOWLEDGE) << 24;
 		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
 	}
+	qp->s_rdma_ack_cnt++;
 	qp->s_hdrwords = hwords;
 	qp->s_cur_size = len;
 	ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
@@ -225,6 +252,7 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 	char newreq;
 	unsigned long flags;
 	int ret = 0;
+	int delta;
 
 	ohdr = &qp->s_hdr.u.oth;
 	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
@@ -255,6 +283,12 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 			goto bail;
 		}
 		wqe = get_swqe_ptr(qp, qp->s_last);
+		while (qp->s_last != qp->s_acked) {
+			ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
+			if (++qp->s_last >= qp->s_size)
+				qp->s_last = 0;
+			wqe = get_swqe_ptr(qp, qp->s_last);
+		}
 		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
 		goto done;
 	}
@@ -265,6 +299,19 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 		goto bail;
 	}
 
+	/*
+	 * Leave BUSY set until the SDMA queue drains so we don't send
+	 * the same PSN multiple times.
+	 */
+	if (ipath_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+		if (ipath_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+			qp->s_flags |= IPATH_S_WAITING;
+			goto bail;
+		}
+		qp->s_sending_psn = qp->s_psn;
+		qp->s_sending_hpsn = qp->s_psn - 1;
+	}
+
 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
 	hwords = 5;
 	bth0 = 1 << 22; /* Set M bit */
@@ -329,7 +376,7 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 			else {
 				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
 				/* Immediate data comes after the BTH */
-				ohdr->u.imm_data = wqe->wr.imm_data;
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
 				hwords += 1;
 			}
 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
@@ -369,7 +416,7 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 				qp->s_state =
 					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
 				/* Immediate data comes after RETH */
-				ohdr->u.rc.imm_data = wqe->wr.imm_data;
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
 				hwords += 1;
 				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 					bth0 |= 1 << 23;
@@ -514,7 +561,7 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 		else {
 			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
 			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.imm_data;
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
 			hwords += 1;
 		}
 		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
@@ -550,7 +597,7 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 		else {
 			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
 			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.imm_data;
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
 			hwords += 1;
 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 				bth0 |= 1 << 23;
@@ -575,9 +622,8 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
 		qp->s_state = OP(RDMA_READ_REQUEST);
 		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-			qp->s_next_psn = qp->s_psn;
+		bth2 = qp->s_psn & IPATH_PSN_MASK;
+		qp->s_psn = wqe->lpsn + 1;
 		ss = NULL;
 		len = 0;
 		qp->s_cur++;
@@ -585,7 +631,9 @@ int ipath_make_rc_req(struct ipath_qp *qp)
 			qp->s_cur = 0;
 		break;
 	}
-	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
+	qp->s_sending_hpsn = bth2;
+	delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
+	if (delta && delta % IPATH_PSN_CREDIT == 0)
 		bth2 |= 1 << 31;	/* Request ACK. */
 	qp->s_len -= len;
 	qp->s_hdrwords = hwords;
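
The delta computation sign-extends a 24-bit PSN difference: shifting left by 8 discards the high byte and the arithmetic shift back propagates bit 23 as the sign, so delta is the signed distance between the two PSNs modulo 2^24, and an ACK is requested once every IPATH_PSN_CREDIT packets. A standalone illustration of the arithmetic (the values are made up):

#include <stdio.h>

/* Same arithmetic as: delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8; */
static int psn_delta24(unsigned int a, unsigned int b)
{
	return ((int) (a - b) << 8) >> 8;
}

int main(void)
{
	/* 0x000002 follows 0xFFFFFE in 24-bit space: distance +4 */
	printf("%d\n", psn_delta24(0x000002, 0xFFFFFE));	/* 4 */
	/* 0x000010 is 48 behind 0x000040 */
	printf("%d\n", psn_delta24(0x000010, 0x000040));	/* -48 */
	return 0;
}
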
@@ -619,7 +667,6 @@ static void send_rc_ack(struct ipath_qp *qp)
 	u16 lrh0;
 	u32 bth0;
 	u32 hwords;
-	u32 pbufn;
 	u32 __iomem *piobuf;
 	struct ipath_ib_header hdr;
 	struct ipath_other_headers *ohdr;
@@ -630,7 +677,8 @@ static void send_rc_ack(struct ipath_qp *qp)
 	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
 	if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
 	    (qp->s_flags & IPATH_S_ACK_PENDING) ||
-	    qp->s_ack_state != OP(ACKNOWLEDGE))
+	    qp->s_ack_state != OP(ACKNOWLEDGE) ||
+	    qp->s_rdma_ack_cnt)
 		goto queue_ack;
 
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -640,7 +688,7 @@ static void send_rc_ack(struct ipath_qp *qp)
 	if (!(dd->ipath_flags & IPATH_LINKACTIVE))
 		goto done;
 
-	piobuf = ipath_getpiobuf(dd, 0, &pbufn);
+	piobuf = ipath_getpiobuf(dd, 0, NULL);
 	if (!piobuf) {
 		/*
 		 * We are out of PIO buffers at the moment.
@@ -678,7 +726,8 @@ static void send_rc_ack(struct ipath_qp *qp)
 	hdr.lrh[0] = cpu_to_be16(lrh0);
 	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
 	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-	hdr.lrh[3] = cpu_to_be16(dd->ipath_lid);
+	hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
+				 qp->remote_ah_attr.src_path_bits);
 	ohdr->bth[0] = cpu_to_be32(bth0);
 	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
 	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
@@ -695,14 +744,6 @@ static void send_rc_ack(struct ipath_qp *qp)
 	} else
 		__iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
 
-	if (dd->ipath_flags & IPATH_USE_SPCL_TRIG) {
-		u32 spcl_off = (pbufn >= dd->ipath_piobcnt2k) ?
-			2047 : 1023;
-
-		ipath_flush_wc();
-		__raw_writel(0xaebecede, piobuf + spcl_off);
-	}
-
 	ipath_flush_wc();
 
 	dev->n_unicast_xmit++;
@@ -734,7 +775,7 @@ done:
  */
 static void reset_psn(struct ipath_qp *qp, u32 psn)
 {
-	u32 n = qp->s_last;
+	u32 n = qp->s_acked;
 	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
 	u32 opcode;
 
@@ -815,12 +856,17 @@ done:
  */
 void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
 {
-	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
 	struct ipath_ibdev *dev;
 
 	if (qp->s_retry == 0) {
-		ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-		ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		if (qp->s_last == qp->s_acked) {
+			ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+			ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		} else {
+			/* XXX need to handle delayed completion */
+			ipath_dbg("Delayed too many retries\n");
+		}
 		goto bail;
 	}
 	qp->s_retry--;
@@ -849,6 +895,101 @@ bail:
 	return;
 }
 
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct ipath_qp *qp, u32 psn)
+{
+	struct ipath_swqe *wqe;
+	u32 n = qp->s_last;
+
+	/* Find the work request corresponding to the given PSN. */
+	for (;;) {
+		wqe = get_swqe_ptr(qp, n);
+		if (ipath_cmp24(psn, wqe->lpsn) <= 0) {
+			if (wqe->wr.opcode == IB_WR_RDMA_READ)
+				qp->s_sending_psn = wqe->lpsn + 1;
+			else
+				qp->s_sending_psn = psn + 1;
+			break;
+		}
+		if (++n == qp->s_size)
+			n = 0;
+		if (n == qp->s_tail)
+			break;
+	}
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void ipath_rc_send_complete(struct ipath_qp *qp, struct ipath_ib_header *hdr)
+{
+	struct ipath_other_headers *ohdr;
+	struct ipath_swqe *wqe;
+	struct ib_wc wc;
+	unsigned i;
+	u32 opcode;
+	u32 psn;
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+		return;
+
+	/* Find out where the BTH is */
+	if ((be16_to_cpu(hdr->lrh[0]) & 3) == IPATH_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else
+		ohdr = &hdr->u.l.oth;
+
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+		WARN_ON(!qp->s_rdma_ack_cnt);
+		qp->s_rdma_ack_cnt--;
+		return;
+	}
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	reset_sending_psn(qp, psn);
+
+	while (qp->s_last != qp->s_acked) {
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		if (ipath_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+		    ipath_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+			break;
+		for (i = 0; i < wqe->wr.num_sge; i++) {
+			struct ipath_sge *sge = &wqe->sg_list[i];
+
+			atomic_dec(&sge->mr->refcount);
+		}
+		/* Post a send completion queue entry if requested. */
+		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+			memset(&wc, 0, sizeof wc);
+			wc.wr_id = wqe->wr.wr_id;
+			wc.status = IB_WC_SUCCESS;
+			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+			wc.byte_len = wqe->length;
+			wc.qp = &qp->ibqp;
+			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+		}
+		if (++qp->s_last >= qp->s_size)
+			qp->s_last = 0;
+	}
+	/*
+	 * If we were waiting for sends to complete before resending,
+	 * and they are now complete, restart sending.
+	 */
+	if (qp->s_cur != qp->s_head &&
+	    ipath_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0 &&
+	    ipath_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+		qp->s_sending_psn = qp->s_psn;
+		qp->s_sending_hpsn = qp->s_psn - 1;
+		ipath_schedule_send(qp);
+	}
+}
+
 static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
 {
 	qp->s_last_psn = psn;
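
The two routines added above implement the core idea of this rework: a window [s_sending_psn, s_sending_hpsn] of PSNs that may still be sitting in the SDMA queue, with s_acked tracking acknowledgement separately from s_last, which now only advances once the packets have actually left the pipeline. The gating test, pulled out as a sketch (psn_cmp24 is the hypothetical 24-bit comparison sketched earlier):

/*
 * A WQE may be retired only once none of its packets can still be
 * in the SDMA queue; this mirrors the loop test above.
 */
static int wqe_still_sending(const struct ipath_qp *qp,
			     const struct ipath_swqe *wqe)
{
	return psn_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
	       psn_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0;
}
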
@@ -875,6 +1016,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 	int ret = 0;
 	u32 ack_psn;
 	int diff;
+	unsigned i;
 
 	/*
 	 * Remove the QP from the timeout queue (or RNR timeout queue).
@@ -896,7 +1038,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 	ack_psn = psn;
 	if (aeth >> 29)
 		ack_psn--;
-	wqe = get_swqe_ptr(qp, qp->s_last);
+	wqe = get_swqe_ptr(qp, qp->s_acked);
 
 	/*
 	 * The MSN might be for a later WQE than the PSN indicates so
@@ -956,65 +1098,79 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 			    qp->s_flags & IPATH_S_RDMAR_PENDING)
 				ipath_schedule_send(qp);
 		}
-		/* Post a send completion queue entry if requested. */
-		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-			memset(&wc, 0, sizeof wc);
-			wc.wr_id = wqe->wr.wr_id;
-			wc.status = IB_WC_SUCCESS;
-			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-			wc.byte_len = wqe->length;
-			wc.qp = &qp->ibqp;
-			wc.src_qp = qp->remote_qpn;
-			wc.slid = qp->remote_ah_attr.dlid;
-			wc.sl = qp->remote_ah_attr.sl;
-			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
-		}
+		/*
+		 * If the WQE is being resent, don't decrement the MR
+		 * refcounts or generate a completion until the send
+		 * finishes.
+		 */
+		if (ipath_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
+		    ipath_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+			for (i = 0; i < wqe->wr.num_sge; i++) {
+				struct ipath_sge *sge = &wqe->sg_list[i];
+
+				atomic_dec(&sge->mr->refcount);
+			}
+			/* Post a send completion queue entry if requested. */
+			if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+			    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+				memset(&wc, 0, sizeof wc);
+				wc.wr_id = wqe->wr.wr_id;
+				wc.status = IB_WC_SUCCESS;
+				wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+				wc.byte_len = wqe->length;
+				wc.qp = &qp->ibqp;
+				ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+						0);
+			}
+			if (++qp->s_last >= qp->s_size)
+				qp->s_last = 0;
+		} else
+			dev->n_rc_delayed_comp++;
 		qp->s_retry = qp->s_retry_cnt;
 		/*
 		 * If we are completing a request which is in the process of
 		 * being resent, we can stop resending it since we know the
 		 * responder has already seen it.
 		 */
-		if (qp->s_last == qp->s_cur) {
+		if (qp->s_acked == qp->s_cur) {
 			if (++qp->s_cur >= qp->s_size)
 				qp->s_cur = 0;
-			qp->s_last = qp->s_cur;
-			if (qp->s_last == qp->s_tail)
+			qp->s_acked = qp->s_cur;
+			if (qp->s_acked == qp->s_tail)
 				break;
 			wqe = get_swqe_ptr(qp, qp->s_cur);
 			qp->s_state = OP(SEND_LAST);
 			qp->s_psn = wqe->psn;
 		} else {
-			if (++qp->s_last >= qp->s_size)
-				qp->s_last = 0;
-			if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
+			if (++qp->s_acked >= qp->s_size)
+				qp->s_acked = 0;
+			if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
 				qp->s_draining = 0;
-			if (qp->s_last == qp->s_tail)
+			if (qp->s_acked == qp->s_tail)
 				break;
-			wqe = get_swqe_ptr(qp, qp->s_last);
+			wqe = get_swqe_ptr(qp, qp->s_acked);
 		}
 	}
 
 	switch (aeth >> 29) {
 	case 0:		/* ACK */
 		dev->n_rc_acks++;
-		/* If this is a partial ACK, reset the retransmit timer. */
-		if (qp->s_last != qp->s_tail) {
+		if (qp->s_acked != qp->s_tail) {
+			/*
+			 * We got a partial ACK for a resent operation so
+			 * reset the retransmit timer.
+			 */
 			spin_lock(&dev->pending_lock);
 			if (list_empty(&qp->timerwait))
 				list_add_tail(&qp->timerwait,
 					&dev->pending[dev->pending_index]);
 			spin_unlock(&dev->pending_lock);
 			/*
-			 * If we get a partial ACK for a resent operation,
-			 * we can stop resending the earlier packets and
+			 * We can stop resending the earlier packets and
 			 * continue with the next packet the receiver wants.
 			 */
-			if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+			if (ipath_cmp24(qp->s_psn, psn) <= 0)
 				reset_psn(qp, psn + 1);
-				ipath_schedule_send(qp);
-			}
 		} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
 			qp->s_state = OP(SEND_LAST);
 			qp->s_psn = psn + 1;
@@ -1023,12 +1179,16 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
 		qp->s_retry = qp->s_retry_cnt;
 		update_last_psn(qp, psn);
+		if (qp->s_cur != qp->s_head)
+			ipath_schedule_send(qp);
+		else
+			qp->s_flags &= ~IPATH_S_WAITING;
 		ret = 1;
 		goto bail;
 
 	case 1:		/* RNR NAK */
 		dev->n_rnr_naks++;
-		if (qp->s_last == qp->s_tail)
+		if (qp->s_acked == qp->s_tail)
 			goto bail;
 		if (qp->s_rnr_retry == 0) {
 			status = IB_WC_RNR_RETRY_EXC_ERR;
@@ -1056,7 +1216,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 		goto bail;
 
 	case 3:		/* NAK */
-		if (qp->s_last == qp->s_tail)
+		if (qp->s_acked == qp->s_tail)
 			goto bail;
 		/* The last valid PSN is the previous PSN. */
 		update_last_psn(qp, psn - 1);
@@ -1087,8 +1247,13 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 			status = IB_WC_REM_OP_ERR;
 			dev->n_other_naks++;
 		class_b:
-			ipath_send_complete(qp, wqe, status);
-			ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			if (qp->s_last == qp->s_acked) {
+				ipath_send_complete(qp, wqe, status);
+				ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			} else {
+				/* XXX need to handle delayed completion */
+				ipath_dbg("Delayed error %d\n", status);
+			}
 			break;
 
 		default:
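
The switch above dispatches on the top three bits of the AETH (ACK extended transport header). For reference, the layout being decoded, per the InfiniBand spec (the macros are illustrative, not the driver's):

/*
 * AETH: | 31..29 type | 28..24 value | 23..0 MSN |
 *   type 0 = ACK      (value is a credit count)
 *   type 1 = RNR NAK  (value is the RNR timer code)
 *   type 3 = NAK      (value is the NAK code)
 */
#define AETH_TYPE(aeth)		((aeth) >> 29)
#define AETH_VALUE(aeth)	(((aeth) >> 24) & 0x1f)
#define AETH_MSN(aeth)		((aeth) & 0xffffff)
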
@@ -1135,13 +1300,12 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 {
 	struct ipath_swqe *wqe;
 	enum ib_wc_status status;
-	unsigned long flags;
 	int diff;
 	u32 pad;
 	u32 aeth;
 	u64 val;
 
-	spin_lock_irqsave(&qp->s_lock, flags);
+	spin_lock(&qp->s_lock);
 
 	/* Double check we can process this now that we hold the s_lock. */
 	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
@@ -1168,9 +1332,9 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 		goto ack_done;
 	}
 
-	if (unlikely(qp->s_last == qp->s_tail))
+	if (unlikely(qp->s_acked == qp->s_tail))
 		goto ack_done;
-	wqe = get_swqe_ptr(qp, qp->s_last);
+	wqe = get_swqe_ptr(qp, qp->s_acked);
 	status = IB_WC_SUCCESS;
 
 	switch (opcode) {
@@ -1197,7 +1361,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
 			goto ack_done;
 		hdrsize += 4;
-		wqe = get_swqe_ptr(qp, qp->s_last);
+		wqe = get_swqe_ptr(qp, qp->s_acked);
 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
 			goto ack_op_err;
 		qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
@@ -1244,8 +1408,8 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 		 */
 		qp->s_rdma_read_len -= pmtu;
 		update_last_psn(qp, psn);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
+		spin_unlock(&qp->s_lock);
+		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
 		goto bail;
 
 	case OP(RDMA_READ_RESPONSE_ONLY):
@@ -1269,7 +1433,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 		 * have to be careful to copy the data to the right
 		 * location.
 		 */
-		wqe = get_swqe_ptr(qp, qp->s_last);
+		wqe = get_swqe_ptr(qp, qp->s_acked);
 		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
 						  wqe, psn, pmtu);
 		goto read_last;
@@ -1305,7 +1469,8 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 			aeth = be32_to_cpu(((__be32 *) data)[0]);
 			data += sizeof(__be32);
 		}
-		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
+		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
+		WARN_ON(qp->s_rdma_read_sge.num_sge);
 		(void) do_rc_ack(qp, aeth, psn,
 				 OP(RDMA_READ_RESPONSE_LAST), 0);
 		goto ack_done;
@@ -1318,10 +1483,15 @@ ack_op_err:
 ack_len_err:
 	status = IB_WC_LOC_LEN_ERR;
 ack_err:
-	ipath_send_complete(qp, wqe, status);
-	ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+	if (qp->s_last == qp->s_acked) {
+		ipath_send_complete(qp, wqe, status);
+		ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+	} else {
+		/* XXX need to handle delayed completion */
+		ipath_dbg("Delayed error %d\n", status);
+	}
 ack_done:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	spin_unlock(&qp->s_lock);
 bail:
 	return;
 }
@@ -1355,7 +1525,6 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 	struct ipath_ack_entry *e;
 	u8 i, prev;
 	int old_req;
-	unsigned long flags;
 
 	if (diff > 0) {
 		/*
@@ -1390,7 +1559,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 	e = NULL;
 	old_req = 1;
 
-	spin_lock_irqsave(&qp->s_lock, flags);
+	spin_lock(&qp->s_lock);
 	/* Double check we can process this now that we hold the s_lock. */
 	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
 		goto unlock_done;
@@ -1447,8 +1616,12 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
 			ib_mtu_enum_to_int(qp->path_mtu);
 		len = be32_to_cpu(reth->length);
-		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
+		if (unlikely(offset + len > e->rdma_sge.sge_length))
 			goto unlock_done;
+		if (e->rdma_sge.mr) {
+			atomic_dec(&e->rdma_sge.mr->refcount);
+			e->rdma_sge.mr = NULL;
+		}
 		if (len != 0) {
 			u32 rkey = be32_to_cpu(reth->rkey);
 			u64 vaddr = be64_to_cpu(reth->vaddr);
@@ -1460,12 +1633,9 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 			if (unlikely(!ok))
 				goto unlock_done;
 		} else {
-			e->rdma_sge.sg_list = NULL;
-			e->rdma_sge.num_sge = 0;
-			e->rdma_sge.sge.mr = NULL;
-			e->rdma_sge.sge.vaddr = NULL;
-			e->rdma_sge.sge.length = 0;
-			e->rdma_sge.sge.sge_length = 0;
+			e->rdma_sge.vaddr = NULL;
+			e->rdma_sge.length = 0;
+			e->rdma_sge.sge_length = 0;
 		}
 		e->psn = psn;
 		qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1495,7 +1665,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 		 * after all the previous RDMA reads and atomics.
 		 */
 		if (i == qp->r_head_ack_queue) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
+			spin_unlock(&qp->s_lock);
 			qp->r_nak_state = 0;
 			qp->r_ack_psn = qp->r_psn - 1;
 			goto send_ack;
@@ -1508,7 +1678,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 		if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
 		    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
 		    qp->s_ack_state == OP(ACKNOWLEDGE)) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
+			spin_unlock(&qp->s_lock);
 			qp->r_nak_state = 0;
 			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
 			goto send_ack;
@@ -1525,7 +1695,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 	ipath_schedule_send(qp);
 
 unlock_done:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	spin_unlock(&qp->s_lock);
 done:
 	return 1;
 
@@ -1559,10 +1729,8 @@ static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
 	next = n + 1;
 	if (next > IPATH_MAX_RDMA_ATOMIC)
 		next = 0;
-	if (n == qp->s_tail_ack_queue) {
-		qp->s_tail_ack_queue = next;
-		qp->s_ack_state = OP(ACKNOWLEDGE);
-	}
+	qp->s_tail_ack_queue = next;
+	qp->s_ack_state = OP(ACKNOWLEDGE);
 }
 
 /**
@@ -1591,7 +1759,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	int diff;
 	struct ib_reth *reth;
 	int header_in_data;
-	unsigned long flags;
 
 	/* Validate the SLID. See Ch. 9.6.1.5 */
 	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
@@ -1694,7 +1861,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto nack_inv;
-		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		ipath_copy_sge(&qp->r_sge, data, pmtu, 1);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -1714,11 +1881,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	case OP(SEND_LAST_WITH_IMMEDIATE):
 	send_last_imm:
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else {
 			/* Immediate data comes after BTH */
-			wc.imm_data = ohdr->u.imm_data;
+			wc.ex.imm_data = ohdr->u.imm_data;
 		}
 		hdrsize += 4;
 		wc.wc_flags = IB_WC_WITH_IMM;
@@ -1737,7 +1904,12 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		wc.byte_len = tlen + qp->r_rcv_len;
 		if (unlikely(wc.byte_len > qp->r_len))
 			goto nack_inv;
-		ipath_copy_sge(&qp->r_sge, data, tlen);
+		ipath_copy_sge(&qp->r_sge, data, tlen, 1);
+		while (qp->r_sge.num_sge) {
+			atomic_dec(&qp->r_sge.sge.mr->refcount);
+			if (--qp->r_sge.num_sge)
+				qp->r_sge.sge = *qp->r_sge.sg_list++;
+		}
 		qp->r_msn++;
 		if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
 			break;
@@ -1775,19 +1947,21 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		hdrsize += sizeof(*reth);
 		qp->r_len = be32_to_cpu(reth->length);
 		qp->r_rcv_len = 0;
+		qp->r_sge.sg_list = NULL;
 		if (qp->r_len != 0) {
 			u32 rkey = be32_to_cpu(reth->rkey);
 			u64 vaddr = be64_to_cpu(reth->vaddr);
 			int ok;
 
 			/* Check rkey & NAK */
-			ok = ipath_rkey_ok(qp, &qp->r_sge,
+			ok = ipath_rkey_ok(qp, &qp->r_sge.sge,
 					   qp->r_len, vaddr, rkey,
 					   IB_ACCESS_REMOTE_WRITE);
 			if (unlikely(!ok))
 				goto nack_acc;
+			qp->r_sge.num_sge = 1;
 		} else {
-			qp->r_sge.sg_list = NULL;
+			qp->r_sge.num_sge = 0;
 			qp->r_sge.sge.mr = NULL;
 			qp->r_sge.sge.vaddr = NULL;
 			qp->r_sge.sge.length = 0;
@@ -1812,7 +1986,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		next = qp->r_head_ack_queue + 1;
 		if (next > IPATH_MAX_RDMA_ATOMIC)
 			next = 0;
-		spin_lock_irqsave(&qp->s_lock, flags);
+		spin_lock(&qp->s_lock);
 		/* Double check we can process this while holding the s_lock. */
 		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
 			goto unlock;
@@ -1822,6 +1996,10 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 			ipath_update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+			atomic_dec(&e->rdma_sge.mr->refcount);
+			e->rdma_sge.mr = NULL;
+		}
 		/* RETH comes after BTH */
 		if (!header_in_data)
 			reth = &ohdr->u.rc.reth;
@@ -1847,12 +2025,10 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 			if (len > pmtu)
 				qp->r_psn += (len - 1) / pmtu;
 		} else {
-			e->rdma_sge.sg_list = NULL;
-			e->rdma_sge.num_sge = 0;
-			e->rdma_sge.sge.mr = NULL;
-			e->rdma_sge.sge.vaddr = NULL;
-			e->rdma_sge.sge.length = 0;
-			e->rdma_sge.sge.sge_length = 0;
+			e->rdma_sge.mr = NULL;
+			e->rdma_sge.vaddr = NULL;
+			e->rdma_sge.length = 0;
+			e->rdma_sge.sge_length = 0;
 		}
 		e->opcode = opcode;
 		e->sent = 0;
@@ -1890,7 +2066,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		next = qp->r_head_ack_queue + 1;
 		if (next > IPATH_MAX_RDMA_ATOMIC)
 			next = 0;
-		spin_lock_irqsave(&qp->s_lock, flags);
+		spin_lock(&qp->s_lock);
 		/* Double check we can process this while holding the s_lock. */
 		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
 			goto unlock;
@@ -1899,6 +2075,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 				goto nack_inv_unlck;
 			ipath_update_ack_queue(qp, next);
 		}
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+			atomic_dec(&e->rdma_sge.mr->refcount);
+			e->rdma_sge.mr = NULL;
+		}
 		if (!header_in_data)
 			ateth = &ohdr->u.atomic_eth;
 		else
@@ -1909,19 +2090,20 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 			goto nack_inv_unlck;
 		rkey = be32_to_cpu(ateth->rkey);
 		/* Check rkey & NAK */
-		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge.sge,
 					    sizeof(u64), vaddr, rkey,
 					    IB_ACCESS_REMOTE_ATOMIC)))
 			goto nack_acc_unlck;
 		/* Perform atomic OP and save result. */
 		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
 		sdata = be64_to_cpu(ateth->swap_data);
-		e = &qp->s_ack_queue[qp->r_head_ack_queue];
 		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
 			(u64) atomic64_add_return(sdata, maddr) - sdata :
 			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
 				      be64_to_cpu(ateth->compare_data),
 				      sdata);
+		atomic_dec(&qp->r_sge.sge.mr->refcount);
+		qp->r_sge.num_sge = 0;
 		e->opcode = opcode;
 		e->sent = 0;
 		e->psn = psn & IPATH_PSN_MASK;
@@ -1956,7 +2138,7 @@ rnr_nak:
 	goto send_ack;
 
 nack_inv_unlck:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	spin_unlock(&qp->s_lock);
 nack_inv:
 	ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
 	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
@@ -1964,7 +2146,7 @@ nack_inv:
 	goto send_ack;
 
 nack_acc_unlck:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	spin_unlock(&qp->s_lock);
 nack_acc:
 	ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
@@ -1974,7 +2156,7 @@ send_ack:
 	goto done;
 
 unlock:
-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	spin_unlock(&qp->s_lock);
 done:
 	return;
 }
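
A note on the locking conversions scattered through this file: replacing spin_lock_irqsave() with plain spin_lock() is only correct because these receive-side paths run with interrupts already disabled (see the "interrupts disabled" requirement documented on ipath_rc_send_complete() above), so saving and restoring the flags was redundant work. In sketch form (the function name is illustrative):

/*
 * Caller context already has irqs off, so the cheaper plain lock
 * suffices; irqsave would only re-save known state.
 */
static void rc_rcv_sketch(struct ipath_qp *qp)
{
	spin_lock(&qp->s_lock);
	/* ... process the packet under the send lock ... */
	spin_unlock(&qp->s_lock);
}
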
diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h
index a0fabaa..8f44d0c 100644
--- a/drivers/infiniband/hw/ipath/ipath_registers.h
+++ b/drivers/infiniband/hw/ipath/ipath_registers.h
@@ -59,7 +59,6 @@
 #define INFINIPATH_R_SOFTWARE_SHIFT 24
 #define INFINIPATH_R_BOARDID_MASK 0xFF
 #define INFINIPATH_R_BOARDID_SHIFT 32
-#define INFINIPATH_R_EMULATOR_MASK (1ULL<<62)
 
 /* kr_control bits */
 #define INFINIPATH_C_FREEZEMODE 0x00000002
@@ -70,12 +69,10 @@
 #define INFINIPATH_S_UPDTHRESH_SHIFT 24
 #define INFINIPATH_S_UPDTHRESH_MASK 0x1f
 
-
 #define IPATH_S_ABORT		0
 #define IPATH_S_PIOINTBUFAVAIL	1
 #define IPATH_S_PIOBUFAVAILUPD	2
 #define IPATH_S_PIOENABLE	3
-#define IPATH_S_SPECIALTRIGGER	4
 #define IPATH_S_SDMAINTENABLE	9
 #define IPATH_S_SDMASINGLEDESCRIPTOR	10
 #define IPATH_S_SDMAENABLE	11
@@ -86,7 +83,6 @@
 #define INFINIPATH_S_PIOINTBUFAVAIL	(1U << IPATH_S_PIOINTBUFAVAIL)
 #define INFINIPATH_S_PIOBUFAVAILUPD	(1U << IPATH_S_PIOBUFAVAILUPD)
 #define INFINIPATH_S_PIOENABLE		(1U << IPATH_S_PIOENABLE)
-#define INFINIPATH_S_SPECIALTRIGGER	(1U << IPATH_S_SPECIALTRIGGER)
 #define INFINIPATH_S_SDMAINTENABLE	(1U << IPATH_S_SDMAINTENABLE)
 #define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
 					(1U << IPATH_S_SDMASINGLEDESCRIPTOR)
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index 1cc5da6..44f0518 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -142,6 +142,12 @@ int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
 	goto bail;
 
 bad_lkey:
+	while (j) {
+		struct ipath_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+		atomic_dec(&sge->mr->refcount);
+	}
+	ss->num_sge = 0;
 	memset(&wc, 0, sizeof(wc));
 	wc.wr_id = wqe->wr_id;
 	wc.status = IB_WC_LOC_PROT_ERR;
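
The new unwind releases the references taken by the iterations that succeeded before the bad lkey was hit. It relies on how SGE state is laid out: the first SGE lives inline in ss->sge and the remainder in the ss->sg_list array, which is why the loop selects --j ? &ss->sg_list[j - 1] : &ss->sge. The mapping, as a hypothetical helper:

/*
 * SGE i of an ipath_sge_state: index 0 is stored inline, the rest
 * in the sg_list array.  The unwind above is this walk run backwards,
 * dropping one MR reference per SGE already initialized.
 */
static struct ipath_sge *nth_sge(struct ipath_sge_state *ss, unsigned int i)
{
	return i == 0 ? &ss->sge : &ss->sg_list[i - 1];
}
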
@@ -157,7 +163,7 @@ bail:
 /**
  * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
  * @qp: the QP
- * @wr_id_only: update wr_id only, not SGEs
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
  *
  * Return 0 if no RWQE is available, otherwise return 1.
  *
@@ -174,8 +180,6 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
 	u32 tail;
 	int ret;
 
-	qp->r_sge.sg_list = qp->r_sg_list;
-
 	if (qp->ibqp.srq) {
 		srq = to_isrq(qp->ibqp.srq);
 		handler = srq->ibsrq.event_handler;
@@ -197,20 +201,29 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
 	/* Validate tail before using it since it is user writable. */
 	if (tail >= rq->size)
 		tail = 0;
-	do {
-		if (unlikely(tail == wq->head)) {
+	if (unlikely(tail == wq->head)) {
+		ret = 0;
+		goto unlock;
+	}
+	/* Make sure entry is read after head index is read. */
+	smp_rmb();
+	wqe = get_rwqe_ptr(rq, tail);
+	/*
+	 * Even though we update the tail index in memory, the verbs
+	 * consumer is not supposed to post more entries until a
+	 * completion is generated.
+	 */
+	if (++tail >= rq->size)
+		tail = 0;
+	wq->tail = tail;
+	if (!wr_id_only) {
+		qp->r_sge.sg_list = qp->r_sg_list;
+		if (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge)) {
 			ret = 0;
 			goto unlock;
 		}
-		/* Make sure entry is read after head index is read. */
-		smp_rmb();
-		wqe = get_rwqe_ptr(rq, tail);
-		if (++tail >= rq->size)
-			tail = 0;
-	} while (!wr_id_only && !ipath_init_sge(qp, wqe, &qp->r_len,
-						&qp->r_sge));
+	}
 	qp->r_wr_id = wqe->wr_id;
-	wq->tail = tail;
 
 	ret = 1;
 	set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
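
Unrolling the do/while also makes the barrier's job visible: the producer updates wq->head only after filling in the entry, so the consumer must not read the WQE until after it has read the index. A sketch of the pairing (the producer half lives in the post-receive path, not in this hunk):

static void post_rwqe_sketch(struct ipath_rwq *wq, u32 new_head)
{
	/* ... entry at new_head fully written ... */
	smp_wmb();		/* publish contents before the index */
	wq->head = new_head;
}

static int rwqe_avail_sketch(struct ipath_rwq *wq, u32 tail)
{
	if (tail == wq->head)
		return 0;	/* ring empty */
	smp_rmb();		/* read the entry only after the index */
	return 1;		/* safe to read the WQE at tail now */
}
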
@@ -268,6 +281,7 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp)
 	u64 sdata;
 	atomic64_t *maddr;
 	enum ib_wc_status send_status;
+	int release;
 
 	/*
 	 * Note that we check the responder QP state after
@@ -325,6 +339,7 @@ again:
 	memset(&wc, 0, sizeof wc);
 	send_status = IB_WC_SUCCESS;
 
+	release = 1;
 	sqp->s_sge.sge = wqe->sg_list[0];
 	sqp->s_sge.sg_list = wqe->sg_list + 1;
 	sqp->s_sge.num_sge = wqe->wr.num_sge;
@@ -332,7 +347,7 @@ again:
 	switch (wqe->wr.opcode) {
 	case IB_WR_SEND_WITH_IMM:
 		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.imm_data = wqe->wr.imm_data;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
 		/* FALLTHROUGH */
 	case IB_WR_SEND:
 		if (!ipath_get_rwqe(qp, 0))
@@ -343,7 +358,7 @@ again:
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
 			goto inv_err;
 		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.imm_data = wqe->wr.imm_data;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
 		if (!ipath_get_rwqe(qp, 1))
 			goto rnr_nak;
 		/* FALLTHROUGH */
@@ -352,22 +367,27 @@ again:
 			goto inv_err;
 		if (wqe->length == 0)
 			break;
-		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
 					    wqe->wr.wr.rdma.remote_addr,
 					    wqe->wr.wr.rdma.rkey,
 					    IB_ACCESS_REMOTE_WRITE)))
 			goto acc_err;
+		qp->r_sge.sg_list = NULL;
+		qp->r_sge.num_sge = 1;
 		qp->r_sge.total_len = wqe->length;
 		break;
 
 	case IB_WR_RDMA_READ:
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
 			goto inv_err;
-		if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
+		if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
 					    wqe->wr.wr.rdma.remote_addr,
 					    wqe->wr.wr.rdma.rkey,
 					    IB_ACCESS_REMOTE_READ)))
 			goto acc_err;
+		release = 0;
+		sqp->s_sge.sg_list = NULL;
+		sqp->s_sge.num_sge = 1;
 		qp->r_sge.sge = wqe->sg_list[0];
 		qp->r_sge.sg_list = wqe->sg_list + 1;
 		qp->r_sge.num_sge = wqe->wr.num_sge;
@@ -378,7 +398,7 @@ again:
 	case IB_WR_ATOMIC_FETCH_AND_ADD:
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
 			goto inv_err;
-		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
 					    wqe->wr.wr.atomic.remote_addr,
 					    wqe->wr.wr.atomic.rkey,
 					    IB_ACCESS_REMOTE_ATOMIC)))
@@ -391,6 +411,8 @@ again:
 			(u64) atomic64_add_return(sdata, maddr) - sdata :
 			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
 				      sdata, wqe->wr.wr.atomic.swap);
+		atomic_dec(&qp->r_sge.sge.mr->refcount);
+		qp->r_sge.num_sge = 0;
 		goto send_comp;
 
 	default:
@@ -407,14 +429,16 @@ again:
 		if (len > sge->sge_length)
 			len = sge->sge_length;
 		BUG_ON(len == 0);
-		ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
+		ipath_copy_sge(&qp->r_sge, sge->vaddr, len, release);
 		sge->vaddr += len;
 		sge->length -= len;
 		sge->sge_length -= len;
 		if (sge->sge_length == 0) {
+			if (!release)
+				atomic_dec(&sge->mr->refcount);
 			if (--sqp->s_sge.num_sge)
 				*sge = *sqp->s_sge.sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
+		} else if (sge->length == 0 && sge->mr->lkey) {
 			if (++sge->n >= IPATH_SEGSZ) {
 				if (++sge->m >= sge->mr->mapsz)
 					break;
@@ -427,6 +451,12 @@ again:
 		}
 		sqp->s_len -= len;
 	}
+	if (release)
+		while (qp->r_sge.num_sge) {
+			atomic_dec(&qp->r_sge.sge.mr->refcount);
+			if (--qp->r_sge.num_sge)
+				qp->r_sge.sge = *qp->r_sge.sg_list++;
+		}
 
 	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
 		goto send_comp;
@@ -621,7 +651,8 @@ void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
 	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
 	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
 	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
+	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
+				       qp->remote_ah_attr.src_path_bits);
 	bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
 	bth0 |= extra_bytes << 20;
 	ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
@@ -641,12 +672,14 @@ void ipath_do_send(unsigned long data)
 {
 	struct ipath_qp *qp = (struct ipath_qp *)data;
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_devdata *dd = dev->dd;
 	int (*make_req)(struct ipath_qp *qp);
 	unsigned long flags;
 
 	if ((qp->ibqp.qp_type == IB_QPT_RC ||
 	     qp->ibqp.qp_type == IB_QPT_UC) &&
-	    qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
+	    (qp->remote_ah_attr.dlid & ~((1 << dd->ipath_lmc) - 1)) ==
+	    dd->ipath_lid) {
 		ipath_ruc_loopback(qp);
 		goto bail;
 	}
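
The loopback test now accounts for LMC (LID Mask Control): a port with ipath_lmc = n answers to 2^n consecutive LIDs, the low n bits of the DLID being path bits, so they are masked off before comparing against the base LID. For instance (assuming, as the driver does, a base LID aligned to the LMC):

static int dlid_is_local(u16 dlid, u16 lid, unsigned int lmc)
{
	/* ignore the low lmc path bits when matching our base LID */
	return (dlid & ~((1 << lmc) - 1)) == lid;
}

/*
 * With lid = 0x1000 and lmc = 2, DLIDs 0x1000..0x1003 all loop back:
 * dlid_is_local(0x1003, 0x1000, 2) == 1.
 */
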
@@ -701,10 +734,16 @@ void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
 			 enum ib_wc_status status)
 {
 	u32 old_last, last;
+	unsigned i;
 
 	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
 		return;
 
+	for (i = 0; i < wqe->wr.num_sge; i++) {
+		struct ipath_sge *sge = &wqe->sg_list[i];
+
+		atomic_dec(&sge->mr->refcount);
+	}
 	/* See ch. 11.2.4.1 and 10.7.3.1 */
 	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
 	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
@@ -726,6 +765,8 @@ void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
 	if (++last >= qp->s_size)
 		last = 0;
 	qp->s_last = last;
+	if (qp->s_acked == old_last)
+		qp->s_acked = last;
 	if (qp->s_cur == old_last)
 		qp->s_cur = last;
 	if (qp->s_tail == old_last)
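
With s_acked in play, ipath_send_complete() keeps every cursor that pointed at the retired slot in step as s_last advances around the ring. The advance itself is the usual wrap-at-size increment:

/* Sketch: the wrap-at-size advance used for all of these cursors. */
static u32 ring_next(u32 i, u32 size)
{
	return ++i >= size ? 0 : i;
}
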
diff --git a/drivers/infiniband/hw/ipath/ipath_sd7220.c b/drivers/infiniband/hw/ipath/ipath_sd7220.c
index c0c9b44..aa47eb5 100644
--- a/drivers/infiniband/hw/ipath/ipath_sd7220.c
+++ b/drivers/infiniband/hw/ipath/ipath_sd7220.c
@@ -47,7 +47,7 @@
  * various SerDes registers by IBC. It is not part of the normal kregs
  * map and is used in exactly one place, hence the #define below.
  */
-#define KR_IBSerDesMappTable (0x94000 / (sizeof (uint64_t)))
+#define KR_IBSerDesMappTable (0x94000 / (sizeof(uint64_t)))
 
 /*
  * Below used for sdnum parameter, selecting one of the two sections
@@ -93,7 +93,7 @@ void ipath_set_relock_poll(struct ipath_devdata *dd, int ibup);
 /*
  * Below keeps track of whether the "once per power-on" initialization has
  * been done, because uC code Version 1.32.17 or higher allows the uC to
- * be reset at will, and Automatic Equalization may requore it. So the
+ * be reset at will, and Automatic Equalization may require it. So the
  * state of the reset "pin", as reflected in was_reset parameter to
  * ipath_sd7220_init() is no longer valid. Instead, we check for the
  * actual uC code having been loaded.
@@ -137,10 +137,10 @@ bail:
 	return;
 }
 
-/* After a reset or other unusual event, the epb interface may need
+/*
+ * After a reset or other unusual event, the epb interface may need
  * to be re-synchronized, between the host and the uC.
- * returns <0 for failure
- * (which can only happen if we fail IBSD_RESYNC_TRIES times)
+ * returns <0 for failure to resync within IBSD_RESYNC_TRIES (not expected)
  */
 #define IBSD_RESYNC_TRIES 3
 #define IB_PGUDP(chn) (EPB_LOC((chn), 2, 1) | EPB_IB_QUAD0_CS)
@@ -301,12 +301,7 @@ static void ipath_sd_trimdone_monitor(struct ipath_devdata *dd,
 		ipath_cdbg(VERBOSE, "IBCS TRIMDONE set (%s)\n", where);
 	else
 		ipath_dev_err(dd, "IBCS TRIMDONE clear (%s)\n", where);
-	/*
-	* Do "dummy read/mod/wr" to get EPB in sane state after reset
-	* The default (and hopefully only, D6..0) value for MPREG6 is 0, and
-	* we want to set to 0x80. Since we can't trust read, or we wouldn't
-	* be doing this, hope for the best
-	*/
+
 	udelay(2);
 
 	ret = ipath_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, 0x80);
@@ -370,12 +365,14 @@ int ipath_sd7220_init(struct ipath_devdata *dd, int was_reset)
 {
 	int ret = 1; /* default to failure */
 	int first_reset;
+	int val_stat;
 
 	if (!was_reset) {
 		/* entered with reset not asserted, we need to do it */
 		ipath_ibsd_reset(dd, 1);
 		ipath_sd_trimdone_monitor(dd, "Driver-reload");
 	}
+
 	/* Substitute our deduced value for was_reset */
 	ret = ipath_ibsd_ucode_loaded(dd);
 	if (ret < 0) {
@@ -383,88 +380,82 @@ int ipath_sd7220_init(struct ipath_devdata *dd, int was_reset)
 		goto done;
 	}
 	first_reset = !ret; /* First reset if IBSD uCode not yet loaded */
-	do {
-		/*
-		 * Alter some regs per vendor latest doc, reset-defaults
-		 * are not right for IB.
-		 */
-		ret = ipath_sd_early(dd);
+
+	/*
+	 * Alter some regs per vendor latest doc, reset-defaults
+	 * are not right for IB.
+	 */
+	ret = ipath_sd_early(dd);
+	if (ret < 0) {
+		ipath_dev_err(dd, "Failed to set IB SERDES early defaults\n");
+		ret = 1;
+		goto done;
+	}
+
+	/*
+	 * Set DAC manual trim IB.
+	 * We only do this once after chip has been reset (usually
+	 * same as once per system boot).
+	 */
+	if (first_reset) {
+		ret = ipath_sd_dactrim(dd);
 		if (ret < 0) {
-			ipath_dev_err(dd,
-				"Failed to set IB SERDES early defaults\n");
+			ipath_dev_err(dd, "Failed IB SERDES DAC trim\n");
 			ret = 1;
-			break;
-		}
-		/* Set DAC manual trim IB.
-		 * We only do this once after chip has been reset (usually
-		 * same as once per system boot).
-		 */
-		if (first_reset) {
-			ret = ipath_sd_dactrim(dd);
-			if (ret < 0) {
-				ipath_dev_err(dd,
-					 "Failed IB SERDES DAC trim\n");
-				ret = 1;
-				break;
-			}
+			goto done;
 		}
-		/*
-		 * Set various registers (DDS and RXEQ) that will be
-		 * controlled by IBC (in 1.2 mode) to reasonable preset values
-		 * Calling the "internal" version avoids the "check for needed"
-		 * and "trimdone monitor" that might be counter-productive.
-		 */
-		ret = ipath_internal_presets(dd);
+	}
+
+	/*
+	 * Set various registers (DDS and RXEQ) that will be
+	 * controlled by IBC (in 1.2 mode) to reasonable preset values.
+	 * Calling the "internal" version avoids the "check for needed"
+	 * and "trimdone monitor" that might be counter-productive.
+	 */
+	ret = ipath_internal_presets(dd);
+	if (ret < 0) {
+		ipath_dev_err(dd, "Failed to set IB SERDES presets\n");
+		ret = 1;
+		goto done;
+	}
+	ret = ipath_sd_trimself(dd, 0x80);
+	if (ret < 0) {
+		ipath_dev_err(dd, "Failed to set IB SERDES TRIMSELF\n");
+		ret = 1;
+		goto done;
+	}
+
+	/* Load image, then try to verify */
+	ret = 0;	/* Assume success */
+	if (first_reset) {
+		int vfy;
+		int trim_done;
+		ipath_dbg("SerDes uC was reset, reloading PRAM\n");
+		ret = ipath_sd7220_ib_load(dd);
 		if (ret < 0) {
-			ipath_dev_err(dd, "Failed to set IB SERDES presets\n");
+			ipath_dev_err(dd, "Failed to load IB SERDES image\n");
 			ret = 1;
-			break;
+			goto done;
 		}
-		ret = ipath_sd_trimself(dd, 0x80);
-		if (ret < 0) {
-			ipath_dev_err(dd, "Failed to set IB SERDES TRIMSELF\n");
+
+		/* Loaded image, try to verify */
+		vfy = ipath_sd7220_ib_vfy(dd);
+		if (vfy != ret) {
+			ipath_dev_err(dd, "SERDES PRAM VFY failed\n");
 			ret = 1;
-			break;
+			goto done;
 		}
+		/*
+		 * Loaded and verified. Almost good...
+		 * hold "success" in ret
+		 */
+		ret = 0;
 
-		/* Load image, then try to verify */
-		ret = 0;	/* Assume success */
-		if (first_reset) {
-			ipath_dbg("SerDes uC was reset, reloading PRAM\n");
-			ret = ipath_sd7220_ib_load(dd);
-			if (ret < 0) {
-				ipath_dev_err(dd,
-					"Failed to load IB SERDES image\n");
-				break;
-			} else {
-				/* Loaded image, try to verify */
-				int vfy;
-
-				vfy = ipath_sd7220_ib_vfy(dd);
-				if (vfy != ret) {
-					ipath_dev_err(dd,
-						"SERDES PRAM VFY failed\n");
-					ret = 1;
-				} else {
-					/*
-					 * Loaded and verified. Almost good...
-					 * hold "success" in ret
-					 */
-					ret = 0;
-				} /* end if verified */
-			} /* end if loaded */
-		} /* end if first_reset */
-	} while (0) ; /* do_while for goto-less bailing */
-
-	if (ret == 0 && first_reset) {
 		/*
 		 * Prev steps all worked, continue bringup
 		 * De-assert RESET to uC, only in first reset, to allow
 		 * trimming.
-		 */
-		int trim_done;
-
-		/*
+		 *
 		 * Since our default setup sets START_EQ1 to
 		 * PRESET, we need to clear that for this very first run.
 		 */
@@ -478,7 +469,7 @@ int ipath_sd7220_init(struct ipath_devdata *dd, int was_reset)
 		ipath_ibsd_reset(dd, 0);
 		/*
 		 * If this is not the first reset, trimdone should be set
-		 * already. We may need to check about this.
+		 * already.
 		 */
 		trim_done = ipath_sd_trimdone_poll(dd);
 		/*
@@ -493,25 +484,19 @@ int ipath_sd7220_init(struct ipath_devdata *dd, int was_reset)
 			ret = 1;
 			goto done;
 		}
-		/*
-		 * DEBUG: check each time we reset if trimdone bits have
-		 * gotten cleared, and re-set them.
-		 */
+
 		ipath_sd_trimdone_monitor(dd, "First-reset");
 		/* Remember so we do not re-do the load, dactrim, etc. */
 		dd->serdes_first_init_done = 1;
 	}
-	if (ret == 0) {
-		/*
-		 * setup for channel training and load values for
-		 * RxEq and DDS in tables used by IBC in IB1.2 mode
-		 */
-		int val_stat;
+	/*
+	 * Setup for channel training and load values for
+	 * RxEq and DDS in tables used by IBC in IB1.2 mode.
+	 */
 
-		val_stat = ipath_sd_setvals(dd);
-		if (val_stat < 0)
-			ret = 1;
-	}
+	val_stat = ipath_sd_setvals(dd);
+	if (val_stat < 0)
+		ret = 1;
 done:
 	/* start relock timer regardless, but start at 1 second */
 	ipath_set_relock_poll(dd, -1);
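
Structurally, this function loses its old do { ... break; } while (0) bail-out construct in favor of straight-line code that jumps to a label, the conventional kernel idiom for single-exit error handling. The two shapes in miniature (step_one/step_two are hypothetical):

static int step_one(void);	/* hypothetical */
static int step_two(void);	/* hypothetical */

static int before_sketch(void)
{
	int ret;

	do {				/* one-shot loop, exit via break */
		ret = step_one();
		if (ret < 0)
			break;
		ret = step_two();
	} while (0);
	return ret;
}

static int after_sketch(void)
{
	int ret;

	ret = step_one();
	if (ret < 0)
		goto done;		/* single exit via the label */
	ret = step_two();
done:
	return ret;
}
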
@@ -573,9 +558,6 @@ static int epb_access(struct ipath_devdata *dd, int sdnum, int claim)
 		 */
 		u64 newval = 0;
 		ipath_write_kreg(dd, acc, newval);
-		/* WHAT IS APPROPRIATE DELAY? How do we handle
-		 * failures?
-		 */
 		/* First read after write is not trustworthy */
 		pollval = ipath_read_kreg32(dd, acc);
 		udelay(5);
@@ -587,7 +569,6 @@ static int epb_access(struct ipath_devdata *dd, int sdnum, int claim)
 		u64 pollval;
 		u64 newval = EPB_ACC_REQ | oct_sel;
 		ipath_write_kreg(dd, acc, newval);
-		/* WHAT IS APPROPRIATE DELAY? How do we handle failures? */
 		/* First read after write is not trustworthy */
 		pollval = ipath_read_kreg32(dd, acc);
 		udelay(5);
@@ -772,8 +753,8 @@ static int ipath_sd7220_ram_xfer(struct ipath_devdata *dd, int sdnum, u32 loc,
 	 * In future code, we may need to distinguish several address ranges,
 	 * and select various memories based on this. For now, just trim
 	 * "loc" (location including address and memory select) to
-	 *  "addr" (address within memory). we will only support PRAM
-	 *  The memory is 8KB.
+	 * "addr" (address within memory). we will only support PRAM
+	 * The memory is 8KB.
 	 */
 	addr = loc & 0x1FFF;
 	for (tries = EPB_TRANS_TRIES; tries; --tries) {
@@ -784,7 +765,6 @@ static int ipath_sd7220_ram_xfer(struct ipath_devdata *dd, int sdnum, u32 loc,
 	}
 
 	sofar = 0;
-	/* If failed to see ready, what error-handling? */
 	if (tries <= 0)
 		ipath_dbg("No initial RDY on EPB access request\n");
 	else {
@@ -908,10 +888,7 @@ int ipath_sd7220_prog_vfy(struct ipath_devdata *dd, int sdnum,
 	return errors ? -errors : sofar;
 }
 
-/*
- * Future driver may use IRQ to sequence SERDES and IBC bringup, but
- * for now we poll.
- */
+/* IRQ not set up at this point in init, so we poll. */
 #define IB_SERDES_TRIM_DONE (1ULL << 11)
 #define TRIM_TMO (30)
 
@@ -944,45 +921,15 @@ static int ipath_sd_trimdone_poll(struct ipath_devdata *dd)
 #define TX_FAST_ELT (9)
 
 /*
- * Set the "register patch" values for SERDES. These are referenced,
- * indirectly, by the contents of the SerDesDDSRXEQ[] array. Details
- * are beyond the scope of this document, but in essence, there are
- * two classes of registers that are "tweaked" during operation,
- * Driver DeEmphasis (DDS) and Receiver Equalization. The first
- * 'm' (currently 6) entries specify registers for DDS, and the next
- * 'n' (currently 15) entries specify registers for RxEQ. In operation,
- * the hardware traverses the list for each, forming an index into
- * IBSerDesMappTable[] in one of two ways:
- * DDS entries:
- * (0 << 8) | (dds_val) << 4) | (index)
- * RxEQ entries:
- * (1 << 8) | (rxeq_val << 6) | (0 << 5) | (index)
- *  Where (index) is the index in the SerDesDDSRXEQ[] array, and
- * dds_val (4 bits) or rxeq_val (2 bits) are based on conditions in
- * the SerDes and IBC.
- *
- * With the data coming from one place, and the addresses coming from
- * another, it can get confusing, but the _intent_ is to follow the
- * initialization sequence described in Infinband_REG_Prcedure_10_17_06.pdf.
- * This is somewhat complicated by the fact that although the doc
- * says "write these registers in this order", in fact the order
- * is not relevant (per the vendor). In an attempt to make it somewhat
- * easier on human maintainers, the table below is organized as 16
- * rows, corresponding to one of the rows in the vendor's table.
- * Each row has amplitude, Main_control, Post-curser, and Pre-curser values
- * for "full" (DDR) and "half" (SDR) rate. These are merged into an array
- * of six bytes, in the order they are actually to be stored, to the
- * Registers of element 9, Rges 0, 1, 9, 0xA, 6, 7, in that order
- *
- * Also, because the vendor specifies that the "Enable" bits are set in
- * every case, we do that in the macro. That way the values can be
- * visually compared with vendor docs.
+ * Set the "negotiation" values for SERDES. These are used by the IB1.2
+ * link negotiation. The macros below are an attempt to keep the values a
+ * little more human-editable.
+ * First, values related to Drive De-emphasis Settings.
  */
 
 #define NUM_DDS_REGS 6
 #define DDS_REG_MAP 0x76A910 /* LSB-first list of regs (in elt 9) to mod */
 
-/* Ignore values from vendor. Use compromise values in all slots */
 #define DDS_VAL(amp_d, main_d, ipst_d, ipre_d, amp_s, main_s, ipst_s, ipre_s) \
 	{ { ((amp_d & 0x1F) << 1) | 1, ((amp_s & 0x1F) << 1) | 1, \
 	  (main_d << 3) | 4 | (ipre_d >> 2), \
@@ -1014,22 +961,11 @@ static struct dds_init {
 	DDS_VAL(28, 25,  6, 0, 21, 28,  3, 0),
 	DDS_VAL(27, 26,  5, 0, 19, 29,  2, 0),
 	DDS_VAL(25, 27,  4, 0, 17, 30,  1, 0)
-/*
- * Below is 17th value from vendor. IBC only handles 16 values, and uses
- * first one as default. The line below just documents what we would use.
- *	DDS_VAL(23, 28,  3, 0,  15, 31, 0, 0 )
- */
 };
 
 /*
- * Now the RXEQ section of the table. This is both simpler and
- * more complicated. Simpler because each "register store" has only
- * four valuess associated with it (only two bits of RxEqualization).
- * So, unlike the DDS values above, we simply enumerate all four
- * values across one "line", which corresponds to one register-write.
- * More complicated because there are several register-writes that do
- * not differ across RXEQ vals.
- * Values below are from Vendor doc dated 7May2007
+ * Next, values related to Receive Equalization.
+ * In comments, FDR (Full) is IB DDR, HDR (Half) is IB SDR
  */
 /* Hardware packs an element number and register address thus: */
 #define RXEQ_INIT_RDESC(elt, addr) (((elt) & 0xF) | ((addr) << 4))
@@ -1044,11 +980,6 @@ static struct dds_init {
 #define RXEQ_SDR_G1CNT_Z1CNT 0x11
 #define RXEQ_SDR_ZCNT 23
 
-/*
- * The values below (as opposed to what "was") were experimentally determined
- * to reduce IB Symbol errors, but currently all four "sets" are the same.
- * with more experimentation, we will derive a range.
- */
 static struct rxeq_init {
 	u16 rdesc;	/* in form used in SerDesDDSRXEQ */
 	u8  rdata[4];
@@ -1056,17 +987,17 @@ static struct rxeq_init {
 	/* Set Rcv Eq. to Preset node */
 	RXEQ_VAL_ALL(7, 0x27, 0x10),
 	/* Set DFELTHFDR/HDR thresholds */
-	RXEQ_VAL(7, 8,    0, 0, 0, 0), /* FDR, was 0, 1, 2, 3 */
+	RXEQ_VAL(7, 8,    0, 0, 0, 0), /* FDR */
 	RXEQ_VAL(7, 0x21, 0, 0, 0, 0), /* HDR */
 	/* Set TLTHFDR/HDR threshold */
-	RXEQ_VAL(7, 9,    2, 2, 2, 2), /* FDR, was 0, 2, 4, 6 */
-	RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR, was  0, 1, 2, 3 */
+	RXEQ_VAL(7, 9,    2, 2, 2, 2), /* FDR */
+	RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR */
 	/* Set Preamp setting 2 (ZFR/ZCNT) */
-	RXEQ_VAL(7, 0x1B, 12, 12, 12, 12), /* FDR, was 12, 16, 20, 24 */
-	RXEQ_VAL(7, 0x1C, 12, 12, 12, 12), /* HDR, was 12, 16, 20, 24 */
+	RXEQ_VAL(7, 0x1B, 12, 12, 12, 12), /* FDR */
+	RXEQ_VAL(7, 0x1C, 12, 12, 12, 12), /* HDR */
 	/* Set Preamp DC gain and Setting 1 (GFR/GHR) */
-	RXEQ_VAL(7, 0x1E, 0x10, 0x10, 0x10, 0x10), /* FDR, was 0x10, 0x11, 0x12, 0x14 */
-	RXEQ_VAL(7, 0x1F, 0x10, 0x10, 0x10, 0x10), /* HDR, was 0x10, 0x11, 0x12, 0x14 */
+	RXEQ_VAL(7, 0x1E, 0x10, 0x10, 0x10, 0x10), /* FDR */
+	RXEQ_VAL(7, 0x1F, 0x10, 0x10, 0x10, 0x10), /* HDR */
 	/* Toggle RELOCK (in VCDL_CTRL0) to lock to data */
 	RXEQ_VAL_ALL(6, 6, 0x20), /* Set D5 High */
 	RXEQ_VAL_ALL(6, 6, 0), /* Set D5 Low */
@@ -1171,13 +1102,13 @@ static int ibsd_sto_noisy(struct ipath_devdata *dd, int loc, int val, int mask)
 	return ret;
 }
 
-/* Repeat a "store" across all channels of the IB SerDes.
+/*
+ * Repeat a "store" across all channels of the IB SerDes.
  * Although nominally it inherits the "read value" of the last
  * channel it modified, the only really useful return is <0 for
  * failure, >= 0 for success. The parameter 'loc' is assumed to
  * be the location for the channel-0 copy of the register to
- * be modified. In future, we could use the "gang write" option
- * of EPB, as long as we do not read.
+ * be modified.
  */
 static int ibsd_mod_allchnls(struct ipath_devdata *dd, int loc, int val,
 	int mask)
@@ -1186,7 +1117,8 @@ static int ibsd_mod_allchnls(struct ipath_devdata *dd, int loc, int val,
 	int chnl;
 
 	if (loc & EPB_GLOBAL_WR) {
-		/* our caller has assured us that we can set all four
+		/*
+		 * Our caller has assured us that we can set all four
 		 * channels at once. Trust that. If mask is not 0xFF,
 		 * we will read the _specified_ channel for our starting
 		 * value.
@@ -1285,8 +1217,6 @@ static int set_rxeq_vals(struct ipath_devdata *dd, int vsel)
 /*
  * Set the default values (row 0) for DDR Driver Demphasis.
  * we do this initially and whenever we turn off IB-1.2
- * Vendor recommends non-default presets, depending on
- * cable length. Initial testing will assume 3 meter cables.
  * The "default" values for Rx equalization are also stored to
  * SerDes registers. Formerly (and still default), we used set 2.
  * For experimenting with cables and link-partners, we allow changing
@@ -1295,7 +1225,8 @@ static int set_rxeq_vals(struct ipath_devdata *dd, int vsel)
 static unsigned ipath_rxeq_set = 2;
 module_param_named(rxeq_default_set, ipath_rxeq_set, uint,
 	S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(rxeq_default_set, "Which set [0..3] of Rx Equalization values is default");
+MODULE_PARM_DESC(rxeq_default_set,
+	"Which set [0..3] of Rx Equalization values is default");
 
 static int ipath_internal_presets(struct ipath_devdata *dd)
 {
@@ -1337,44 +1268,33 @@ static int ipath_sd_early(struct ipath_devdata *dd)
 	int ret = -1; /* Default failed */
 	int chnl;
 
-	do {
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, RXHSCTRL0(chnl), 0xD4, 0xFF);
-			if (ret < 0)
-				break;
-		}
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, RXHSCTRL0(chnl), 0xD4, 0xFF);
 		if (ret < 0)
-			break;
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, VCDL_DAC2(chnl), 0x2D, 0xFF);
-			if (ret < 0)
-				break;
-		}
+			goto bail;
+	}
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, VCDL_DAC2(chnl), 0x2D, 0xFF);
 		if (ret < 0)
-			break;
-		/* more fine-tuning of what will be default */
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, VCDL_CTRL2(chnl), 3, 0xF);
-			if (ret < 0)
-				break;
-		}
+			goto bail;
+	}
+	/* more fine-tuning of what will be default */
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, VCDL_CTRL2(chnl), 3, 0xF);
 		if (ret < 0)
-			break;
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, START_EQ1(chnl), 0x10, 0xFF);
-			if (ret < 0)
-				break;
-		}
+			goto bail;
+	}
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, START_EQ1(chnl), 0x10, 0xFF);
 		if (ret < 0)
-			break;
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, START_EQ2(chnl), 0x30, 0xFF);
-			if (ret < 0)
-				break;
-		}
+			goto bail;
+	}
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, START_EQ2(chnl), 0x30, 0xFF);
 		if (ret < 0)
-			break;
-	} while (0);
+			goto bail;
+	}
+bail:
 	return ret;
 }
 
@@ -1387,42 +1307,32 @@ static int ipath_sd_dactrim(struct ipath_devdata *dd)
 	int ret = -1; /* Default failed */
 	int chnl;
 
-	do {
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, BACTRL(chnl), 0x40, 0xFF);
-			if (ret < 0)
-				break;
-		}
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, BACTRL(chnl), 0x40, 0xFF);
 		if (ret < 0)
-			break;
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, LDOUTCTRL1(chnl), 0x04, 0xFF);
-			if (ret < 0)
-				break;
-		}
+			goto bail;
+	}
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, LDOUTCTRL1(chnl), 0x04, 0xFF);
 		if (ret < 0)
-			break;
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, RXHSSTATUS(chnl), 0x04, 0xFF);
-			if (ret < 0)
-				break;
-		}
+			goto bail;
+	}
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, RXHSSTATUS(chnl), 0x04, 0xFF);
 		if (ret < 0)
-			break;
-		/*
-		 * delay for max possible number of steps, with slop.
-		 * Each step is about 4usec.
-		 */
-		udelay(415);
-		for (chnl = 0; chnl < 4; ++chnl) {
-			ret = ibsd_sto_noisy(dd, LDOUTCTRL1(chnl), 0x00, 0xFF);
-			if (ret < 0)
-				break;
-		}
+			goto bail;
+	}
+	/*
+	 * delay for max possible number of steps, with slop.
+	 * Each step is about 4usec.
+	 */
+	udelay(415);
+	for (chnl = 0; chnl < 4; ++chnl) {
+		ret = ibsd_sto_noisy(dd, LDOUTCTRL1(chnl), 0x00, 0xFF);
 		if (ret < 0)
-			break;
-	} while (0);
-
+			goto bail;
+	}
+bail:
 	return ret;
 }
 
@@ -1495,19 +1405,16 @@ static void ipath_run_relock(unsigned long opaque)
 	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
 	ltstate = ipath_ib_linktrstate(dd, val);
 
-	/* Below check was <= CFGDEBOUNCE, JBR requests change for test */
 	if (ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT
 		&& ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
 		int timeoff;
 		/* Not up yet. Try again, if allowed by module-param */
 		if (ipath_relock_by_timer) {
-			if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
+			if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG)
 				ipath_cdbg(VERBOSE, "Skip RELOCK in AUTONEG\n");
-			} else if (!(dd->ipath_flags &
-					IPATH_IB_LINK_DISABLED)) {
+			else if (!(dd->ipath_flags & IPATH_IB_LINK_DISABLED)) {
 				ipath_cdbg(VERBOSE, "RELOCK\n");
 				ipath_toggle_rclkrls(dd);
-
 			}
 		}
 		/* re-set timer for next check */
@@ -1528,7 +1435,7 @@ void ipath_set_relock_poll(struct ipath_devdata *dd, int ibup)
 	struct ipath_relock *irp = &dd->ipath_relock_singleton;
 
 	if (ibup > 0) {
-		/* we are now up, so squelch timer */
+		/* we are now up, so relax timer to 1 second interval */
 		if (atomic_read(&irp->ipath_relock_timer_active))
 			mod_timer(&irp->ipath_relock_timer, jiffies + HZ);
 	} else {
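
[Editor's note: the serdes hunks above rewrite the ibsd_sto_noisy() callers to drop the do { ... } while (0) idiom, whose nested breaks only escaped the inner channel loop and needed a second check to leave the outer block, in favor of a single bail: label. A minimal standalone sketch of the same transformation, with a hypothetical write_reg() standing in for ibsd_sto_noisy():

/*
 * Sketch of the do-while(0) -> goto conversion.  write_reg() is a
 * hypothetical stand-in for ibsd_sto_noisy(); the point is the
 * single error exit.
 */
#include <stdio.h>

static int write_reg(int chnl, int val)
{
	return (chnl >= 0 && val >= 0) ? 0 : -1;	/* pretend success */
}

static int program_all_channels(void)
{
	int ret = -1;	/* default failed, as in the driver */
	int chnl;

	for (chnl = 0; chnl < 4; ++chnl) {
		ret = write_reg(chnl, 0x2D);
		if (ret < 0)
			goto bail;	/* one exit, no nested breaks */
	}
	for (chnl = 0; chnl < 4; ++chnl) {
		ret = write_reg(chnl, 0x10);
		if (ret < 0)
			goto bail;
	}
bail:
	return ret;
}

int main(void)
{
	printf("ret = %d\n", program_all_channels());
	return 0;
}

Every failure now takes the same exit path, which is the usual kernel error-handling convention.]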
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
index 3697449..85fdc98 100644
--- a/drivers/infiniband/hw/ipath/ipath_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_sdma.c
@@ -131,76 +131,28 @@ int ipath_sdma_make_progress(struct ipath_devdata *dd)
 			dd->ipath_sdma_descq_head = 0;
 
 		if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
-			/* move to notify list */
+			/* remove from active list */
+			list_del_init(&txp->list);
+			if (txp->callback)
+				(*txp->callback)(txp->callback_cookie,
+						 IPATH_SDMA_TXREQ_S_OK);
 			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
 				vl15_watchdog_deq(dd);
-			list_move_tail(lp, &dd->ipath_sdma_notifylist);
 			if (!list_empty(&dd->ipath_sdma_activelist)) {
 				lp = dd->ipath_sdma_activelist.next;
 				txp = list_entry(lp, struct ipath_sdma_txreq,
 						 list);
 				start_idx = txp->start_idx;
-			} else {
-				lp = NULL;
+			} else
 				txp = NULL;
-			}
 		}
 		progress = 1;
 	}
 
-	if (progress)
-		tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
-
 done:
 	return progress;
 }
 
-static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
-{
-	struct ipath_sdma_txreq *txp, *txp_next;
-
-	list_for_each_entry_safe(txp, txp_next, list, list) {
-		list_del_init(&txp->list);
-
-		if (txp->callback)
-			(*txp->callback)(txp->callback_cookie,
-					 txp->callback_status);
-	}
-}
-
-static void sdma_notify_taskbody(struct ipath_devdata *dd)
-{
-	unsigned long flags;
-	struct list_head list;
-
-	INIT_LIST_HEAD(&list);
-
-	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-	list_splice_init(&dd->ipath_sdma_notifylist, &list);
-
-	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-	ipath_sdma_notify(dd, &list);
-
-	/*
-	 * The IB verbs layer needs to see the callback before getting
-	 * the call to ipath_ib_piobufavail() because the callback
-	 * handles releasing resources the next send will need.
-	 * Otherwise, we could do these calls in
-	 * ipath_sdma_make_progress().
-	 */
-	ipath_ib_piobufavail(dd->verbs_dev);
-}
-
-static void sdma_notify_task(unsigned long opaque)
-{
-	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-	if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-		sdma_notify_taskbody(dd);
-}
-
 static void dump_sdma_state(struct ipath_devdata *dd)
 {
 	unsigned long reg;
@@ -258,19 +210,14 @@ static void sdma_abort_task(unsigned long opaque)
 	if (status == IPATH_SDMA_ABORT_ABORTED) {
 		struct ipath_sdma_txreq *txp, *txpnext;
 		u64 hwstatus;
-		int notify = 0;
 
 		hwstatus = ipath_read_kreg64(dd,
 				dd->ipath_kregs->kr_senddmastatus);
 
-		if (/* ScoreBoardDrainInProg */
-		    test_bit(63, &hwstatus) ||
-		    /* AbortInProg */
-		    test_bit(62, &hwstatus) ||
-		    /* InternalSDmaEnable */
-		    test_bit(61, &hwstatus) ||
-		    /* ScbEmpty */
-		    !test_bit(30, &hwstatus)) {
+		if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
+				 IPATH_SDMA_STATUS_ABORT_IN_PROG	     |
+				 IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
+		    !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
 			if (dd->ipath_sdma_reset_wait > 0) {
 				/* not done shutting down sdma */
 				--dd->ipath_sdma_reset_wait;
@@ -284,14 +231,13 @@ static void sdma_abort_task(unsigned long opaque)
 		/* dequeue all "sent" requests */
 		list_for_each_entry_safe(txp, txpnext,
 					 &dd->ipath_sdma_activelist, list) {
-			txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
+			list_del_init(&txp->list);
+			if (txp->callback)
+				(*txp->callback)(txp->callback_cookie,
+						 IPATH_SDMA_TXREQ_S_ABORTED);
 			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
 				vl15_watchdog_deq(dd);
-			list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
-			notify = 1;
 		}
-		if (notify)
-			tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
 
 		/* reset our notion of head and tail */
 		dd->ipath_sdma_descq_tail = 0;
@@ -345,7 +291,7 @@ resched:
 	 * state change
 	 */
 	if (jiffies > dd->ipath_sdma_abort_jiffies) {
-		ipath_dbg("looping with status 0x%016llx\n",
+		ipath_dbg("looping with status 0x%08lx\n",
 			  dd->ipath_sdma_status);
 		dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
 	}
@@ -484,10 +430,7 @@ int setup_sdma(struct ipath_devdata *dd)
 			 senddmabufmask[2]);
 
 	INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
-	INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
 
-	tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
-		     (unsigned long) dd);
 	tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
 		     (unsigned long) dd);
 
@@ -524,7 +467,6 @@ void teardown_sdma(struct ipath_devdata *dd)
 	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
 
 	tasklet_kill(&dd->ipath_sdma_abort_task);
-	tasklet_kill(&dd->ipath_sdma_notify_task);
 
 	/* turn off sdma */
 	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
@@ -538,15 +480,15 @@ void teardown_sdma(struct ipath_devdata *dd)
 	/* dequeue all "sent" requests */
 	list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
 				 list) {
-		txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
+		list_del_init(&txp->list);
+		if (txp->callback)
+			(*txp->callback)(txp->callback_cookie,
+					 IPATH_SDMA_TXREQ_S_SHUTDOWN);
 		if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
 			vl15_watchdog_deq(dd);
-		list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
 	}
 	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
 
-	sdma_notify_taskbody(dd);
-
 	del_timer_sync(&dd->ipath_sdma_vl15_timer);
 
 	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
@@ -615,7 +557,7 @@ void ipath_restart_sdma(struct ipath_devdata *dd)
 	}
 	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
 	if (!needed) {
-		ipath_dbg("invalid attempt to restart SDMA, status 0x%016llx\n",
+		ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
 			dd->ipath_sdma_status);
 		goto bail;
 	}
@@ -702,10 +644,8 @@ retry:
 
 	addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
 			      tx->map_len, DMA_TO_DEVICE);
-	if (dma_mapping_error(addr)) {
-		ret = -EIO;
-		goto unlock;
-	}
+	if (dma_mapping_error(addr))
+		goto ioerr;
 
 	dwoffset = tx->map_len >> 2;
 	make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
@@ -745,6 +685,8 @@ retry:
 		dw = (len + 3) >> 2;
 		addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
 				      DMA_TO_DEVICE);
+		if (dma_mapping_error(addr))
+			goto unmap;
 		make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
 		/* SDmaUseLargeBuf has to be set in every descriptor */
 		if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
@@ -765,7 +707,7 @@ retry:
 		if (sge->sge_length == 0) {
 			if (--ss->num_sge)
 				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
+		} else if (sge->length == 0 && sge->mr->lkey) {
 			if (++sge->n >= IPATH_SEGSZ) {
 				if (++sge->m >= sge->mr->mapsz)
 					break;
@@ -791,18 +733,28 @@ retry:
 		descqp[0] |= __constant_cpu_to_le64(1ULL << 15);
 	}
 
+	tx->txreq.next_descq_idx = tail;
+	dd->ipath_sdma_descq_tail = tail;
 	/* Commit writes to memory and advance the tail on the chip */
 	wmb();
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
 
-	tx->txreq.next_descq_idx = tail;
-	tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
-	dd->ipath_sdma_descq_tail = tail;
 	dd->ipath_sdma_descq_added += tx->txreq.sg_count;
 	list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
 	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
 		vl15_watchdog_enq(dd);
-
+	goto unlock;
+
+unmap:
+	while (tail != dd->ipath_sdma_descq_tail) {
+		if (!tail)
+			tail = dd->ipath_sdma_descq_cnt - 1;
+		else
+			tail--;
+		unmap_desc(dd, tail);
+	}
+ioerr:
+	ret = -EIO;
 unlock:
 	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
 fail:
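
[Editor's note: two behavioral changes in ipath_sdma.c are worth calling out. Completion callbacks are now invoked directly where a request is taken off the active list, instead of being deferred to the removed notify tasklet, and a DMA-mapping failure partway through building a request now unwinds the descriptor ring via the new unmap: label. A simplified sketch of that backward ring walk, with unmap_one() as a hypothetical stand-in for unmap_desc():

/*
 * Sketch of the new "unmap" rollback: walk the descriptor ring
 * backwards from the partially built tail to the last committed
 * tail, unmapping each entry built for the failed request.
 */
#include <stdio.h>

#define DESCQ_CNT 8

static void unmap_one(unsigned int idx)
{
	printf("unmapping descriptor %u\n", idx);
}

static void unwind(unsigned int tail, unsigned int committed_tail)
{
	while (tail != committed_tail) {
		if (!tail)
			tail = DESCQ_CNT - 1;	/* wrap backwards */
		else
			tail--;
		unmap_one(tail);
	}
}

int main(void)
{
	/* descriptors 6, 7, 0, 1 were built past the committed tail;
	 * they are unmapped in reverse order: 1, 0, 7, 6 */
	unwind(2, 6);
	return 0;
}
]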
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
index b6c809d..f63e143 100644
--- a/drivers/infiniband/hw/ipath/ipath_stats.c
+++ b/drivers/infiniband/hw/ipath/ipath_stats.c
@@ -112,6 +112,14 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
 			dd->ipath_lastrpkts = val;
 		}
 		val64 = dd->ipath_rpkts;
+	} else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
+		if (dd->ibdeltainprog)
+			val64 -= val64 - dd->ibsymsnap;
+		val64 -= dd->ibsymdelta;
+	} else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
+		if (dd->ibdeltainprog)
+			val64 -= val64 - dd->iblnkerrsnap;
+		val64 -= dd->iblnkerrdelta;
 	} else
 		val64 = (u64) val;
 
@@ -308,11 +316,12 @@ void ipath_get_faststats(unsigned long opaque)
 			 * level.
 			 */
 			if (iserr)
-				ipath_dbg("Re-enabling queue full errors (%s)\n",
-					  ebuf);
+				ipath_dbg(
+					"Re-enabling queue full errors (%s)\n",
+					ebuf);
 			else
 				ipath_cdbg(ERRPKT, "Re-enabling packet"
-						" problem interrupt (%s)\n", ebuf);
+					" problem interrupt (%s)\n", ebuf);
 		}
 
 		/* re-enable masked errors */
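
[Editor's note: the new cr_ibsymbolerrcnt/cr_iblinkerrrecovcnt branches hide errors generated while the driver itself is retraining the link. While a measurement is in progress the raw count is pinned to the snapshot taken at its start (val64 -= val64 - snap is just val64 = snap), and the accumulated delta is always subtracted. A worked sketch of the arithmetic, using field names that mirror ibsymsnap/ibsymdelta:

/*
 * Sketch of the symbol-error counter adjustment: pin to the snapshot
 * while a measurement is in progress, always subtract the delta
 * already attributed to link training.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t adjusted_count(uint64_t hw, int inprog,
			       uint64_t snap, uint64_t delta)
{
	if (inprog)
		hw -= hw - snap;	/* i.e. hw = snap */
	return hw - delta;
}

int main(void)
{
	/* 100 raw errors, 40 since the snapshot, 25 already attributed
	 * to training: report 35 while in progress, 75 afterwards. */
	printf("%llu\n", (unsigned long long)adjusted_count(100, 1, 60, 25));
	printf("%llu\n", (unsigned long long)adjusted_count(100, 0, 60, 25));
	return 0;
}
]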
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
index ca1df73..ec04b5d 100644
--- a/drivers/infiniband/hw/ipath/ipath_sysfs.c
+++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c
@@ -164,7 +164,6 @@ static ssize_t show_boardversion(struct device *dev,
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
 }
 
-
 static ssize_t show_localbus_info(struct device *dev,
 			       struct device_attribute *attr,
 			       char *buf)
@@ -1088,13 +1087,13 @@ static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
 static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
 static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
 static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
 static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
 		   show_jint_max_packets, store_jint_max_packets);
 static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
 		   show_jint_idle_ticks, store_jint_idle_ticks);
 static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
 		   show_tempsense, store_tempsense);
-static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
 
 static struct attribute *dev_attributes[] = {
 	&dev_attr_guid.attr,
@@ -1194,7 +1193,6 @@ void ipath_driver_remove_group(struct device_driver *drv)
 int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
 {
 	int ret;
-	char unit[5];
 
 	ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
 	if (ret)
@@ -1204,15 +1202,10 @@ int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
 	if (ret)
 		goto bail_attrs;
 
-	snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit);
-	ret = sysfs_create_link(&dev->driver->kobj, &dev->kobj, unit);
-	if (ret)
-		goto bail_counter;
-
 	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
 		ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
 		if (ret)
-			goto bail_unit;
+			goto bail_counter;
 		ret = device_create_file(dev, &dev_attr_jint_max_packets);
 		if (ret)
 			goto bail_idle;
@@ -1222,14 +1215,12 @@ int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
 			goto bail_max;
 	}
 
-	goto bail;
+	return 0;
 
 bail_max:
 	device_remove_file(dev, &dev_attr_jint_max_packets);
 bail_idle:
 	device_remove_file(dev, &dev_attr_jint_idle_ticks);
-bail_unit:
-	sysfs_remove_link(&dev->driver->kobj, unit);
 bail_counter:
 	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
 bail_attrs:
@@ -1240,11 +1231,6 @@ bail:
 
 void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
 {
-	char unit[5];
-
-	snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit);
-	sysfs_remove_link(&dev->driver->kobj, unit);
-
 	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
 
 	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
index 717c13e..38df44f 100644
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -114,7 +114,7 @@ int ipath_make_uc_req(struct ipath_qp *qp)
 				qp->s_state =
 					OP(SEND_ONLY_WITH_IMMEDIATE);
 				/* Immediate data comes after the BTH */
-				ohdr->u.imm_data = wqe->wr.imm_data;
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
 				hwords += 1;
 			}
 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
@@ -143,7 +143,7 @@ int ipath_make_uc_req(struct ipath_qp *qp)
 				qp->s_state =
 					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
 				/* Immediate data comes after the RETH */
-				ohdr->u.rc.imm_data = wqe->wr.imm_data;
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
 				hwords += 1;
 				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 					bth0 |= 1 << 23;
@@ -172,7 +172,7 @@ int ipath_make_uc_req(struct ipath_qp *qp)
 		else {
 			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
 			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.imm_data;
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
 			hwords += 1;
 		}
 		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
@@ -197,7 +197,7 @@ int ipath_make_uc_req(struct ipath_qp *qp)
 			qp->s_state =
 				OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
 			/* Immediate data comes after the BTH */
-			ohdr->u.imm_data = wqe->wr.imm_data;
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
 			hwords += 1;
 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 				bth0 |= 1 << 23;
@@ -225,6 +225,26 @@ unlock:
 	return ret;
 }
 
+static void fix_mr_refcount(struct ipath_qp *qp)
+{
+	unsigned i;
+
+	if (qp->r_sge.num_sge == qp->s_rdma_read_sge.num_sge)
+		return;
+	while (qp->r_sge.num_sge) {
+		atomic_dec(&qp->r_sge.sge.mr->refcount);
+		if (--qp->r_sge.num_sge)
+			qp->r_sge.sge = *qp->r_sge.sg_list++;
+	}
+	for (i = 0; i < qp->s_rdma_read_sge.num_sge; i++) {
+		struct ipath_sge *sge = i ?
+			&qp->s_rdma_read_sge.sg_list[i - 1] :
+			&qp->s_rdma_read_sge.sge;
+
+		atomic_inc(&sge->mr->refcount);
+	}
+}
+
 /**
  * ipath_uc_rcv - handle an incoming UC packet
  * @dev: the device the packet came in on
@@ -293,6 +313,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		 */
 		qp->r_psn = psn;
 	inv:
+		while (qp->r_sge.num_sge) {
+			atomic_dec(&qp->r_sge.sge.mr->refcount);
+			if (--qp->r_sge.num_sge)
+				qp->r_sge.sge = *qp->r_sge.sg_list++;
+		}
 		qp->r_state = OP(SEND_LAST);
 		switch (opcode) {
 		case OP(SEND_FIRST):
@@ -348,13 +373,13 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	send_first:
 		if (qp->r_flags & IPATH_R_REUSE_SGE) {
 			qp->r_flags &= ~IPATH_R_REUSE_SGE;
+			fix_mr_refcount(qp);
 			qp->r_sge = qp->s_rdma_read_sge;
 		} else if (!ipath_get_rwqe(qp, 0)) {
 			dev->n_pkt_drops++;
 			goto done;
-		}
-		/* Save the WQE so we can reuse it in case of an error. */
-		qp->s_rdma_read_sge = qp->r_sge;
+		} else
+			qp->s_rdma_read_sge = qp->r_sge;
 		qp->r_rcv_len = 0;
 		if (opcode == OP(SEND_ONLY))
 			goto send_last;
@@ -374,17 +399,17 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 			dev->n_pkt_drops++;
 			goto done;
 		}
-		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		ipath_copy_sge(&qp->r_sge, data, pmtu, 1);
 		break;
 
 	case OP(SEND_LAST_WITH_IMMEDIATE):
 	send_last_imm:
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else {
 			/* Immediate data comes after BTH */
-			wc.imm_data = ohdr->u.imm_data;
+			wc.ex.imm_data = ohdr->u.imm_data;
 		}
 		hdrsize += 4;
 		wc.wc_flags = IB_WC_WITH_IMM;
@@ -410,7 +435,12 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		}
 		wc.opcode = IB_WC_RECV;
 	last_imm:
-		ipath_copy_sge(&qp->r_sge, data, tlen);
+		ipath_copy_sge(&qp->r_sge, data, tlen, 1);
+		while (qp->r_sge.num_sge) {
+			atomic_dec(&qp->r_sge.sge.mr->refcount);
+			if (--qp->r_sge.num_sge)
+				qp->r_sge.sge = *qp->r_sge.sg_list++;
+		}
 		wc.wr_id = qp->r_wr_id;
 		wc.status = IB_WC_SUCCESS;
 		wc.qp = &qp->ibqp;
@@ -437,21 +467,23 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		hdrsize += sizeof(*reth);
 		qp->r_len = be32_to_cpu(reth->length);
 		qp->r_rcv_len = 0;
+		qp->r_sge.sg_list = NULL;
 		if (qp->r_len != 0) {
 			u32 rkey = be32_to_cpu(reth->rkey);
 			u64 vaddr = be64_to_cpu(reth->vaddr);
 			int ok;
 
 			/* Check rkey */
-			ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
+			ok = ipath_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
 					   vaddr, rkey,
 					   IB_ACCESS_REMOTE_WRITE);
 			if (unlikely(!ok)) {
 				dev->n_pkt_drops++;
 				goto done;
 			}
+			qp->r_sge.num_sge = 1;
 		} else {
-			qp->r_sge.sg_list = NULL;
+			qp->r_sge.num_sge = 0;
 			qp->r_sge.sge.mr = NULL;
 			qp->r_sge.sge.vaddr = NULL;
 			qp->r_sge.sge.length = 0;
@@ -478,17 +510,17 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 			dev->n_pkt_drops++;
 			goto done;
 		}
-		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		ipath_copy_sge(&qp->r_sge, data, pmtu, 1);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
 	rdma_last_imm:
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else {
 			/* Immediate data comes after BTH */
-			wc.imm_data = ohdr->u.imm_data;
+			wc.ex.imm_data = ohdr->u.imm_data;
 		}
 		hdrsize += 4;
 		wc.wc_flags = IB_WC_WITH_IMM;
@@ -533,7 +565,12 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 			dev->n_pkt_drops++;
 			goto done;
 		}
-		ipath_copy_sge(&qp->r_sge, data, tlen);
+		ipath_copy_sge(&qp->r_sge, data, tlen, 1);
+		while (qp->r_sge.num_sge) {
+			atomic_dec(&qp->r_sge.sge.mr->refcount);
+			if (--qp->r_sge.num_sge)
+				qp->r_sge.sge = *qp->r_sge.sg_list++;
+		}
 		break;
 
 	default:
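
[Editor's note: the UC receive path now holds a reference on each memory region reachable through qp->r_sge, so the new fix_mr_refcount() must rebalance references when the saved SGE list is reused after an error: drop what the aborted receive still holds, then re-pin everything in the saved list. A toy sketch of that transfer, with a plain counter standing in for ipath_mregion.refcount (the driver uses atomic_inc/atomic_dec):

/*
 * Sketch of the reference transfer behind fix_mr_refcount():
 * release one SGE list's references, take references for the
 * list about to be reused.
 */
#include <stdio.h>

struct region { int refcount; };

static void drop_all(struct region **list, unsigned int n)
{
	unsigned int i;
	for (i = 0; i < n; i++)
		list[i]->refcount--;
}

static void take_all(struct region **list, unsigned int n)
{
	unsigned int i;
	for (i = 0; i < n; i++)
		list[i]->refcount++;
}

int main(void)
{
	struct region a = { .refcount = 2 }, b = { .refcount = 1 };
	struct region *cur[] = { &a }, *saved[] = { &a, &b };

	drop_all(cur, 1);	/* release what the aborted receive held */
	take_all(saved, 2);	/* pin the saved WQE's regions for reuse */
	printf("a=%d b=%d\n", a.refcount, b.refcount);	/* a=2 b=2 */
	return 0;
}
]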
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
index 64e0c9a..0bf7a96 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -54,6 +54,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 	unsigned long flags;
 	struct ipath_rq *rq;
 	struct ipath_srq *srq;
+	struct ipath_sge_state ssge;
 	struct ipath_sge_state rsge;
 	struct ipath_sge *sge;
 	struct ipath_rwq *wq;
@@ -70,8 +71,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 		goto done;
 	}
 
-	rsge.sg_list = NULL;
-
 	/*
 	 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
 	 * Qkeys with the high order bit set mean use the
@@ -96,7 +95,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 
 	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
 		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.imm_data = swqe->wr.imm_data;
+		wc.ex.imm_data = swqe->wr.ex.imm_data;
 	}
 
 	/*
@@ -115,21 +114,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 		rq = &qp->r_rq;
 	}
 
-	if (rq->max_sge > 1) {
-		/*
-		 * XXX We could use GFP_KERNEL if ipath_do_send()
-		 * was always called from the tasklet instead of
-		 * from ipath_post_send().
-		 */
-		rsge.sg_list = kmalloc((rq->max_sge - 1) *
-					sizeof(struct ipath_sge),
-				       GFP_ATOMIC);
-		if (!rsge.sg_list) {
-			dev->n_pkt_drops++;
-			goto drop;
-		}
-	}
-
 	/*
 	 * Get the next work request entry to find where to put the data.
 	 * Note that it is safe to drop the lock after changing rq->tail
@@ -147,14 +131,21 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 		goto drop;
 	}
 	wqe = get_rwqe_ptr(rq, tail);
-	if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
+	rsge.sg_list = qp->r_ud_sg_list;
+	if (unlikely(!ipath_init_sge(qp, wqe, &rlen, &rsge))) {
 		spin_unlock_irqrestore(&rq->lock, flags);
 		dev->n_pkt_drops++;
 		goto drop;
 	}
 	/* Silently drop packets which are too big. */
-	if (wc.byte_len > rlen) {
+	if (unlikely(wc.byte_len > rlen)) {
+		unsigned i;
+
 		spin_unlock_irqrestore(&rq->lock, flags);
+		for (i = 0; i < rsge.num_sge; i++) {
+			sge = i ? &rsge.sg_list[i - 1] : &rsge.sge;
+			atomic_dec(&sge->mr->refcount);
+		}
 		dev->n_pkt_drops++;
 		goto drop;
 	}
@@ -192,11 +183,14 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 
 	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
 	if (ah_attr->ah_flags & IB_AH_GRH) {
-		ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
+		ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh), 1);
 		wc.wc_flags |= IB_WC_GRH;
 	} else
-		ipath_skip_sge(&rsge, sizeof(struct ib_grh));
-	sge = swqe->sg_list;
+		ipath_skip_sge(&rsge, sizeof(struct ib_grh), 1);
+	ssge.sg_list = swqe->sg_list + 1;
+	ssge.sge = *swqe->sg_list;
+	ssge.num_sge = swqe->wr.num_sge;
+	sge = &ssge.sge;
 	while (length) {
 		u32 len = sge->length;
 
@@ -205,14 +199,14 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 		if (len > sge->sge_length)
 			len = sge->sge_length;
 		BUG_ON(len == 0);
-		ipath_copy_sge(&rsge, sge->vaddr, len);
+		ipath_copy_sge(&rsge, sge->vaddr, len, 1);
 		sge->vaddr += len;
 		sge->length -= len;
 		sge->sge_length -= len;
 		if (sge->sge_length == 0) {
-			if (--swqe->wr.num_sge)
-				sge++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (--ssge.num_sge)
+				*sge = *ssge.sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
 			if (++sge->n >= IPATH_SEGSZ) {
 				if (++sge->m >= sge->mr->mapsz)
 					break;
@@ -225,12 +219,17 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 		}
 		length -= len;
 	}
+	while (rsge.num_sge) {
+		atomic_dec(&rsge.sge.mr->refcount);
+		if (--rsge.num_sge)
+			rsge.sge = *rsge.sg_list++;
+	}
 	wc.status = IB_WC_SUCCESS;
 	wc.opcode = IB_WC_RECV;
 	wc.qp = &qp->ibqp;
 	wc.src_qp = sqp->ibqp.qp_num;
-	/* XXX do we know which pkey matched? Only needed for GSI. */
-	wc.pkey_index = 0;
+	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
+		swqe->wr.wr.ud.pkey_index : 0;
 	wc.slid = dev->dd->ipath_lid |
 		(ah_attr->src_path_bits &
 		 ((1 << dev->dd->ipath_lmc) - 1));
@@ -242,7 +241,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
 		       swqe->wr.send_flags & IB_SEND_SOLICITED);
 drop:
-	kfree(rsge.sg_list);
 	if (atomic_dec_and_test(&qp->refcount))
 		wake_up(&qp->wait);
 done:;
@@ -267,6 +265,7 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 	u16 lrh0;
 	u16 lid;
 	int ret = 0;
+	int next_cur;
 
 	spin_lock_irqsave(&qp->s_lock, flags);
 
@@ -290,8 +289,9 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 		goto bail;
 
 	wqe = get_swqe_ptr(qp, qp->s_cur);
-	if (++qp->s_cur >= qp->s_size)
-		qp->s_cur = 0;
+	next_cur = qp->s_cur + 1;
+	if (next_cur >= qp->s_size)
+		next_cur = 0;
 
 	/* Construct the header. */
 	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
@@ -315,6 +315,7 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 				qp->s_flags |= IPATH_S_WAIT_DMA;
 				goto bail;
 			}
+			qp->s_cur = next_cur;
 			spin_unlock_irqrestore(&qp->s_lock, flags);
 			ipath_ud_loopback(qp, wqe);
 			spin_lock_irqsave(&qp->s_lock, flags);
@@ -323,6 +324,7 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 		}
 	}
 
+	qp->s_cur = next_cur;
 	extra_bytes = -wqe->length & 3;
 	nwords = (wqe->length + extra_bytes) >> 2;
 
@@ -355,7 +357,7 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 	}
 	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
 		qp->s_hdrwords++;
-		ohdr->u.ud.imm_data = wqe->wr.imm_data;
+		ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
 		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
 	} else
 		bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
@@ -377,7 +379,8 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 		bth0 |= 1 << 23;
 	bth0 |= extra_bytes << 20;
 	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
-		ipath_get_pkey(dev->dd, qp->s_pkey_index);
+		ipath_get_pkey(dev->dd, qp->ibqp.qp_type == IB_QPT_GSI ?
+				wqe->wr.wr.ud.pkey_index : qp->s_pkey_index);
 	ohdr->bth[0] = cpu_to_be32(bth0);
 	/*
 	 * Use the multicast QP if the destination LID is a multicast LID.
@@ -406,6 +409,23 @@ unlock:
 	return ret;
 }
 
+static unsigned ipath_lookup_pkey(struct ipath_devdata *dd, u16 pkey)
+{
+	unsigned i;
+
+	pkey &= 0x7fff;	/* remove limited/full membership bit */
+
+	for (i = 0; i < ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys); ++i)
+		if ((dd->ipath_pd[0]->port_pkeys[i] & 0x7fff) == pkey)
+			return i;
+
+	/*
+	 * Should not get here; it means the hardware failed to validate pkeys.
+	 * Punt and return index 0.
+	 */
+	return 0;
+}
+
 /**
  * ipath_ud_rcv - receive an incoming UD packet
  * @dev: the device the packet came in on
@@ -493,14 +513,14 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	if (qp->ibqp.qp_num > 1 &&
 	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
 		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
+			wc.ex.imm_data = *(__be32 *) data;
 			data += sizeof(__be32);
 		} else
-			wc.imm_data = ohdr->u.ud.imm_data;
+			wc.ex.imm_data = ohdr->u.ud.imm_data;
 		wc.wc_flags = IB_WC_WITH_IMM;
 		hdrsize += sizeof(u32);
 	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-		wc.imm_data = 0;
+		wc.ex.imm_data = 0;
 		wc.wc_flags = 0;
 	} else {
 		dev->n_pkt_drops++;
@@ -559,12 +579,17 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	}
 	if (has_grh) {
 		ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
-			       sizeof(struct ib_grh));
+			       sizeof(struct ib_grh), 1);
 		wc.wc_flags |= IB_WC_GRH;
 	} else
-		ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
+		ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
 	ipath_copy_sge(&qp->r_sge, data,
-		       wc.byte_len - sizeof(struct ib_grh));
+		       wc.byte_len - sizeof(struct ib_grh), 1);
+	while (qp->r_sge.num_sge) {
+		atomic_dec(&qp->r_sge.sge.mr->refcount);
+		if (--qp->r_sge.num_sge)
+			qp->r_sge.sge = *qp->r_sge.sg_list++;
+	}
 	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
 		goto bail;
 	wc.wr_id = qp->r_wr_id;
@@ -573,8 +598,8 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	wc.vendor_err = 0;
 	wc.qp = &qp->ibqp;
 	wc.src_qp = src_qp;
-	/* XXX do we know which pkey matched? Only needed for GSI. */
-	wc.pkey_index = 0;
+	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
+		ipath_lookup_pkey(dev->dd, be32_to_cpu(ohdr->bth[0])) : 0;
 	wc.slid = be16_to_cpu(hdr->lrh[3]);
 	wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
 	dlid = be16_to_cpu(hdr->lrh[1]);
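
[Editor's note: ipath_lookup_pkey() finally gives GSI completions a real pkey_index in place of the old hard-coded 0: mask off the full/limited-membership bit and scan the port P_Key table, punting to index 0 if nothing matches. A standalone sketch of the lookup over a local table (the driver reads dd->ipath_pd[0]->port_pkeys):

/* Sketch of the new P_Key index lookup. */
#include <stdio.h>
#include <stdint.h>

static unsigned int lookup_pkey(const uint16_t *tbl, unsigned int n,
				uint16_t pkey)
{
	unsigned int i;

	pkey &= 0x7fff;			/* drop the membership bit */
	for (i = 0; i < n; i++)
		if ((tbl[i] & 0x7fff) == pkey)
			return i;
	return 0;			/* punt, as the driver does */
}

int main(void)
{
	uint16_t tbl[] = { 0xffff, 0x8001, 0x7fff };

	printf("%u\n", lookup_pkey(tbl, 3, 0x0001));	/* 1 */
	printf("%u\n", lookup_pkey(tbl, 3, 0xffff));	/* 0 */
	return 0;
}
]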
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
index fb26b03..39eb1fa 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -484,7 +484,8 @@ static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
 			const unsigned long faddr =
 				(unsigned long) iov[idx].iov_base;
 
-			if (slen & 3 || faddr & 3 || !slen || slen > PAGE_SIZE) {
+			if (slen & 3 || faddr & 3 || !slen ||
+			    slen > PAGE_SIZE) {
 				ret = -EINVAL;
 				goto free_pbc;
 			}
@@ -552,6 +553,12 @@ done:
 	return ret;
 }
 
+static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
+						 u32 c)
+{
+	pq->sent_counter = c;
+}
+
 /* try to clean out queue -- needs pq->lock */
 static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
 				       struct ipath_user_sdma_queue *pq)
@@ -665,8 +672,8 @@ static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
 
 static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
 {
-					      /* last */  /* dma head */
-	return descq | __constant_cpu_to_le64(1ULL << 11 | 1ULL << 13);
+					      /* last */
+	return descq | __constant_cpu_to_le64(1ULL << 11);
 }
 
 static inline __le64 ipath_sdma_make_desc1(u64 addr)
@@ -705,6 +712,8 @@ static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
 	int ret = 0;
 	unsigned long flags;
 	u16 tail;
+	u8 generation;
+	u64 descq_added;
 
 	if (list_empty(pktlist))
 		return 0;
@@ -714,6 +723,10 @@ static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
 
 	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
 
+	/* snapshot the ring state so it can be restored on error */
+	generation = dd->ipath_sdma_generation;
+	descq_added = dd->ipath_sdma_descq_added;
+
 	if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
 		ret = -ECOMM;
 		goto unlock;
@@ -756,7 +769,7 @@ static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
 		if (ofs >= IPATH_SMALLBUF_DWORDS) {
 			for (i = 0; i < pkt->naddr; i++) {
 				dd->ipath_sdma_descq[dtail].qw[0] |=
-					1ULL<<14;
+					__constant_cpu_to_le64(1ULL << 14);
 				if (++dtail == dd->ipath_sdma_descq_cnt)
 					dtail = 0;
 			}
@@ -777,6 +790,10 @@ unlock_check_tail:
 	}
 
 unlock:
+	if (unlikely(ret < 0)) {
+		dd->ipath_sdma_generation = generation;
+		dd->ipath_sdma_descq_added = descq_added;
+	}
 	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
 
 	return ret;
@@ -860,26 +877,11 @@ int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
 	return ret;
 }
 
-int ipath_user_sdma_pkt_sent(const struct ipath_user_sdma_queue *pq,
-			     u32 counter)
-{
-	const u32 scounter = ipath_user_sdma_complete_counter(pq);
-	const s32 dcounter = scounter - counter;
-
-	return dcounter >= 0;
-}
-
 u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
 {
 	return pq->sent_counter;
 }
 
-void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
-					  u32 c)
-{
-	pq->sent_counter = c;
-}
-
 u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
 {
 	return pq->counter;
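
[Editor's note: ipath_user_sdma_push_pkts() now snapshots ipath_sdma_generation and ipath_sdma_descq_added before queueing and restores them if the submit fails, so an error leaves no half-applied ring state behind. A simplified, lock-free sketch of that snapshot-and-rollback pattern:

/* Sketch of the push_pkts rollback; field names are illustrative. */
#include <stdio.h>

struct ring { unsigned char generation; unsigned long long added; };

static int push(struct ring *r, int will_fail)
{
	unsigned char gen = r->generation;	/* snapshot */
	unsigned long long added = r->added;
	int ret = 0;

	r->added += 3;				/* speculatively queue */
	if (will_fail)
		ret = -1;
	if (ret < 0) {				/* roll back on error */
		r->generation = gen;
		r->added = added;
	}
	return ret;
}

int main(void)
{
	struct ring r = { .generation = 1, .added = 10 };

	push(&r, 1);
	printf("after failed push: added=%llu\n", r.added);	/* 10 */
	push(&r, 0);
	printf("after good push:   added=%llu\n", r.added);	/* 13 */
	return 0;
}
]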
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
index ce0448f..fc76316 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.h
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
@@ -45,12 +45,8 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd,
 int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
 				  struct ipath_user_sdma_queue *pq);
 
-int ipath_user_sdma_pkt_sent(const struct ipath_user_sdma_queue *pq,
-			     u32 counter);
 void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
 				 struct ipath_user_sdma_queue *pq);
 
 u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
-void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
-					  u32 c);
 u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 7bdcc04..3fc08ae 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -35,6 +35,7 @@
 #include <rdma/ib_user_verbs.h>
 #include <linux/io.h>
 #include <linux/utsname.h>
+#include <linux/rculist.h>
 
 #include "ipath_kernel.h"
 #include "ipath_verbs.h"
@@ -117,7 +118,7 @@ MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
 
 static unsigned int ib_ipath_disable_sma;
 module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(ib_ipath_disable_sma, "Disable the SMA");
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
 
 /*
  * Note that it is OK to post send work requests in the SQE and ERR
@@ -173,7 +174,8 @@ static __be64 sys_image_guid;
  * @data: the data to copy
  * @length: the length of the data
  */
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length,
+		    int release)
 {
 	struct ipath_sge *sge = &ss->sge;
 
@@ -193,9 +195,11 @@ void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
 		sge->length -= len;
 		sge->sge_length -= len;
 		if (sge->sge_length == 0) {
+			if (release)
+				atomic_dec(&sge->mr->refcount);
 			if (--ss->num_sge)
 				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
+		} else if (sge->length == 0 && sge->mr->lkey) {
 			if (++sge->n >= IPATH_SEGSZ) {
 				if (++sge->m >= sge->mr->mapsz)
 					break;
@@ -216,7 +220,7 @@ void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
  * @ss: the SGE state
  * @length: the number of bytes to skip
  */
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length, int release)
 {
 	struct ipath_sge *sge = &ss->sge;
 
@@ -232,9 +236,11 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
 		sge->length -= len;
 		sge->sge_length -= len;
 		if (sge->sge_length == 0) {
+			if (release)
+				atomic_dec(&sge->mr->refcount);
 			if (--ss->num_sge)
 				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
+		} else if (sge->length == 0 && sge->mr->lkey) {
 			if (++sge->n >= IPATH_SEGSZ) {
 				if (++sge->m >= sge->mr->mapsz)
 					break;
@@ -281,7 +287,7 @@ static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
 		if (sge.sge_length == 0) {
 			if (--num_sge)
 				sge = *sg_list++;
-		} else if (sge.length == 0 && sge.mr != NULL) {
+		} else if (sge.length == 0 && sge.mr->lkey) {
 			if (++sge.n >= IPATH_SEGSZ) {
 				if (++sge.m >= sge.mr->mapsz)
 					break;
@@ -320,7 +326,7 @@ static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
 		if (sge->sge_length == 0) {
 			if (--ss->num_sge)
 				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr != NULL) {
+		} else if (sge->length == 0 && sge->mr->lkey) {
 			if (++sge->n >= IPATH_SEGSZ) {
 				if (++sge->m >= sge->mr->mapsz)
 					break;
@@ -350,9 +356,16 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
 	int acc;
 	int ret;
 	unsigned long flags;
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
 
 	spin_lock_irqsave(&qp->s_lock, flags);
 
+	if (qp->ibqp.qp_type != IB_QPT_SMI &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		ret = -ENETDOWN;
+		goto bail;
+	}
+
 	/* Check that state is OK to post send. */
 	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
 		goto bail_inval;
@@ -398,10 +411,11 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
 	wqe = get_swqe_ptr(qp, qp->s_head);
 	wqe->wr = *wr;
 	wqe->length = 0;
+	j = 0;
 	if (wr->num_sge) {
 		acc = wr->opcode >= IB_WR_RDMA_READ ?
 			IB_ACCESS_LOCAL_WRITE : 0;
-		for (i = 0, j = 0; i < wr->num_sge; i++) {
+		for (i = 0; i < wr->num_sge; i++) {
 			u32 length = wr->sg_list[i].length;
 			int ok;
 
@@ -410,7 +424,7 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
 			ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
 					   &wr->sg_list[i], acc);
 			if (!ok)
-				goto bail_inval;
+				goto bail_inval_free;
 			wqe->length += length;
 			j++;
 		}
@@ -419,15 +433,21 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
 	if (qp->ibqp.qp_type == IB_QPT_UC ||
 	    qp->ibqp.qp_type == IB_QPT_RC) {
 		if (wqe->length > 0x80000000U)
-			goto bail_inval;
+			goto bail_inval_free;
 	} else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
-		goto bail_inval;
+		goto bail_inval_free;
 	wqe->ssn = qp->s_ssn++;
 	qp->s_head = next;
 
 	ret = 0;
 	goto bail;
 
+bail_inval_free:
+	while (j) {
+		struct ipath_sge *sge = &wqe->sg_list[--j];
+
+		atomic_dec(&sge->mr->refcount);
+	}
 bail_inval:
 	ret = -EINVAL;
 bail:
@@ -752,7 +772,7 @@ static void ipath_ib_timer(struct ipath_ibdev *dev)
 		resend = qp->timer_next;
 
 		spin_lock_irqsave(&qp->s_lock, flags);
-		if (qp->s_last != qp->s_tail &&
+		if (qp->s_acked != qp->s_tail &&
 		    ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
 			dev->n_timeouts++;
 			ipath_restart_rc(qp, qp->s_last_psn + 1);
@@ -788,7 +808,7 @@ static void update_sge(struct ipath_sge_state *ss, u32 length)
 	if (sge->sge_length == 0) {
 		if (--ss->num_sge)
 			*sge = *ss->sg_list++;
-	} else if (sge->length == 0 && sge->mr != NULL) {
+	} else if (sge->length == 0 && sge->mr->lkey) {
 		if (++sge->n >= IPATH_SEGSZ) {
 			if (++sge->m >= sge->mr->mapsz)
 				return;
@@ -989,7 +1009,7 @@ unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
 /*
  * Convert delay multiplier to IB rate
  */
-enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
+static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
 {
 	switch (mult) {
 	case 8:  return IB_RATE_2_5_GBPS;
@@ -1031,7 +1051,7 @@ static void sdma_complete(void *cookie, int status)
 	struct ipath_verbs_txreq *tx = cookie;
 	struct ipath_qp *qp = tx->qp;
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	unsigned int flags;
+	unsigned long flags;
 	enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
 		IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
 
@@ -1039,6 +1059,8 @@ static void sdma_complete(void *cookie, int status)
 		spin_lock_irqsave(&qp->s_lock, flags);
 		if (tx->wqe)
 			ipath_send_complete(qp, tx->wqe, ibs);
+		else if (qp->ibqp.qp_type == IB_QPT_RC)
+			ipath_rc_send_complete(qp, &tx->hdr.hdr);
 		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
 		     qp->s_last != qp->s_head) ||
 		    (qp->s_flags & IPATH_S_WAIT_DMA))
@@ -1049,19 +1071,29 @@ static void sdma_complete(void *cookie, int status)
 		spin_lock_irqsave(&qp->s_lock, flags);
 		ipath_send_complete(qp, tx->wqe, ibs);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
+	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		ipath_rc_send_complete(qp, &tx->hdr.hdr);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
 	}
 
+	if (tx->mr) {
+		atomic_dec(&tx->mr->refcount);
+		tx->mr = NULL;
+	}
 	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
 		kfree(tx->txreq.map_addr);
 	put_txreq(dev, tx);
 
 	if (atomic_dec_and_test(&qp->refcount))
 		wake_up(&qp->wait);
+
+	ipath_ib_piobufavail(dev);
 }
 
 static void decrement_dma_busy(struct ipath_qp *qp)
 {
-	unsigned int flags;
+	unsigned long flags;
 
 	if (atomic_dec_and_test(&qp->s_dma_busy)) {
 		spin_lock_irqsave(&qp->s_lock, flags);
@@ -1138,6 +1170,9 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp,
 	tx->qp = qp;
 	atomic_inc(&qp->refcount);
 	tx->wqe = qp->s_wqe;
+	tx->mr = qp->s_rdma_mr;
+	if (qp->s_rdma_mr)
+		qp->s_rdma_mr = NULL;
 	tx->txreq.callback = sdma_complete;
 	tx->txreq.callback_cookie = tx;
 	tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
@@ -1190,9 +1225,10 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp,
 	tx->txreq.map_addr = piobuf;
 	tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
 	tx->txreq.sg_count = 1;
+	memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
 
-	*piobuf++ = cpu_to_le32(plen);
-	*piobuf++ = cpu_to_le32(control);
+	*piobuf++ = (__force u32) cpu_to_le32(plen);
+	*piobuf++ = (__force u32) cpu_to_le32(control);
 	memcpy(piobuf, hdr, hdrwords << 2);
 	ipath_copy_from_sge(piobuf + hdrwords, ss, len);
 
@@ -1213,6 +1249,10 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp,
 	goto bail;
 
 err_tx:
+	if (tx->mr) {
+		atomic_dec(&tx->mr->refcount);
+		tx->mr = NULL;
+	}
 	if (atomic_dec_and_test(&qp->refcount))
 		wake_up(&qp->wait);
 	put_txreq(dev, tx);
@@ -1231,7 +1271,7 @@ static int ipath_verbs_send_pio(struct ipath_qp *qp,
 	unsigned flush_wc;
 	u32 control;
 	int ret;
-	unsigned int flags;
+	unsigned long flags;
 
 	piobuf = ipath_getpiobuf(dd, plen, NULL);
 	if (unlikely(piobuf == NULL)) {
@@ -1302,10 +1342,18 @@ static int ipath_verbs_send_pio(struct ipath_qp *qp,
 	}
 	copy_io(piobuf, ss, len, flush_wc);
 done:
+	if (qp->s_rdma_mr) {
+		atomic_dec(&qp->s_rdma_mr->refcount);
+		qp->s_rdma_mr = NULL;
+	}
 	if (qp->s_wqe) {
 		spin_lock_irqsave(&qp->s_lock, flags);
 		ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
+	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		ipath_rc_send_complete(qp, ibhdr);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
 	}
 	ret = 0;
 bail:
@@ -1505,9 +1553,11 @@ static int ipath_query_device(struct ib_device *ibdev,
 
 	props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
 		IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
-		IB_DEVICE_SYS_IMAGE_GUID;
+		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
 	props->page_size_cap = PAGE_SIZE;
-	props->vendor_id = dev->dd->ipath_vendorid;
+	props->vendor_id =
+		IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
 	props->vendor_part_id = dev->dd->ipath_deviceid;
 	props->hw_ver = dev->dd->ipath_pcirev;
 
@@ -1853,7 +1903,7 @@ unsigned ipath_get_npkeys(struct ipath_devdata *dd)
 }
 
 /**
- * ipath_get_pkey - return the indexed PKEY from the port 0 PKEY table
+ * ipath_get_pkey - return the indexed PKEY from the port PKEY table
  * @dd: the infinipath device
  * @index: the PKEY index
  */
@@ -1861,6 +1911,7 @@ unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
 {
 	unsigned ret;
 
+	/* always a kernel port, no locking needed */
 	if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
 		ret = 0;
 	else
@@ -2135,7 +2186,6 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
 	dev->phys_port_cnt = 1;
 	dev->num_comp_vectors = 1;
 	dev->dma_device = &dd->pcidev->dev;
-	dev->class_dev.dev = dev->dma_device;
 	dev->query_device = ipath_query_device;
 	dev->modify_device = ipath_modify_device;
 	dev->query_port = ipath_query_port;
@@ -2228,6 +2278,8 @@ void ipath_unregister_ib_device(struct ipath_ibdev *dev)
 		ipath_dev_err(dev->dd, "piowait list not empty!\n");
 	if (!list_empty(&dev->rnrwait))
 		ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
+	if (dev->dma_mr)
+		ipath_dev_err(dev->dd, "DMA MR not NULL!\n");
 	if (!ipath_mcast_tree_empty())
 		ipath_dev_err(dev->dd, "multicast table memory leak!\n");
 	/*
@@ -2274,10 +2326,12 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
 		container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
 	int i;
 	int len;
+	struct ipath_qp_table *qpt;
+	unsigned long flags;
 
 	len = sprintf(buf,
 		      "RC resends  %d\n"
-		      "RC no QACK  %d\n"
+		      "RC QACKs    %d\n"
 		      "RC ACKs     %d\n"
 		      "RC SEQ NAKs %d\n"
 		      "RC RDMA seq %d\n"
@@ -2285,6 +2339,7 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
 		      "RC OTH NAKs %d\n"
 		      "RC timeouts %d\n"
 		      "RC RDMA dup %d\n"
+		      "RC DComp    %d\n"
 		      "piobuf wait %d\n"
 		      "unaligned   %d\n"
 		      "PKT drops   %d\n"
@@ -2292,7 +2347,8 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
 		      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
 		      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
 		      dev->n_other_naks, dev->n_timeouts,
-		      dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
+		      dev->n_rdma_dup_busy, dev->n_rc_delayed_comp,
+		      dev->n_piowait, dev->n_unaligned,
 		      dev->n_pkt_drops, dev->n_wqe_errs);
 	for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
 		const struct ipath_opcode_stats *si = &dev->opstats[i];
@@ -2303,6 +2359,33 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
 			       (unsigned long long) si->n_packets,
 			       (unsigned long long) si->n_bytes);
 	}
+	qpt = &dev->qp_table;
+	spin_lock_irqsave(&qpt->lock, flags);
+	for (i = 0; i < qpt->max; i++) {
+		struct ipath_qp *qp;
+		for (qp = qpt->table[i]; qp != NULL; qp = qp->next) {
+			if (qp->s_last == qp->s_acked &&
+			    qp->s_acked == qp->s_cur &&
+			    qp->s_cur == qp->s_tail &&
+			    qp->s_tail == qp->s_head)
+				continue;
+			if (len + 128 >= PAGE_SIZE)
+				break;
+			len += sprintf(buf + len,
+			    "QP%u %x %u PSN %x %x %x %x %x (%u %u %u %u %u)\n",
+				qp->ibqp.qp_num,
+				qp->s_flags,
+				atomic_read(&qp->s_dma_busy),
+				qp->s_last_psn,
+				qp->s_psn,
+				qp->s_next_psn,
+				qp->s_sending_psn,
+				qp->s_sending_hpsn,
+				qp->s_last, qp->s_acked, qp->s_cur,
+				qp->s_tail, qp->s_head);
+		}
+	}
+	spin_unlock_irqrestore(&qpt->lock, flags);
 	return len;
 }
 
@@ -2325,7 +2408,7 @@ static int ipath_verbs_register_sysfs(struct ib_device *dev)
 
 	for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
 		if (class_device_create_file(&dev->class_dev,
-					     ipath_class_attributes[i])) {
+					       ipath_class_attributes[i])) {
 			ret = 1;
 			goto bail;
 		}
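
[Editor's note: among the ipath_verbs.c changes, the new bail_inval_free label in ipath_post_one_send() matters for correctness: ipath_lkey_ok() now takes a memory-region reference per validated SGE, so a failure partway through must drop exactly the j references already taken. A toy sketch of that partial unwind (post() and its ok[] table are hypothetical):

/* Sketch of the bail_inval_free partial unwind. */
#include <stdio.h>

struct region { int refcount; };

static int post(struct region *regs, const int *ok, unsigned int nsge)
{
	unsigned int j = 0;

	while (j < nsge) {
		if (!ok[j])
			goto bail_free;
		regs[j].refcount++;	/* reference taken with the SGE */
		j++;
	}
	return 0;

bail_free:
	while (j)			/* undo exactly what was taken */
		regs[--j].refcount--;
	return -1;
}

int main(void)
{
	struct region regs[3] = { {1}, {1}, {1} };
	int ok[3] = { 1, 1, 0 };

	post(regs, ok, 3);
	printf("%d %d %d\n", regs[0].refcount,
	       regs[1].refcount, regs[2].refcount);	/* 1 1 1 */
	return 0;
}
]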
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index 9b21282..e60ccbc 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -248,6 +248,7 @@ struct ipath_mregion {
 	int access_flags;
 	u32 max_segs;		/* number of ipath_segs in all the arrays */
 	u32 mapsz;		/* size of the map array */
+	atomic_t refcount;
 	struct ipath_segarray *map[0];	/* the segments */
 };
 
@@ -330,7 +331,6 @@ struct ipath_sge_state {
 	struct ipath_sge sge;   /* progress state for the current SGE */
 	u32 total_len;
 	u8 num_sge;
-	u8 static_rate;
 };
 
 /*
@@ -342,7 +342,7 @@ struct ipath_ack_entry {
 	u8 sent;
 	u32 psn;
 	union {
-		struct ipath_sge_state rdma_sge;
+		struct ipath_sge rdma_sge;
 		u64 atomic_data;
 	};
 };
@@ -371,6 +371,7 @@ struct ipath_qp {
 	struct ipath_mmap_info *ip;
 	struct ipath_sge_state *s_cur_sge;
 	struct ipath_verbs_txreq *s_tx;
+	struct ipath_mregion *s_rdma_mr;
 	struct ipath_sge_state s_sge;	/* current send request data */
 	struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
 	struct ipath_sge_state s_ack_rdma_sge;
@@ -385,6 +386,8 @@ struct ipath_qp {
 	u32 s_rdma_read_len;	/* total length of s_rdma_read_sge */
 	u32 s_next_psn;		/* PSN for next request */
 	u32 s_last_psn;		/* last response PSN processed */
+	u32 s_sending_psn;	/* lowest PSN that is being sent */
+	u32 s_sending_hpsn;	/* highest PSN that is being sent */
 	u32 s_psn;		/* current packet sequence number */
 	u32 s_ack_rdma_psn;	/* PSN for sending RDMA read responses */
 	u32 s_ack_psn;		/* PSN for acking sends and RDMA writes */
@@ -420,6 +423,7 @@ struct ipath_qp {
 	u8 s_dmult;
 	u8 s_draining;
 	u8 timeout;		/* Timeout for this QP */
+	u16 s_rdma_ack_cnt;
 	enum ib_mtu path_mtu;
 	u32 remote_qpn;
 	u32 qkey;		/* QKEY for this QP (for UD or RD) */
@@ -427,11 +431,13 @@ struct ipath_qp {
 	u32 s_head;		/* new entries added here */
 	u32 s_tail;		/* next entry to process */
 	u32 s_cur;		/* current work queue entry */
-	u32 s_last;		/* last un-ACK'ed entry */
+	u32 s_acked;		/* last un-ACK'ed entry */
+	u32 s_last;		/* last completed entry */
 	u32 s_ssn;		/* SSN of tail entry */
 	u32 s_lsn;		/* limit sequence number (credit) */
 	struct ipath_swqe *s_wq;	/* send work queue */
 	struct ipath_swqe *s_wqe;
+	struct ipath_sge *r_ud_sg_list;
 	struct ipath_rq r_rq;		/* receive work queue */
 	struct ipath_sge r_sg_list[0];	/* verified SGEs */
 };
@@ -457,7 +463,7 @@ struct ipath_qp {
  * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
  * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
  * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
- 		      next send completion entry not via send DMA.
+ *		      next send completion entry not via send DMA.
  */
 #define IPATH_S_SIGNAL_REQ_WR	0x01
 #define IPATH_S_FENCE_PENDING	0x02
@@ -538,6 +544,7 @@ struct ipath_ibdev {
 	struct list_head pending_mmaps;
 	spinlock_t mmap_offset_lock;
 	u32 mmap_offset;
+	struct ipath_mregion *dma_mr;
 	int ib_unit;		/* This is the device number */
 	u16 sm_lid;		/* in host order */
 	u8 sm_sl;
@@ -600,6 +607,7 @@ struct ipath_ibdev {
 	u32 n_rc_resends;
 	u32 n_rc_acks;
 	u32 n_rc_qacks;
+	u32 n_rc_delayed_comp;
 	u32 n_seq_naks;
 	u32 n_rdma_seq;
 	u32 n_rnr_naks;
@@ -647,6 +655,7 @@ struct ipath_verbs_txreq {
 	struct ipath_swqe       *wqe;
 	u32                      map_len;
 	u32                      len;
+	struct ipath_mregion	*mr;
 	struct ipath_sge_state  *ss;
 	struct ipath_pio_header  hdr;
 	struct ipath_sdma_txreq  txreq;
@@ -755,14 +764,13 @@ void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
 
 unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
 
-enum ib_rate ipath_mult_to_ib_rate(unsigned mult);
-
 int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
 		     u32 hdrwords, struct ipath_sge_state *ss, u32 len);
 
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length,
+		    int release);
 
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length, int release);
 
 void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
@@ -772,6 +780,8 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 
 void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
 
+void ipath_rc_send_complete(struct ipath_qp *qp, struct ipath_ib_header *hdr);
+
 void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
 
 int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
@@ -782,12 +792,12 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
 		     struct ipath_mregion *mr);
 
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
+int ipath_free_lkey(struct ipath_ibdev *dev, struct ipath_mregion *mr);
 
 int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
 		  struct ib_sge *sge, int acc);
 
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge *sge,
 		  u32 len, u64 vaddr, u32 rkey, int acc);
 
 int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
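
[Editor's note: the header changes split the old s_last into s_acked (last un-ACKed entry) and s_last (last completed entry), and add s_sending_psn/s_sending_hpsn so RC completions can be held back while packets are still owned by the send DMA engine; ipath_rc_send_complete() is the new hook that releases them. A sketch of the window test this implies, with a 24-bit wraparound-safe compare approximating ipath_cmp24() (an assumption, not shown in this patch):

/* Sketch of the delayed-completion window test; names hypothetical. */
#include <stdio.h>
#include <stdint.h>

static int cmp24(uint32_t a, uint32_t b)
{
	return (int)((a - b) << 8) >> 8;	/* signed 24-bit difference */
}

static int wqe_may_complete(uint32_t last_psn,
			    uint32_t sending_psn, uint32_t sending_hpsn)
{
	/* complete only if last_psn lies outside [sending_psn, sending_hpsn] */
	return cmp24(last_psn, sending_psn) < 0 ||
	       cmp24(last_psn, sending_hpsn) > 0;
}

int main(void)
{
	printf("%d\n", wqe_may_complete(5, 6, 9));	/* 1: already sent */
	printf("%d\n", wqe_may_complete(7, 6, 9));	/* 0: still in DMA */
	return 0;
}
]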
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
index 9e5abf9..d73e322 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
@@ -31,8 +31,7 @@
  * SOFTWARE.
  */
 
-#include <linux/list.h>
-#include <linux/rcupdate.h>
+#include <linux/rculist.h>
 
 #include "ipath_verbs.h"
 
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_pat.c b/drivers/infiniband/hw/ipath/ipath_wc_pat.c
new file mode 100644
index 0000000..31aa960
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_wc_pat.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/processor.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include "ipath_kernel.h"
+#include "ipath_wc_pat.h"
+
+/* ipath_wc_pat parameter:
+ *   0 is WC via MTRR
+ *   1 is WC via PAT
+ *   2 is WC via PAT and over-ride chip-set wc errata and PAT checks
+ *   If PAT initialization fails, code reverts back to MTRR
+ */
+unsigned ipath_wc_pat = 1; /* default (1) is to use PAT, not MTRR */
+module_param_named(wc_pat, ipath_wc_pat, uint, S_IRUGO);
+MODULE_PARM_DESC(wc_pat, "enable write-combining via PAT mechanism");
+
+static u32 old_pat_lo[NR_CPUS] = {0};
+static u32 old_pat_hi[NR_CPUS] = {0};
+static u32 new_pat_lo[NR_CPUS] = {0};
+static unsigned int wc_enabled;
+
+#define IPATH_PAT_MASK	(0xFFFFF8FF)	/* PAT1 mask for the PAT MSR */
+#define IPATH_PAT_EXP	(0x00000400)	/* expected PAT1 value (WT) */
+#define IPATH_PAT_MOD	(0x00000100)	/* PAT1 value to select WC */
+#define IPATH_WC_MASK	(~_PAGE_PCD)	/* selects PAT1 for this page */
+#define IPATH_WC_FLAGS	(_PAGE_PWT)	/* selects PAT1 for this page */
+
+#if defined(__i386__) || defined(__x86_64__)
+
+#define X86_MSR_PAT_OFFSET  0x277
+
+/*  Returns non-zero if we have a chipset write-combining problem */
+static int have_wc_errata(void)
+{
+	struct pci_dev *dev;
+	u8 rev;
+
+	if (ipath_wc_pat == 2)
+		return 0;
+
+	dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
+	if (dev != NULL) {
+		/*
+		 * ServerWorks LE chipsets < rev 6 have problems with
+		 * write-combining.
+		 */
+		if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
+		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
+			pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+			if (rev <= 5) {
+				ipath_dbg("Serverworks LE rev < 6 detected. "
+					  "Write-combining disabled\n");
+				pci_dev_put(dev);
+				return -ENOSYS;
+			}
+		}
+		/* Intel 450NX errata # 23. Non ascending cacheline evictions
+		   to write combining memory may resulting in data corruption
+		 */
+		if (dev->vendor == PCI_VENDOR_ID_INTEL &&
+		    dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
+			ipath_dbg("Intel 450NX MMC detected. "
+				  "Write-combining disabled.\n");
+			pci_dev_put(dev);
+			return -ENOSYS;
+		}
+		pci_dev_put(dev);
+	}
+	return 0;
+}
+
+static void rd_old_pat(void *err)
+{
+	*(int *)err |= rdmsr_safe(X86_MSR_PAT_OFFSET,
+				  &old_pat_lo[smp_processor_id()],
+				  &old_pat_hi[smp_processor_id()]);
+}
+
+static void wr_new_pat(void *err)
+{
+	new_pat_lo[smp_processor_id()] =
+		(old_pat_lo[smp_processor_id()] & IPATH_PAT_MASK) |
+		IPATH_PAT_MOD;
+
+	*(int *)err |= wrmsr_safe(X86_MSR_PAT_OFFSET,
+				  new_pat_lo[smp_processor_id()],
+				  old_pat_hi[smp_processor_id()]);
+}
+
+static void wr_old_pat(void *err)
+{
+	u32 cur_pat_lo, cur_pat_hi;
+
+	*(int *)err |= rdmsr_safe(X86_MSR_PAT_OFFSET,
+				  &cur_pat_lo, &cur_pat_hi);
+
+	if (*(int *) err)
+		goto done;
+
+	/* only restore old PAT if it currently has the expected values */
+	if (cur_pat_lo != new_pat_lo[smp_processor_id()] ||
+	    cur_pat_hi != old_pat_hi[smp_processor_id()])
+		goto done;
+
+	*(int *)err |= wrmsr_safe(X86_MSR_PAT_OFFSET,
+				  old_pat_lo[smp_processor_id()],
+				  old_pat_hi[smp_processor_id()]);
+done:	;
+}
+
+static int validate_old_pat(void)
+{
+	int ret = 0;
+	int cpu = smp_processor_id();
+	int ncpus = num_online_cpus();
+	int i;
+	int onetime = 1;
+	u32 my_pat1 = old_pat_lo[cpu] & ~IPATH_PAT_MASK;
+
+	if (ipath_wc_pat == 2)
+		goto done;
+
+	for (i = 0; i < ncpus; i++) {
+		u32 this_pat1 = old_pat_lo[i] & ~IPATH_PAT_MASK;
+		if (this_pat1 != my_pat1) {
+			ipath_dbg("Inconsistent PAT1 settings across CPUs\n");
+			ret = -ENOSYS;
+			goto done;
+		} else if (this_pat1 == IPATH_PAT_MOD) {
+			if (onetime) {
+				ipath_dbg("PAT1 has already been "
+					  "modified for WC (warning)\n");
+				onetime = 0;
+			}
+		} else if (this_pat1 != IPATH_PAT_EXP) {
+			ipath_dbg("PAT1 not in expected WT state\n");
+			ret = -ENOSYS;
+			goto done;
+		}
+	}
+done:
+	return ret;
+}
+
+static int read_and_modify_pat(void)
+{
+	int ret = 0;
+
+	preempt_disable();
+	rd_old_pat(&ret);
+	if (!ret)
+		smp_call_function(rd_old_pat, &ret, 1, 1);
+	if (ret)
+		goto out;
+
+	if (validate_old_pat())
+		goto out;
+
+	wr_new_pat(&ret);
+	if (ret)
+		goto out;
+
+	smp_call_function(wr_new_pat, &ret, 1, 1);
+	BUG_ON(ret); /* have inconsistent PAT state */
+out:
+	preempt_enable();
+	return ret;
+}
+
+static int restore_pat(void)
+{
+	int ret = 0;
+
+	preempt_disable();
+	wr_old_pat(&ret);
+	if (!ret) {
+		smp_call_function(wr_old_pat, &ret, 1, 1);
+		BUG_ON(ret); /* have inconsistent PAT state */
+	}
+
+	preempt_enable();
+	return ret;
+}
+
+int ipath_enable_wc_pat(void)
+{
+	struct cpuinfo_x86 *c = &(cpu_data)[0];
+	int ret;
+
+	if (wc_enabled)
+		return 0;
+
+	if (!cpu_has(c, X86_FEATURE_MSR) ||
+	    !cpu_has(c, X86_FEATURE_PAT)) {
+		ipath_dbg("WC PAT not available on this processor\n");
+		return -ENOSYS;
+	}
+
+	if (have_wc_errata())
+		return -ENOSYS;
+
+	ret = read_and_modify_pat();
+	if (!ret)
+		wc_enabled = 1;
+	else
+		ipath_dbg("Failed to enable WC PAT\n");
+	return ret ? -EIO  : 0;
+}
+
+void ipath_disable_wc_pat(void)
+{
+	if (wc_enabled) {
+		if (!restore_pat())
+			wc_enabled = 0;
+		else
+			ipath_dbg("Failed to disable WC PAT\n");
+	}
+}
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{
+	return wc_enabled ?
+		__pgprot(pgprot_val(_prot) | IPATH_WC_FLAGS) :
+		pgprot_noncached(_prot);
+}
+
+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+{
+	return __ioremap(phys_addr, size, IPATH_WC_FLAGS);
+}
+
+int ipath_wc_pat_enabled(void)
+{
+	return wc_enabled;
+}
+
+#else	/* !(defined(__i386__) || defined(__x86_64__)) */
+
+int ipath_enable_wc_pat(void) { return 0; }
+void ipath_disable_wc_pat(void) { }
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{
+	return pgprot_noncached(_prot);
+}
+
+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+{
+	return ioremap_nocache(phys_addr, size);
+}
+
+int ipath_wc_pat_enabled(void)
+{
+	return 0;
+}
+
+#endif
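
[Editor's note: ipath_wc_pat.c gets write-combining without MTRRs by repurposing PAT entry 1: bits 8-10 of the low PAT MSR word normally encode write-through (0x04), and the driver rewrites them to write-combining (0x01) on every CPU after checking the entry still holds its expected value. Pages then select that entry with _PAGE_PWT set and _PAGE_PCD clear. A pure-arithmetic sketch of the MSR update (0x00070406 is a typical reset value of the low word):

/* Sketch of the PAT1 rewrite; no MSR access, arithmetic only. */
#include <stdio.h>
#include <stdint.h>

#define PAT_MASK  0xFFFFF8FFu	/* clears the PAT1 field (bits 8-10) */
#define PAT1_WT   0x00000400u	/* expected: write-through */
#define PAT1_WC   0x00000100u	/* desired:  write-combining */

int main(void)
{
	uint32_t pat_lo = 0x00070406;

	if ((pat_lo & ~PAT_MASK) != PAT1_WT) {
		puts("PAT1 not in expected WT state, leaving it alone");
		return 1;
	}
	pat_lo = (pat_lo & PAT_MASK) | PAT1_WC;
	printf("new PAT low word: 0x%08x\n", pat_lo);	/* 0x00070106 */
	return 0;
}
]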
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_pat.h b/drivers/infiniband/hw/ipath/ipath_wc_pat.h
new file mode 100644
index 0000000..1b17661
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_wc_pat.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_WC_PAT_H
+#define IPATH_WC_PAT_H
+
+#include <asm/pgtable.h>
+
+extern unsigned ipath_wc_pat;
+
+int ipath_enable_wc_pat(void);
+void ipath_disable_wc_pat(void);
+int ipath_wc_pat_enabled(void);
+pgprot_t pgprot_wc(pgprot_t _prot);
+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size);
+
+#endif