kernel-2.6.18-128.1.10.el5.src.rpm

From: AMEET M. PARANJAPE <aparanja@redhat.com>
Date: Wed, 5 Nov 2008 22:14:51 -0500
Subject: [openib] ib_core: use weak ordering for user memory
Message-id: 20081106031416.3297.68962.sendpatchset@squad5-lp1.lab.bos.redhat.com
O-Subject: [PATCH RHEL5.3 BZ469902 2/3] ib_core: use weak ordering for user memory
Bugzilla: 469902
RH-Acked-by: David Howells <dhowells@redhat.com>
RH-Acked-by: Doug Ledford <dledford@redhat.com>

RHBZ#:
======
https://bugzilla.redhat.com/show_bug.cgi?id=469902

Description:
===========
This patch uses the new interface for InfiniBand umem allocations
to request relaxed ordering, as in the current OFED release. This change only
affects the Cell platform. On non-powerpc architectures, the changed
ib_umem_get and ib_umem_release functions, as well as the new
ib_umem_get_dmasync and ib_umem_release_dmasync functions, should get
compiled into the same object code as the old ib_umem_get and ib_umem_release
functions before the patch. The addition of the allow_weak_ordering module
parameter does change the module object code, but the setting of that parameter
has no effect on the code path.
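
The ordering policy added by the umem.c hunks below can be restated in a short
sketch (the helper name ib_umem_set_dma_attrs is illustrative only; the patch
open-codes this logic in __ib_umem_release and __ib_umem_get):

#include <linux/dma-attrs.h>

static int allow_weak_ordering;	/* module parameter, 0444, default off */

static void ib_umem_set_dma_attrs(struct dma_attrs *attrs, int dmasync)
{
	if (dmasync)
		/* callers that need data consistency force a write barrier */
		dma_set_attr(DMA_ATTR_WRITE_BARRIER, attrs);
	else if (allow_weak_ordering)
		/* otherwise relaxed ordering may be requested module-wide */
		dma_set_attr(DMA_ATTR_WEAK_ORDERING, attrs);
	/* with neither attribute set, mappings behave as before the patch */
}

With the parameter left at its default of 0 there is no behaviour change
anywhere; on Cell, loading the ib_core module with allow_weak_ordering=1
enables the relaxed mappings for data buffers.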

The mlx4/cq.c and mthca_provider.c changes make the two InfiniBand device
drivers use strong ordering (previously the default for all mappings) on
selected mappings where this is required for data consistency.
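
A minimal consumer-side sketch, assuming a hypothetical driver (only the
ib_umem_* calls and IB_ACCESS_* flags come from the patch and existing
headers; the function and variable names are made up for illustration):
control structures the driver polls, such as a CQ ring, keep strong ordering
through the _dmasync variants, while ordinary data buffers stay on the
unchanged path and may be mapped weakly ordered when allow_weak_ordering=1.

#include <linux/err.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

/* Hypothetical helper: pin a CQ ring with strong ordering and a data
 * buffer with the default (possibly weak) ordering. */
static int example_reg_umems(struct ib_ucontext *ctx,
			     unsigned long cq_addr, size_t cq_size,
			     unsigned long buf_addr, size_t buf_size,
			     struct ib_umem **cq_umem, struct ib_umem **buf_umem)
{
	/* CQ entries are read by the driver; writes must arrive in order */
	*cq_umem = ib_umem_get_dmasync(ctx, cq_addr, cq_size,
				       IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(*cq_umem))
		return PTR_ERR(*cq_umem);

	/* bulk data can tolerate weak ordering when the module allows it */
	*buf_umem = ib_umem_get(ctx, buf_addr, buf_size,
				IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE);
	if (IS_ERR(*buf_umem)) {
		ib_umem_release_dmasync(*cq_umem);
		return PTR_ERR(*buf_umem);
	}
	return 0;
}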

RHEL Version Found:
================
RHEL 5.3 Beta

kABI Status:
============
No symbols were harmed.

Brew:
=====
Built on all platforms.
http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1560071

Upstream Status:
================
This patch is a combination of upstream patches. One is upstream with GIT ID:

cb9fbc5c37b69ac584e61d449cfd590f5ae1f90d

while the others are already merged into the OFED-1.4 version of the kernel
and are expected to move into the 2.6.29 kernel, but are not part of 2.6.28-rc.

Test Status:
============
A performance decrease was discovered on the RHEL 5.3 Beta, especially when
executing the IMB benchmark (formerly known as Pallas) over InfiniBand
connections. The performance decrease is up to 50%.

After applying these patches, the performance degradation is no longer present.

===============================================================
Ameet Paranjape 978-392-3903 ext 23903
IBM on-site partner

Proposed Patch:
===============

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 1a64dfc..dd4b7ad 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -38,9 +38,14 @@
 #include <linux/dma-mapping.h>
 #include <linux/sched.h>
 #include <linux/hugetlb.h>
+#include <linux/dma-attrs.h>
 
 #include "uverbs.h"
 
+static int allow_weak_ordering;
+module_param(allow_weak_ordering, bool, 0444);
+MODULE_PARM_DESC(allow_weak_ordering,  "Allow weak ordering for data registered memory");
+
 #define IB_UMEM_MAX_PAGE_CHUNK						\
 	((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /	\
 	 ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -	\
@@ -96,14 +101,21 @@ static void dma_unmap_sg_ia64(struct ib_device *ibdev,
 
 #endif
 
-static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty, int dmasync)
 {
 	struct ib_umem_chunk *chunk, *tmp;
 	int i;
 
+	DEFINE_DMA_ATTRS(attrs);
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+	else if (allow_weak_ordering)
+		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
+
 	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
-		ib_dma_unmap_sg(dev, chunk->page_list,
-				chunk->nents, DMA_BIDIRECTIONAL);
+		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
+				      chunk->nents, DMA_BIDIRECTIONAL, &attrs);
 		for (i = 0; i < chunk->nents; ++i) {
 			struct page *page = sg_page(&chunk->page_list[i]);
 
@@ -122,9 +134,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
  */
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
-			    size_t size, int access)
+static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access, int dmasync)
 {
 	struct ib_umem *umem;
 	struct page **page_list;
@@ -137,6 +150,13 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	int ret;
 	int off;
 	int i;
+	DEFINE_DMA_ATTRS(attrs);
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+	else if (allow_weak_ordering)
+		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
+
 
 	if (!can_do_mlock())
 		return ERR_PTR(-EPERM);
@@ -224,10 +244,11 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 				sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
 			}
 
-			chunk->nmap = ib_dma_map_sg(context->device,
-						    &chunk->page_list[0],
-						    chunk->nents,
-						    DMA_BIDIRECTIONAL);
+			chunk->nmap = ib_dma_map_sg_attrs(context->device,
+							  &chunk->page_list[0],
+							  chunk->nents,
+							  DMA_BIDIRECTIONAL,
+							  &attrs);
 			if (chunk->nmap <= 0) {
 				for (i = 0; i < chunk->nents; ++i)
 					put_page(sg_page(&chunk->page_list[i]));
@@ -247,7 +268,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 out:
 	if (ret < 0) {
-		__ib_umem_release(context->device, umem, 0);
+		__ib_umem_release(context->device, umem, 0, dmasync);
 		kfree(umem);
 	} else
 		current->mm->locked_vm = locked;
@@ -259,8 +280,21 @@ out:
 
 	return ret < 0 ? ERR_PTR(ret) : umem;
 }
+
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access)
+{
+	return __ib_umem_get(context, addr, size, access, 0);
+}
 EXPORT_SYMBOL(ib_umem_get);
 
+struct ib_umem *ib_umem_get_dmasync(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access)
+{
+	return __ib_umem_get(context, addr, size, access, 1);
+}
+EXPORT_SYMBOL(ib_umem_get_dmasync);
+
 static void ib_umem_account(struct work_struct *work)
 {
 	struct ib_umem *umem = container_of(work, struct ib_umem, work);
@@ -276,13 +310,13 @@ static void ib_umem_account(struct work_struct *work)
  * ib_umem_release - release memory pinned with ib_umem_get
  * @umem: umem struct to release
  */
-void ib_umem_release(struct ib_umem *umem)
+static void ib_umem_release_attr(struct ib_umem *umem, int dmasync)
 {
 	struct ib_ucontext *context = umem->context;
 	struct mm_struct *mm;
 	unsigned long diff;
 
-	__ib_umem_release(umem->context->device, umem, 1);
+	__ib_umem_release(umem->context->device, umem, 1, dmasync);
 
 	mm = get_task_mm(current);
 	if (!mm) {
@@ -317,8 +351,19 @@ void ib_umem_release(struct ib_umem *umem)
 	mmput(mm);
 	kfree(umem);
 }
+
+void ib_umem_release(struct ib_umem *umem)
+{
+	return ib_umem_release_attr(umem, 0);
+}
 EXPORT_SYMBOL(ib_umem_release);
 
+void ib_umem_release_dmasync(struct ib_umem *umem)
+{
+	return ib_umem_release_attr(umem, 1);
+}
+EXPORT_SYMBOL(ib_umem_release_dmasync);
+
 int ib_umem_page_count(struct ib_umem *umem)
 {
 	struct ib_umem_chunk *chunk;
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index b730afa..b816fbc 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -142,7 +142,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
 			goto err_cq;
 		}
 
-		cq->umem = ib_umem_get(context, ucmd.buf_addr, buf_size,
+		cq->umem = ib_umem_get_dmasync(context, ucmd.buf_addr, buf_size,
 				       IB_ACCESS_LOCAL_WRITE);
 		if (IS_ERR(cq->umem)) {
 			err = PTR_ERR(cq->umem);
@@ -216,7 +216,7 @@ err_mtt:
 
 err_buf:
 	if (context)
-		ib_umem_release(cq->umem);
+		ib_umem_release_dmasync(cq->umem);
 	else
 		mlx4_buf_free(dev->dev, entries * sizeof (struct mlx4_cqe),
 			      &cq->buf.buf);
@@ -241,7 +241,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq)
 
 	if (cq->uobject) {
 		mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
-		ib_umem_release(mcq->umem);
+		ib_umem_release_dmasync(mcq->umem);
 	} else {
 		mlx4_buf_free(dev->dev, (cq->cqe + 1) * sizeof (struct mlx4_cqe),
 			      &mcq->buf.buf);
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 9e491df..f445918 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1013,7 +1013,8 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc);
+	mr->umem = ib_umem_get_dmasync(pd->uobject->context, start,
+					 length, acc);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		goto err;
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 2229842..7f34a68 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -36,6 +36,7 @@
 #include <linux/list.h>
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
+#include <linux/dma-attrs.h>
 
 struct ib_ucontext;
 
@@ -63,7 +64,10 @@ struct ib_umem_chunk {
 
 struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 			    size_t size, int access);
+struct ib_umem *ib_umem_get_dmasync(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access);
 void ib_umem_release(struct ib_umem *umem);
+void ib_umem_release_dmasync(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
@@ -75,7 +79,13 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
 					  int access) {
 	return ERR_PTR(-EINVAL);
 }
+static inline struct ib_umem *ib_umem_get_dmasync(struct ib_ucontext *context,
+					  unsigned long addr, size_t size,
+					  int access) {
+	return ERR_PTR(-EINVAL);
+}
 static inline void ib_umem_release(struct ib_umem *umem) { }
+static inline void ib_umem_release_dmasync(struct ib_umem *umem) { }
 static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; }
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 81dacbb..c95e041 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1540,6 +1540,24 @@ static inline void ib_dma_unmap_single(struct ib_device *dev,
 		dma_unmap_single(dev->dma_device, addr, size, direction);
 }
 
+static inline u64 ib_dma_map_single_attrs(struct ib_device *dev,
+					  void *cpu_addr, size_t size,
+					  enum dma_data_direction direction,
+					  struct dma_attrs *attrs)
+{
+	return dma_map_single_attrs(dev->dma_device, cpu_addr, size,
+				    direction, attrs);
+}
+
+static inline void ib_dma_unmap_single_attrs(struct ib_device *dev,
+					     u64 addr, size_t size,
+					     enum dma_data_direction direction,
+					     struct dma_attrs *attrs)
+{
+	return dma_unmap_single_attrs(dev->dma_device, addr, size,
+				      direction, attrs);
+}
+
 /**
  * ib_dma_map_page - Map a physical page to DMA address
  * @dev: The device for which the dma_addr is to be created
@@ -1609,6 +1627,21 @@ static inline void ib_dma_unmap_sg(struct ib_device *dev,
 		dma_unmap_sg(dev->dma_device, sg, nents, direction);
 }
 
+static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
+				      struct scatterlist *sg, int nents,
+				      enum dma_data_direction direction,
+				      struct dma_attrs *attrs)
+{
+	return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
+
+static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
+					 struct scatterlist *sg, int nents,
+					 enum dma_data_direction direction,
+					 struct dma_attrs *attrs)
+{
+	dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
 /**
  * ib_sg_dma_address - Return the DMA address from a scatter/gather entry
  * @dev: The device for which the DMA addresses were created