Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 2697

kernel-2.6.18-194.11.1.el5.src.rpm

From: AMEET M. PARANJAPE <aparanja@redhat.com>
Date: Tue, 2 Dec 2008 13:39:40 -0600
Subject: [openib] ehca: fix generating flush work completions
Message-id: 49358EFC.2050203@REDHAT.COM
O-Subject: Re: [PATCH RHEL5.3 472812] fix problem with generate flush work completions
Bugzilla: 472812
RH-Acked-by: David Howells <dhowells@redhat.com>
RH-Acked-by: Doug Ledford <dledford@redhat.com>

RHBZ#:
======
https://bugzilla.redhat.com/show_bug.cgi?id=472812

Description:
===========
Note: The fix in RH BZ 462619 introduced this problem.

After shutting the ib interface down, ipoib waits for all outstanding
work requests to complete. The flush CQE implementation of ehca has a problem
with the calculation of the outstanding work requests. This problem occurs
because IPoIB does not request a completion for every work request.

The problem is seen with the ehca device in both connected (CM) and
unreliable datagram (UD) modes.

RHEL Version Found:
================
RHEL 5.3 snapshot2

kABI Status:
============
No symbols were harmed.

Brew:
=====
Built on all platforms.
http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1591681

Upstream Status:
================
Not available at this time.  Will update the thread when it is available.

Test Status:
============
The system must have ehca (i.e. a gal2, on an IBM p6 system).
Configure an ib interface so it comes up when the system is booted.
Run:
$ ifdown ib0
$ ifup ib1

Shortly afterwards, you start to receive messages from the ehca driver about
duplicate CQEs; these are followed by assertion errors and a crash.

With this patch applied, the issue can no longer be reproduced.

--
Ameet M. Paranjape
aparanja@redhat.com
IBM/Red Hat POWER Liaison
IRC name: aparanja

diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index 411c2c5..e1c25eb 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -163,7 +163,8 @@ struct ehca_mod_qp_parm {
 /* struct for tracking if cqes have been reported to the application */
 struct ehca_qmap_entry {
 	u16 app_wr_id;
-	u16 reported;
+	u8 reported;
+	u8 cqe_req;
 };
 
 struct ehca_queue_map {
@@ -171,8 +172,16 @@ struct ehca_queue_map {
 	unsigned int entries;        /* number of qmap entries */
 	unsigned int tail;           /* tail pointer */
 	unsigned int left_to_poll;   /* CQEs to poll before gen. flush CQEs */
+	unsigned int next_wqe_idx;   /* Idx to first wqe to be flushed */
 };
 
+/* function to calculate the next index for the qmap */
+static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
+{
+	unsigned int temp = cur_index + 1;
+	return (temp == limit) ? 0 : temp;
+}
+
 struct ehca_qp {
 	union {
 		struct ib_qp ib_qp;
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index f3ca3f6..0b3caec 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -433,9 +433,13 @@ static void reset_queue_map(struct ehca_queue_map *qmap)
 {
 	int i;
 
-	qmap->tail = 0;
-	for (i = 0; i < qmap->entries; i++)
+	qmap->tail = qmap->entries - 1;
+	qmap->left_to_poll = 0;
+	qmap->next_wqe_idx = 0;
+	for (i = 0; i < qmap->entries; i++) {
 		qmap->map[i].reported = 1;
+		qmap->map[i].cqe_req = 0;
+	}
 }
 
 /*
@@ -1108,6 +1112,7 @@ static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
 	void *wqe_v;
 	u64 q_ofs;
 	u32 wqe_idx;
+	unsigned int tail_idx;
 
 	/* convert real to abs address */
 	wqe_p = wqe_p & (~(1UL << 63));
@@ -1120,12 +1125,17 @@ static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
 		return -EFAULT;
 	}
 
+	tail_idx = next_index(qmap->tail, qmap->entries);
 	wqe_idx = q_ofs / ipz_queue->qe_size;
-	if (wqe_idx < qmap->tail)
-		qmap->left_to_poll = (qmap->entries - qmap->tail) + wqe_idx;
-	else
-		qmap->left_to_poll = wqe_idx - qmap->tail;
 
+	/* check all processed wqes, whether a cqe is requested or not */
+	while (tail_idx != wqe_idx) {
+		if (qmap->map[tail_idx].cqe_req)
+			qmap->left_to_poll++;
+		tail_idx = next_index(tail_idx, qmap->entries);
+	}
+	/* save index in queue, where we have to start flushing */
+	qmap->next_wqe_idx = wqe_idx;
 	return 0;
 }
 
@@ -1174,10 +1184,14 @@ static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
 	} else {
 		spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
 		my_qp->sq_map.left_to_poll = 0;
+		my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+							my_qp->sq_map.entries);
 		spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
 
 		spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
 		my_qp->rq_map.left_to_poll = 0;
+		my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+							my_qp->rq_map.entries);
 		spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
 	}
 
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index bb43402..6be07ce 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -179,6 +179,7 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
 	qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
 	qmap_entry->reported = 0;
+	qmap_entry->cqe_req = 0;
 
 	switch (send_wr->opcode) {
 	case IB_WR_SEND:
@@ -203,8 +204,10 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
 	if ((send_wr->send_flags & IB_SEND_SIGNALED ||
 	    qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
-	    && !hidden)
+	    && !hidden) {
 		wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
+		qmap_entry->cqe_req = 1;
+	}
 
 	if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
 	    send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
@@ -569,6 +572,7 @@ static int internal_post_recv(struct ehca_qp *my_qp,
 		qmap_entry = &my_qp->rq_map.map[rq_map_idx];
 		qmap_entry->app_wr_id = get_app_wr_id(cur_recv_wr->wr_id);
 		qmap_entry->reported = 0;
+		qmap_entry->cqe_req = 1;
 
 		wqe_cnt++;
 	} /* eof for cur_recv_wr */
@@ -700,27 +704,34 @@ poll_cq_one_read_cqe:
 
 	is_error = cqe->status & WC_STATUS_ERROR_BIT;
 
+	qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
+	if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
+		/* We got a send completion. */
+		qmap = &my_qp->sq_map;
+	else
+		/* We got a receive completion. */
+		qmap = &my_qp->rq_map;
+
+	/* advance the tail pointer */
+	qmap->tail = qmap_tail_idx;
+
 	if (is_error) {
 		/*
 		 * set left_to_poll to 0 because in error state, we will not
 		 * get any additional CQEs
 		 */
-		ehca_add_to_err_list(my_qp, 1);
+		my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+							my_qp->sq_map.entries);
 		my_qp->sq_map.left_to_poll = 0;
+		ehca_add_to_err_list(my_qp, 1);
 
+		my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+							my_qp->rq_map.entries);
+		my_qp->rq_map.left_to_poll = 0;
 		if (HAS_RQ(my_qp))
 			ehca_add_to_err_list(my_qp, 0);
-		my_qp->rq_map.left_to_poll = 0;
 	}
 
-	qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
-	if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
-		/* We got a send completion. */
-		qmap = &my_qp->sq_map;
-	else
-		/* We got a receive completion. */
-		qmap = &my_qp->rq_map;
-
 	qmap_entry = &qmap->map[qmap_tail_idx];
 	if (qmap_entry->reported) {
 		ehca_warn(cq->device, "Double cqe on qp_num=%#x",
@@ -732,10 +743,6 @@ poll_cq_one_read_cqe:
 	wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
 	qmap_entry->reported = 1;
 
-	/* this is a proper completion, we need to advance the tail pointer */
-	if (++qmap->tail == qmap->entries)
-		qmap->tail = 0;
-
 	/* if left_to_poll is decremented to 0, add the QP to the error list */
 	if (qmap->left_to_poll > 0) {
 		qmap->left_to_poll--;
@@ -799,13 +806,14 @@ static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
 	else
 		qmap = &my_qp->rq_map;
 
-	qmap_entry = &qmap->map[qmap->tail];
+	qmap_entry = &qmap->map[qmap->next_wqe_idx];
 
 	while ((nr < num_entries) && (qmap_entry->reported == 0)) {
 		/* generate flush CQE */
+
 		memset(wc, 0, sizeof(*wc));
 
-		offset = qmap->tail * ipz_queue->qe_size;
+		offset = qmap->next_wqe_idx * ipz_queue->qe_size;
 		wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
 		if (!wqe) {
 			ehca_err(cq->device, "Invalid wqe offset=%#lx on "
@@ -844,11 +852,11 @@ static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
 
 		wc->qp = &my_qp->ib_qp;
 
-		/* mark as reported and advance tail pointer */
+		/* mark as reported and advance next_wqe pointer */
 		qmap_entry->reported = 1;
-		if (++qmap->tail == qmap->entries)
-			qmap->tail = 0;
-		qmap_entry = &qmap->map[qmap->tail];
+		qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
+						qmap->entries);
+		qmap_entry = &qmap->map[qmap->next_wqe_idx];
 
 		wc++; nr++;
 	}