Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 2687

kernel-2.6.18-194.11.1.el5.src.rpm

From: Doug Ledford <dledford@redhat.com>
Date: Mon, 24 Mar 2008 14:24:27 -0400
Subject: [openib] add improved error handling in srp driver
Message-id: 1206383072-7299-7-git-send-email-dledford@redhat.com
O-Subject: [Patch RHEL5 06/10] Infiniband: Add improved error handling in srp driver
Bugzilla: 253023

In the event that a connection goes stale, we now attempt to restart
the connection.  We also now use the kernel provided shost_printk when
printing kernel message.

Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index a36e978..680cff5 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -201,6 +201,22 @@ out:
 	return ret;
 }
 
+static int srp_new_cm_id(struct srp_target_port *target)
+{
+	struct ib_cm_id *new_cm_id;
+
+	new_cm_id = ib_create_cm_id(target->srp_host->dev->dev,
+				    srp_cm_handler, target);
+	if (IS_ERR(new_cm_id))
+		return PTR_ERR(new_cm_id);
+
+	if (target->cm_id)
+		ib_destroy_cm_id(target->cm_id);
+	target->cm_id = new_cm_id;
+
+	return 0;
+}
+
 static int srp_create_target_ib(struct srp_target_port *target)
 {
 	struct ib_qp_init_attr *init_attr;
@@ -269,7 +285,8 @@ static void srp_path_rec_completion(int status,
 
 	target->status = status;
 	if (status)
-		printk(KERN_ERR PFX "Got failed path rec status %d\n", status);
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Got failed path rec status %d\n", status);
 	else
 		target->path = *pathrec;
 	complete(&target->done);
@@ -300,7 +317,8 @@ static int srp_lookup_path(struct srp_target_port *target)
 	wait_for_completion(&target->done);
 
 	if (target->status < 0)
-		printk(KERN_WARNING PFX "Path record query failed\n");
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Path record query failed\n");
 
 	return target->status;
 }
@@ -376,9 +394,10 @@ static int srp_send_req(struct srp_target_port *target)
 	 * the second 8 bytes to the local node GUID.
 	 */
 	if (srp_target_is_topspin(target)) {
-		printk(KERN_DEBUG PFX "Topspin/Cisco initiator port ID workaround "
-		       "activated for target GUID %016llx\n",
-		       (unsigned long long) be64_to_cpu(target->ioc_guid));
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Topspin/Cisco initiator port ID workaround "
+			     "activated for target GUID %016llx\n",
+			     (unsigned long long) be64_to_cpu(target->ioc_guid));
 		memset(req->priv.initiator_port_id, 0, 8);
 		memcpy(req->priv.initiator_port_id + 8,
 		       &target->srp_host->dev->dev->node_guid, 8);
@@ -397,10 +416,10 @@ static void srp_disconnect_target(struct srp_target_port *target)
 
 	init_completion(&target->done);
 	if (ib_send_cm_dreq(target->cm_id, NULL, 0)) {
-		printk(KERN_DEBUG PFX "Sending CM DREQ failed\n");
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Sending CM DREQ failed\n");
 		return;
 	}
-	wait_for_completion(&target->done);
 }
 
 static void srp_remove_work(struct work_struct *work)
@@ -428,6 +447,7 @@ static void srp_remove_work(struct work_struct *work)
 
 static int srp_connect_target(struct srp_target_port *target)
 {
+	int retries = 3;
 	int ret;
 
 	ret = srp_lookup_path(target);
@@ -460,6 +480,21 @@ static int srp_connect_target(struct srp_target_port *target)
 		case SRP_DLID_REDIRECT:
 			break;
 
+		case SRP_STALE_CONN:
+			/* Our current CM id was stale, and is now in timewait.
+			 * Try to reconnect with a new one.
+			 */
+			if (!retries-- || srp_new_cm_id(target)) {
+				shost_printk(KERN_ERR, target->scsi_host, PFX
+					     "giving up on stale connection\n");
+				target->status = -ECONNRESET;
+				return target->status;
+			}
+
+			shost_printk(KERN_ERR, target->scsi_host, PFX
+				     "retrying stale connection\n");
+			break;
+
 		default:
 			return target->status;
 		}
@@ -514,7 +549,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re
 
 static int srp_reconnect_target(struct srp_target_port *target)
 {
-	struct ib_cm_id *new_cm_id;
 	struct srp_request *req, *tmp;
 	int ret;
 	struct ib_cq *old_cq;
@@ -533,14 +567,9 @@ static int srp_reconnect_target(struct srp_target_port *target)
 	 * Now get a new local CM ID so that we avoid confusing the
 	 * target in case things are really fouled up.
 	 */
-	new_cm_id = ib_create_cm_id(target->srp_host->dev->dev,
-				    srp_cm_handler, target);
-	if (IS_ERR(new_cm_id)) {
-		ret = PTR_ERR(new_cm_id);
+	ret = srp_new_cm_id(target);
+	if (ret)
 		goto err;
-	}
-	ib_destroy_cm_id(target->cm_id);
-	target->cm_id = new_cm_id;
 
 	old_qp = target->qp;
 	old_cq = target->cq;
@@ -579,7 +608,8 @@ static int srp_reconnect_target(struct srp_target_port *target)
 	return ret;
 
 err:
-	printk(KERN_ERR PFX "reconnect failed (%d), removing target port.\n", ret);
+	shost_printk(KERN_ERR, target->scsi_host,
+		     PFX "reconnect failed (%d), removing target port.\n", ret);
 
 	/*
 	 * We couldn't reconnect, so kill our target port off.
@@ -694,8 +724,9 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,
 
 	if (scmnd->sc_data_direction != DMA_FROM_DEVICE &&
 	    scmnd->sc_data_direction != DMA_TO_DEVICE) {
-		printk(KERN_WARNING PFX "Unhandled data direction %d\n",
-		       scmnd->sc_data_direction);
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled data direction %d\n",
+			     scmnd->sc_data_direction);
 		return -EINVAL;
 	}
 
@@ -807,8 +838,9 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp)
 	} else {
 		scmnd = req->scmnd;
 		if (!scmnd)
-			printk(KERN_ERR "Null scmnd for RSP w/tag %016llx\n",
-			       (unsigned long long) rsp->tag);
+			shost_printk(KERN_ERR, target->scsi_host,
+				     "Null scmnd for RSP w/tag %016llx\n",
+				     (unsigned long long) rsp->tag);
 		scmnd->result = rsp->status;
 
 		if (rsp->flags & SRP_RSP_FLAG_SNSVALID) {
@@ -852,7 +884,8 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
 	if (0) {
 		int i;
 
-		printk(KERN_ERR PFX "recv completion, opcode 0x%02x\n", opcode);
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "recv completion, opcode 0x%02x\n", opcode);
 
 		for (i = 0; i < wc->byte_len; ++i) {
 			if (i % 8 == 0)
@@ -873,11 +906,13 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
 
 	case SRP_T_LOGOUT:
 		/* XXX Handle target logout */
-		printk(KERN_WARNING PFX "Got target logout request\n");
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Got target logout request\n");
 		break;
 
 	default:
-		printk(KERN_WARNING PFX "Unhandled SRP opcode 0x%02x\n", opcode);
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled SRP opcode 0x%02x\n", opcode);
 		break;
 	}
 
@@ -885,6 +920,26 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
 				      DMA_FROM_DEVICE);
 }
 
+static void srp_reconnect_work(struct work_struct *work)
+{
+	struct srp_target_port *target =
+		container_of(work, struct srp_target_port, work);
+
+	srp_reconnect_target(target);
+}
+
+static void srp_qp_in_err_timer(unsigned long data)
+{
+	struct srp_target_port *target = (struct srp_target_port *)data;
+
+	spin_lock_irq(target->scsi_host->host_lock);
+	INIT_WORK(&target->work, srp_reconnect_work);
+	schedule_work(&target->work);
+	spin_unlock_irq(target->scsi_host->host_lock);
+
+	del_timer(&target->qp_err_timer);
+}
+
 static void srp_completion(struct ib_cq *cq, void *target_ptr)
 {
 	struct srp_target_port *target = target_ptr;
@@ -893,10 +948,20 @@ static void srp_completion(struct ib_cq *cq, void *target_ptr)
 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 	while (ib_poll_cq(cq, 1, &wc) > 0) {
 		if (wc.status) {
-			printk(KERN_ERR PFX "failed %s status %d\n",
-			       wc.wr_id & SRP_OP_RECV ? "receive" : "send",
-			       wc.status);
-			target->qp_in_error = 1;
+			shost_printk(KERN_ERR, target->scsi_host,
+				     PFX "failed %s status %d\n",
+				     wc.wr_id & SRP_OP_RECV ? "receive" : "send",
+				     wc.status);
+			if (!target->qp_in_error) {
+				target->qp_in_error = 1;
+				if (!timer_pending(&target->qp_err_timer)) {
+					setup_timer(&target->qp_err_timer,
+						    srp_qp_in_err_timer,
+						    (unsigned long)target);
+					target->qp_err_timer.expires = 10 * HZ + jiffies;
+					add_timer(&target->qp_err_timer);
+				}
+			}
 			break;
 		}
 
@@ -951,13 +1016,18 @@ static int srp_post_recv(struct srp_target_port *target)
  * req_lim and tx_head.  Lock cannot be dropped between call here and
  * call to __srp_post_send().
  */
-static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target)
+static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target,
+					enum srp_request_type req_type)
 {
+	s32 min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2;
+
 	if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE)
 		return NULL;
 
-	if (unlikely(target->req_lim < 1))
+	if (target->req_lim < min) {
 		++target->zero_req_lim;
+		return NULL;
+	}
 
 	return target->tx_ring[target->tx_head & SRP_SQ_SIZE];
 }
@@ -1004,17 +1074,18 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd,
 	struct ib_device *dev;
 	int len;
 
-	if (target->state == SRP_TARGET_CONNECTING)
+	if (target->state == SRP_TARGET_CONNECTING ||
+	    target->qp_in_error)
 		goto err;
 
 	if (target->state == SRP_TARGET_DEAD ||
 	    target->state == SRP_TARGET_REMOVED) {
-		scmnd->result = DID_BAD_TARGET << 16;
+		scmnd->result = DID_NO_CONNECT << 16;
 		done(scmnd);
 		return 0;
 	}
 
-	iu = __srp_get_tx_iu(target);
+	iu = __srp_get_tx_iu(target, SRP_REQ_NORMAL);
 	if (!iu)
 		goto err;
 
@@ -1043,12 +1114,13 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd,
 
 	len = srp_map_data(scmnd, target, req);
 	if (len < 0) {
-		printk(KERN_ERR PFX "Failed to map data\n");
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Failed to map data\n");
 		goto err;
 	}
 
 	if (__srp_post_recv(target)) {
-		printk(KERN_ERR PFX "Recv failed\n");
+		shost_printk(KERN_ERR, target->scsi_host, PFX "Recv failed\n");
 		goto err_unmap;
 	}
 
@@ -1056,7 +1128,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd,
 				      DMA_TO_DEVICE);
 
 	if (__srp_post_send(target, iu, len)) {
-		printk(KERN_ERR PFX "Send failed\n");
+		shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
 		goto err_unmap;
 	}
 
@@ -1111,6 +1183,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 			       struct ib_cm_event *event,
 			       struct srp_target_port *target)
 {
+	struct Scsi_Host *shost = target->scsi_host;
 	struct ib_class_port_info *cpi;
 	int opcode;
 
@@ -1136,19 +1209,22 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 			memcpy(target->path.dgid.raw,
 			       event->param.rej_rcvd.ari, 16);
 
-			printk(KERN_DEBUG PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n",
-			       (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix),
-			       (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id));
+			shost_printk(KERN_DEBUG, shost,
+				     PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n",
+				     (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix),
+				     (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id));
 
 			target->status = SRP_PORT_REDIRECT;
 		} else {
-			printk(KERN_WARNING "  REJ reason: IB_CM_REJ_PORT_REDIRECT\n");
+			shost_printk(KERN_WARNING, shost,
+				     "  REJ reason: IB_CM_REJ_PORT_REDIRECT\n");
 			target->status = -ECONNRESET;
 		}
 		break;
 
 	case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID:
-		printk(KERN_WARNING "  REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n");
+		shost_printk(KERN_WARNING, shost,
+			    "  REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n");
 		target->status = -ECONNRESET;
 		break;
 
@@ -1159,20 +1235,26 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 			u32 reason = be32_to_cpu(rej->reason);
 
 			if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE)
-				printk(KERN_WARNING PFX
-				       "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");
+				shost_printk(KERN_WARNING, shost,
+					     PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");
 			else
-				printk(KERN_WARNING PFX
-				       "SRP LOGIN REJECTED, reason 0x%08x\n", reason);
+				shost_printk(KERN_WARNING, shost,
+					    PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason);
 		} else
-			printk(KERN_WARNING "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED,"
-			       " opcode 0x%02x\n", opcode);
+			shost_printk(KERN_WARNING, shost,
+				     "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED,"
+				     " opcode 0x%02x\n", opcode);
 		target->status = -ECONNRESET;
 		break;
 
+	case IB_CM_REJ_STALE_CONN:
+		shost_printk(KERN_WARNING, shost, "  REJ reason: stale connection\n");
+		target->status = SRP_STALE_CONN;
+		break;
+
 	default:
-		printk(KERN_WARNING "  REJ reason 0x%x\n",
-		       event->param.rej_rcvd.reason);
+		shost_printk(KERN_WARNING, shost, "  REJ reason 0x%x\n",
+			     event->param.rej_rcvd.reason);
 		target->status = -ECONNRESET;
 	}
 }
@@ -1187,7 +1269,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 
 	switch (event->event) {
 	case IB_CM_REQ_ERROR:
-		printk(KERN_DEBUG PFX "Sending CM REQ failed\n");
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Sending CM REQ failed\n");
 		comp = 1;
 		target->status = -ECONNRESET;
 		break;
@@ -1201,11 +1284,9 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 
 			target->max_ti_iu_len = be32_to_cpu(rsp->max_ti_iu_len);
 			target->req_lim       = be32_to_cpu(rsp->req_lim_delta);
-
-			target->scsi_host->can_queue = min(target->req_lim,
-							   target->scsi_host->can_queue);
 		} else {
-			printk(KERN_WARNING PFX "Unhandled RSP opcode %#x\n", opcode);
+			shost_printk(KERN_WARNING, target->scsi_host,
+				    PFX "Unhandled RSP opcode %#x\n", opcode);
 			target->status = -ECONNRESET;
 			break;
 		}
@@ -1251,22 +1332,24 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 		break;
 
 	case IB_CM_REJ_RECEIVED:
-		printk(KERN_DEBUG PFX "REJ received\n");
+		shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n");
 		comp = 1;
 
 		srp_cm_rej_handler(cm_id, event, target);
 		break;
 
 	case IB_CM_DREQ_RECEIVED:
-		printk(KERN_WARNING PFX "DREQ received - connection closed\n");
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "DREQ received - connection closed\n");
 		if (ib_send_cm_drep(cm_id, NULL, 0))
-			printk(KERN_ERR PFX "Sending CM DREP failed\n");
+			shost_printk(KERN_ERR, target->scsi_host,
+				     PFX "Sending CM DREP failed\n");
 		break;
 
 	case IB_CM_TIMEWAIT_EXIT:
-		printk(KERN_ERR PFX "connection closed\n");
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "connection closed\n");
 
-		comp = 1;
 		target->status = 0;
 		break;
 
@@ -1276,7 +1359,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 		break;
 
 	default:
-		printk(KERN_WARNING PFX "Unhandled CM event %d\n", event->event);
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled CM event %d\n", event->event);
 		break;
 	}
 
@@ -1304,7 +1388,7 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
 
 	init_completion(&req->done);
 
-	iu = __srp_get_tx_iu(target);
+	iu = __srp_get_tx_iu(target, SRP_REQ_TASK_MGMT);
 	if (!iu)
 		goto out;
 
@@ -1353,7 +1437,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
 	struct srp_request *req;
 	int ret = SUCCESS;
 
-	printk(KERN_ERR "SRP abort called\n");
+	shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
 
 	if (target->qp_in_error)
 		return FAILED;
@@ -1383,7 +1467,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
 	struct srp_request *req, *tmp;
 
-	printk(KERN_ERR "SRP reset_device called\n");
+	shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
 
 	if (target->qp_in_error)
 		return FAILED;
@@ -1410,7 +1494,7 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
 	int ret = FAILED;
 
-	printk(KERN_ERR PFX "SRP reset_host called\n");
+	shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
 
 	if (!srp_reconnect_target(target))
 		ret = SUCCESS;
@@ -1822,8 +1906,9 @@ static ssize_t srp_create_target(struct class_device *class_dev,
 
 	ib_get_cached_gid(host->dev->dev, host->port, 0, &target->path.sgid);
 
-	printk(KERN_DEBUG PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x "
-	       "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+	shost_printk(KERN_DEBUG, target->scsi_host, PFX
+		     "new target: id_ext %016llx ioc_guid %016llx pkey %04x "
+		     "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
 	       (unsigned long long) be64_to_cpu(target->id_ext),
 	       (unsigned long long) be64_to_cpu(target->ioc_guid),
 	       be16_to_cpu(target->path.pkey),
@@ -1841,16 +1926,15 @@ static ssize_t srp_create_target(struct class_device *class_dev,
 	if (ret)
 		goto err;
 
-	target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target);
-	if (IS_ERR(target->cm_id)) {
-		ret = PTR_ERR(target->cm_id);
+	ret = srp_new_cm_id(target);
+	if (ret)
 		goto err_free;
-	}
 
 	target->qp_in_error = 0;
 	ret = srp_connect_target(target);
 	if (ret) {
-		printk(KERN_ERR PFX "Connection failed\n");
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Connection failed\n");
 		goto err_cm_id;
 	}
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 0a5f8df..631d8bd 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -54,6 +54,7 @@ enum {
 
 	SRP_PORT_REDIRECT	= 1,
 	SRP_DLID_REDIRECT	= 2,
+	SRP_STALE_CONN		= 3,
 
 	SRP_MAX_LUN		= 512,
 	SRP_DEF_SG_TABLESIZE	= 12,
@@ -79,6 +80,11 @@ enum srp_target_state {
 	SRP_TARGET_REMOVED
 };
 
+enum srp_request_type {
+	SRP_REQ_NORMAL,
+	SRP_REQ_TASK_MGMT,
+};
+
 struct srp_device {
 	struct list_head	dev_list;
 	struct ib_device       *dev;
@@ -160,6 +166,7 @@ struct srp_target_port {
 	int			status;
 	enum srp_target_state	state;
 	int			qp_in_error;
+	struct timer_list	qp_err_timer;
 };
 
 struct srp_iu {