From: Doug Ledford <dledford@redhat.com> Date: Mon, 24 Mar 2008 14:24:27 -0400 Subject: [openib] add improved error handling in srp driver Message-id: 1206383072-7299-7-git-send-email-dledford@redhat.com O-Subject: [Patch RHEL5 06/10] Infiniband: Add improved error handling in srp driver Bugzilla: 253023 In the event that a connection goes stale, we now attempt to restart the connection. We also now use the kernel provided shost_printk when printing kernel message. Signed-off-by: Doug Ledford <dledford@redhat.com> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index a36e978..680cff5 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -201,6 +201,22 @@ out: return ret; } +static int srp_new_cm_id(struct srp_target_port *target) +{ + struct ib_cm_id *new_cm_id; + + new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, + srp_cm_handler, target); + if (IS_ERR(new_cm_id)) + return PTR_ERR(new_cm_id); + + if (target->cm_id) + ib_destroy_cm_id(target->cm_id); + target->cm_id = new_cm_id; + + return 0; +} + static int srp_create_target_ib(struct srp_target_port *target) { struct ib_qp_init_attr *init_attr; @@ -269,7 +285,8 @@ static void srp_path_rec_completion(int status, target->status = status; if (status) - printk(KERN_ERR PFX "Got failed path rec status %d\n", status); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Got failed path rec status %d\n", status); else target->path = *pathrec; complete(&target->done); @@ -300,7 +317,8 @@ static int srp_lookup_path(struct srp_target_port *target) wait_for_completion(&target->done); if (target->status < 0) - printk(KERN_WARNING PFX "Path record query failed\n"); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path record query failed\n"); return target->status; } @@ -376,9 +394,10 @@ static int srp_send_req(struct srp_target_port *target) * the second 8 bytes to the local node GUID. */ if (srp_target_is_topspin(target)) { - printk(KERN_DEBUG PFX "Topspin/Cisco initiator port ID workaround " - "activated for target GUID %016llx\n", - (unsigned long long) be64_to_cpu(target->ioc_guid)); + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Topspin/Cisco initiator port ID workaround " + "activated for target GUID %016llx\n", + (unsigned long long) be64_to_cpu(target->ioc_guid)); memset(req->priv.initiator_port_id, 0, 8); memcpy(req->priv.initiator_port_id + 8, &target->srp_host->dev->dev->node_guid, 8); @@ -397,10 +416,10 @@ static void srp_disconnect_target(struct srp_target_port *target) init_completion(&target->done); if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { - printk(KERN_DEBUG PFX "Sending CM DREQ failed\n"); + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); return; } - wait_for_completion(&target->done); } static void srp_remove_work(struct work_struct *work) @@ -428,6 +447,7 @@ static void srp_remove_work(struct work_struct *work) static int srp_connect_target(struct srp_target_port *target) { + int retries = 3; int ret; ret = srp_lookup_path(target); @@ -460,6 +480,21 @@ static int srp_connect_target(struct srp_target_port *target) case SRP_DLID_REDIRECT: break; + case SRP_STALE_CONN: + /* Our current CM id was stale, and is now in timewait. + * Try to reconnect with a new one. + */ + if (!retries-- || srp_new_cm_id(target)) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "giving up on stale connection\n"); + target->status = -ECONNRESET; + return target->status; + } + + shost_printk(KERN_ERR, target->scsi_host, PFX + "retrying stale connection\n"); + break; + default: return target->status; } @@ -514,7 +549,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re static int srp_reconnect_target(struct srp_target_port *target) { - struct ib_cm_id *new_cm_id; struct srp_request *req, *tmp; int ret; struct ib_cq *old_cq; @@ -533,14 +567,9 @@ static int srp_reconnect_target(struct srp_target_port *target) * Now get a new local CM ID so that we avoid confusing the * target in case things are really fouled up. */ - new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, - srp_cm_handler, target); - if (IS_ERR(new_cm_id)) { - ret = PTR_ERR(new_cm_id); + ret = srp_new_cm_id(target); + if (ret) goto err; - } - ib_destroy_cm_id(target->cm_id); - target->cm_id = new_cm_id; old_qp = target->qp; old_cq = target->cq; @@ -579,7 +608,8 @@ static int srp_reconnect_target(struct srp_target_port *target) return ret; err: - printk(KERN_ERR PFX "reconnect failed (%d), removing target port.\n", ret); + shost_printk(KERN_ERR, target->scsi_host, + PFX "reconnect failed (%d), removing target port.\n", ret); /* * We couldn't reconnect, so kill our target port off. @@ -694,8 +724,9 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, if (scmnd->sc_data_direction != DMA_FROM_DEVICE && scmnd->sc_data_direction != DMA_TO_DEVICE) { - printk(KERN_WARNING PFX "Unhandled data direction %d\n", - scmnd->sc_data_direction); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled data direction %d\n", + scmnd->sc_data_direction); return -EINVAL; } @@ -807,8 +838,9 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) } else { scmnd = req->scmnd; if (!scmnd) - printk(KERN_ERR "Null scmnd for RSP w/tag %016llx\n", - (unsigned long long) rsp->tag); + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %016llx\n", + (unsigned long long) rsp->tag); scmnd->result = rsp->status; if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { @@ -852,7 +884,8 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) if (0) { int i; - printk(KERN_ERR PFX "recv completion, opcode 0x%02x\n", opcode); + shost_printk(KERN_ERR, target->scsi_host, + PFX "recv completion, opcode 0x%02x\n", opcode); for (i = 0; i < wc->byte_len; ++i) { if (i % 8 == 0) @@ -873,11 +906,13 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) case SRP_T_LOGOUT: /* XXX Handle target logout */ - printk(KERN_WARNING PFX "Got target logout request\n"); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Got target logout request\n"); break; default: - printk(KERN_WARNING PFX "Unhandled SRP opcode 0x%02x\n", opcode); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled SRP opcode 0x%02x\n", opcode); break; } @@ -885,6 +920,26 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) DMA_FROM_DEVICE); } +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, work); + + srp_reconnect_target(target); +} + +static void srp_qp_in_err_timer(unsigned long data) +{ + struct srp_target_port *target = (struct srp_target_port *)data; + + spin_lock_irq(target->scsi_host->host_lock); + INIT_WORK(&target->work, srp_reconnect_work); + schedule_work(&target->work); + spin_unlock_irq(target->scsi_host->host_lock); + + del_timer(&target->qp_err_timer); +} + static void srp_completion(struct ib_cq *cq, void *target_ptr) { struct srp_target_port *target = target_ptr; @@ -893,10 +948,20 @@ static void srp_completion(struct ib_cq *cq, void *target_ptr) ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { if (wc.status) { - printk(KERN_ERR PFX "failed %s status %d\n", - wc.wr_id & SRP_OP_RECV ? "receive" : "send", - wc.status); - target->qp_in_error = 1; + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d\n", + wc.wr_id & SRP_OP_RECV ? "receive" : "send", + wc.status); + if (!target->qp_in_error) { + target->qp_in_error = 1; + if (!timer_pending(&target->qp_err_timer)) { + setup_timer(&target->qp_err_timer, + srp_qp_in_err_timer, + (unsigned long)target); + target->qp_err_timer.expires = 10 * HZ + jiffies; + add_timer(&target->qp_err_timer); + } + } break; } @@ -951,13 +1016,18 @@ static int srp_post_recv(struct srp_target_port *target) * req_lim and tx_head. Lock cannot be dropped between call here and * call to __srp_post_send(). */ -static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target) +static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, + enum srp_request_type req_type) { + s32 min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2; + if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) return NULL; - if (unlikely(target->req_lim < 1)) + if (target->req_lim < min) { ++target->zero_req_lim; + return NULL; + } return target->tx_ring[target->tx_head & SRP_SQ_SIZE]; } @@ -1004,17 +1074,18 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, struct ib_device *dev; int len; - if (target->state == SRP_TARGET_CONNECTING) + if (target->state == SRP_TARGET_CONNECTING || + target->qp_in_error) goto err; if (target->state == SRP_TARGET_DEAD || target->state == SRP_TARGET_REMOVED) { - scmnd->result = DID_BAD_TARGET << 16; + scmnd->result = DID_NO_CONNECT << 16; done(scmnd); return 0; } - iu = __srp_get_tx_iu(target); + iu = __srp_get_tx_iu(target, SRP_REQ_NORMAL); if (!iu) goto err; @@ -1043,12 +1114,13 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, len = srp_map_data(scmnd, target, req); if (len < 0) { - printk(KERN_ERR PFX "Failed to map data\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Failed to map data\n"); goto err; } if (__srp_post_recv(target)) { - printk(KERN_ERR PFX "Recv failed\n"); + shost_printk(KERN_ERR, target->scsi_host, PFX "Recv failed\n"); goto err_unmap; } @@ -1056,7 +1128,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, DMA_TO_DEVICE); if (__srp_post_send(target, iu, len)) { - printk(KERN_ERR PFX "Send failed\n"); + shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n"); goto err_unmap; } @@ -1111,6 +1183,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event, struct srp_target_port *target) { + struct Scsi_Host *shost = target->scsi_host; struct ib_class_port_info *cpi; int opcode; @@ -1136,19 +1209,22 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, memcpy(target->path.dgid.raw, event->param.rej_rcvd.ari, 16); - printk(KERN_DEBUG PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", - (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix), - (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id)); + shost_printk(KERN_DEBUG, shost, + PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", + (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix), + (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id)); target->status = SRP_PORT_REDIRECT; } else { - printk(KERN_WARNING " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); target->status = -ECONNRESET; } break; case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: - printk(KERN_WARNING " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); target->status = -ECONNRESET; break; @@ -1159,20 +1235,26 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, u32 reason = be32_to_cpu(rej->reason); if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) - printk(KERN_WARNING PFX - "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); else - printk(KERN_WARNING PFX - "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + shost_printk(KERN_WARNING, shost, + PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); } else - printk(KERN_WARNING " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," - " opcode 0x%02x\n", opcode); + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," + " opcode 0x%02x\n", opcode); target->status = -ECONNRESET; break; + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); + target->status = SRP_STALE_CONN; + break; + default: - printk(KERN_WARNING " REJ reason 0x%x\n", - event->param.rej_rcvd.reason); + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->param.rej_rcvd.reason); target->status = -ECONNRESET; } } @@ -1187,7 +1269,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) switch (event->event) { case IB_CM_REQ_ERROR: - printk(KERN_DEBUG PFX "Sending CM REQ failed\n"); + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); comp = 1; target->status = -ECONNRESET; break; @@ -1201,11 +1284,9 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) target->max_ti_iu_len = be32_to_cpu(rsp->max_ti_iu_len); target->req_lim = be32_to_cpu(rsp->req_lim_delta); - - target->scsi_host->can_queue = min(target->req_lim, - target->scsi_host->can_queue); } else { - printk(KERN_WARNING PFX "Unhandled RSP opcode %#x\n", opcode); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled RSP opcode %#x\n", opcode); target->status = -ECONNRESET; break; } @@ -1251,22 +1332,24 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) break; case IB_CM_REJ_RECEIVED: - printk(KERN_DEBUG PFX "REJ received\n"); + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); comp = 1; srp_cm_rej_handler(cm_id, event, target); break; case IB_CM_DREQ_RECEIVED: - printk(KERN_WARNING PFX "DREQ received - connection closed\n"); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "DREQ received - connection closed\n"); if (ib_send_cm_drep(cm_id, NULL, 0)) - printk(KERN_ERR PFX "Sending CM DREP failed\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Sending CM DREP failed\n"); break; case IB_CM_TIMEWAIT_EXIT: - printk(KERN_ERR PFX "connection closed\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); - comp = 1; target->status = 0; break; @@ -1276,7 +1359,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) break; default: - printk(KERN_WARNING PFX "Unhandled CM event %d\n", event->event); + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); break; } @@ -1304,7 +1388,7 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, init_completion(&req->done); - iu = __srp_get_tx_iu(target); + iu = __srp_get_tx_iu(target, SRP_REQ_TASK_MGMT); if (!iu) goto out; @@ -1353,7 +1437,7 @@ static int srp_abort(struct scsi_cmnd *scmnd) struct srp_request *req; int ret = SUCCESS; - printk(KERN_ERR "SRP abort called\n"); + shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); if (target->qp_in_error) return FAILED; @@ -1383,7 +1467,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_request *req, *tmp; - printk(KERN_ERR "SRP reset_device called\n"); + shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); if (target->qp_in_error) return FAILED; @@ -1410,7 +1494,7 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) struct srp_target_port *target = host_to_target(scmnd->device->host); int ret = FAILED; - printk(KERN_ERR PFX "SRP reset_host called\n"); + shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); if (!srp_reconnect_target(target)) ret = SUCCESS; @@ -1822,8 +1906,9 @@ static ssize_t srp_create_target(struct class_device *class_dev, ib_get_cached_gid(host->dev->dev, host->port, 0, &target->path.sgid); - printk(KERN_DEBUG PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x " - "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x " + "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", (unsigned long long) be64_to_cpu(target->id_ext), (unsigned long long) be64_to_cpu(target->ioc_guid), be16_to_cpu(target->path.pkey), @@ -1841,16 +1926,15 @@ static ssize_t srp_create_target(struct class_device *class_dev, if (ret) goto err; - target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target); - if (IS_ERR(target->cm_id)) { - ret = PTR_ERR(target->cm_id); + ret = srp_new_cm_id(target); + if (ret) goto err_free; - } target->qp_in_error = 0; ret = srp_connect_target(target); if (ret) { - printk(KERN_ERR PFX "Connection failed\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Connection failed\n"); goto err_cm_id; } diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 0a5f8df..631d8bd 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -54,6 +54,7 @@ enum { SRP_PORT_REDIRECT = 1, SRP_DLID_REDIRECT = 2, + SRP_STALE_CONN = 3, SRP_MAX_LUN = 512, SRP_DEF_SG_TABLESIZE = 12, @@ -79,6 +80,11 @@ enum srp_target_state { SRP_TARGET_REMOVED }; +enum srp_request_type { + SRP_REQ_NORMAL, + SRP_REQ_TASK_MGMT, +}; + struct srp_device { struct list_head dev_list; struct ib_device *dev; @@ -160,6 +166,7 @@ struct srp_target_port { int status; enum srp_target_state state; int qp_in_error; + struct timer_list qp_err_timer; }; struct srp_iu {