Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 3585

kernel-2.6.18-194.11.1.el5.src.rpm

From: Marcus Barrow <mbarrow@redhat.com>
Date: Mon, 29 Sep 2008 19:05:37 -0400
Subject: [scsi] qla2xxx: support PCI Enhanced Error Recovery
Message-id: 20080929230532.29815.86467.sendpatchset@file.bos.redhat.com
O-Subject: [rhel 5.3 patch] [V2] qla2xxx: Support PCI Enhanced Error Recovery..
Bugzilla: 462416
RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com>
RH-Acked-by: Tomas Henzl <thenzl@redhat.com>
RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com>

BZ 462416 Update qla2xxx - PCI EE error handling support [V2]

Version 2 of this patch, to correct an un-intended change from upstream. When
IBM ported this code from upstream, a line which releases the firmware on module
exit was uncommented. Upstream supports loadable firmware, but RHEL 5 does not.

This patch provides support for PCI Enhanced Error Handling.

It was originally submitted for 5.2 but problems were found during testing.
IBM has provided addional fixes for the kernel, then updated and tested this
patch to the driver.

It applies cleanly and builds under kernel-2.6.18-110.

This patch is late because I received a data corruption patch late last week,
and there were problems with my tree and setting up lab equipment on site to
perform additional testing.

diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index defa234..1da3340 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -2272,6 +2272,7 @@ typedef struct scsi_qla_host {
 		uint32_t	gpsc_supported		:1;
 		uint32_t        vsan_enabled            :1;
 		uint32_t        npiv_supported          :1;
+		uint32_t        pci_channel_io_perm_failure          :1;
 	} flags;
 
 	atomic_t	loop_state;
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
index 9c2c280..ffc6bde 100644
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -66,6 +66,7 @@ qla2x00_initialize_adapter(scsi_qla_host_t *ha)
 	/* Clear adapter flags. */
 	ha->flags.online = 0;
 	ha->flags.reset_active = 0;
+	ha->flags.pci_channel_io_perm_failure = 0;
 	atomic_set(&ha->loop_down_timer, LOOP_DOWN_TIME);
 	atomic_set(&ha->loop_state, LOOP_DOWN);
 	ha->device_flags = DFLG_NO_CABLE;
@@ -3181,6 +3182,13 @@ qla2x00_abort_isp(scsi_qla_host_t *ha)
 		}
 		spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
+		if (pci_channel_offline(ha->pdev) &&
+		    ha->flags.pci_channel_io_perm_failure) {
+			clear_bit(ISP_ABORT_RETRY, &ha->dpc_flags);
+			status = 0;
+			goto isp_return;
+		}
+
 		ha->isp_ops->get_flash_version(ha, ha->request_ring);
 
 		rval = ha->isp_ops->nvram_config(ha);
@@ -3250,6 +3258,7 @@ isp_abort_retry:
 
 	}
 
+isp_return:
 	if (status) {
 		qla_printk(KERN_INFO, ha,
 			"qla2x00_abort_isp: **** FAILED ****\n");
@@ -3897,6 +3906,8 @@ qla2x00_try_to_stop_firmware(scsi_qla_host_t *ha)
 {
 	int ret, retries;
 
+	if (ha->flags.pci_channel_io_perm_failure)
+                return;
 	if (!IS_FWI2_CAPABLE(ha))
 		return;
 	if (!ha->fw_major_version)
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index 4106c90..f5bb36d 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -36,6 +36,7 @@ qla2100_intr_handler(int irq, void *dev_id, struct pt_regs *regs)
 	int		status;
 	unsigned long	flags;
 	unsigned long	iter;
+	uint16_t        hccr;
 	uint16_t	mb[4];
 
 	ha = (scsi_qla_host_t *) dev_id;
@@ -50,7 +51,23 @@ qla2100_intr_handler(int irq, void *dev_id, struct pt_regs *regs)
 
 	spin_lock_irqsave(&ha->hardware_lock, flags);
 	for (iter = 50; iter--; ) {
-		if ((RD_REG_WORD(&reg->istatus) & ISR_RISC_INT) == 0)
+		hccr = RD_REG_WORD(&reg->hccr);
+		if (hccr & HCCR_RISC_PAUSE) {
+			if (pci_channel_offline(ha->pdev))
+				break;
+
+			/*
+			 * Issue a "HARD" reset in order for the RISC interrupt
+			 * bit to be cleared.  Schedule a big hammmer to get
+			 * out of the RISC PAUSED state.
+			 */
+			WRT_REG_WORD(&reg->hccr, HCCR_RESET_RISC);
+			RD_REG_WORD(&reg->hccr);
+
+			ha->isp_ops->fw_dump(ha, 1);
+			set_bit(ISP_ABORT_NEEDED, &ha->dpc_flags);
+			break;
+		} else if ((RD_REG_WORD(&reg->istatus) & ISR_RISC_INT) == 0)
 			break;
 
 		if (RD_REG_WORD(&reg->semaphore) & BIT_0) {
@@ -130,6 +147,9 @@ qla2300_intr_handler(int irq, void *dev_id, struct pt_regs *regs)
 	for (iter = 50; iter--; ) {
 		stat = RD_REG_DWORD(&reg->u.isp2300.host_status);
 		if (stat & HSR_RISC_PAUSED) {
+			if (pci_channel_offline(ha->pdev))
+				break;
+
 			hccr = RD_REG_WORD(&reg->hccr);
 			if (hccr & (BIT_15 | BIT_13 | BIT_11 | BIT_8))
 				qla_printk(KERN_INFO, ha, "Parity error -- "
@@ -1605,6 +1625,9 @@ qla24xx_intr_handler(int irq, void *dev_id, struct pt_regs *regs)
 	for (iter = 50; iter--; ) {
 		stat = RD_REG_DWORD(&reg->host_status);
 		if (stat & HSRX_RISC_PAUSED) {
+			if (pci_channel_offline(ha->pdev))
+				break;
+
 			hccr = RD_REG_DWORD(&reg->hccr);
 
 			qla_printk(KERN_INFO, ha, "RISC paused -- HCCR=%x, "
@@ -1740,6 +1763,9 @@ qla24xx_msix_default(int irq, void *dev_id, struct pt_regs *regs)
 	do {
 		stat = RD_REG_DWORD(&reg->host_status);
 		if (stat & HSRX_RISC_PAUSED) {
+			if (pci_channel_offline(ha->pdev))
+				break;
+
 			hccr = RD_REG_DWORD(&reg->hccr);
 
 			qla_printk(KERN_INFO, ha, "RISC paused -- HCCR=%x, "
diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
index c6a3418..29e5f19 100644
--- a/drivers/scsi/qla2xxx/qla_mbx.c
+++ b/drivers/scsi/qla2xxx/qla_mbx.c
@@ -57,7 +57,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *pvha, mbx_cmd_t *mcp)
 	 * seconds. This is to serialize actual issuing of mailbox cmds during
 	 * non ISP abort time.
 	 */
-	if (!abort_active) {
+	if (!abort_active && !ha->flags.pci_channel_io_perm_failure) {
 		if (!wait_for_completion_timeout(&ha->mbx_cmd_comp,
 		    mcp->tov * HZ)) {
 			/* Timeout occurred. Return error. */
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index d2ec14b..ef4efb0 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -409,6 +409,11 @@ qla2x00_queuecommand(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 	srb_t *sp;
 	int rval;
 
+	if (unlikely(pci_channel_offline(ha->pdev))) {
+		cmd->result = DID_REQUEUE << 16;
+		goto qc_fail_command;
+	}
+
 	rval = fc_remote_port_chkready(rport);
 	if (rval) {
 		cmd->result = rval;
@@ -471,6 +476,11 @@ qla24xx_queuecommand(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 	srb_t *sp;
 	int rval;
 
+	if (unlikely(pci_channel_offline(ha->pdev))) {
+		cmd->result = DID_REQUEUE << 16;
+		goto qc24_fail_command;
+	}
+
 	rval = fc_remote_port_chkready(rport);
 	if (rval) {
 		cmd->result = rval;
@@ -1876,6 +1886,24 @@ qla2x00_remove_one(struct pci_dev *pdev)
 		}
 	}
 
+	/* Disable timer */
+	if (ha->timer_active)
+		qla2x00_stop_timer(ha);
+
+	/* Kill the kernel thread for this host */
+	if (ha->dpc_thread) {
+		struct task_struct *t = ha->dpc_thread;
+
+		/*
+		 * qla2xxx_wake_dpc checks for ->dpc_thread
+		 * so we need to zero it out.
+		 */
+		ha->dpc_thread = NULL;
+		kthread_stop(t);
+	}
+
+	ha->flags.online = 0;
+
 	qla84xx_put_chip(ha);
 
 	qla2x00_free_sysfs_attr(ha);
@@ -2891,6 +2919,109 @@ qla2x00_release_firmware(void)
 	up(&qla_fw_lock);
 }
 
+static pci_ers_result_t
+qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	scsi_qla_host_t *ha = pci_get_drvdata(pdev);
+
+	switch (state) {
+	case pci_channel_io_normal:
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	case pci_channel_io_frozen:
+		pci_disable_device(pdev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	case pci_channel_io_perm_failure:
+		ha->flags.pci_channel_io_perm_failure = 1;
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t
+qla2xxx_pci_mmio_enabled(struct pci_dev *pdev)
+{
+	int risc_paused = 0;
+	uint32_t stat;
+	unsigned long flags;
+	scsi_qla_host_t *ha = pci_get_drvdata(pdev);
+	struct device_reg_2xxx __iomem *reg = &ha->iobase->isp;
+	struct device_reg_24xx __iomem *reg24 = &ha->iobase->isp24;
+
+	spin_lock_irqsave(&ha->hardware_lock, flags);
+	if (IS_QLA2100(ha) || IS_QLA2200(ha)){
+		stat = RD_REG_DWORD(&reg->hccr);
+		if (stat & HCCR_RISC_PAUSE)
+			risc_paused = 1;
+	} else if (IS_QLA23XX(ha)) {
+		stat = RD_REG_DWORD(&reg->u.isp2300.host_status);
+		if (stat & HSR_RISC_PAUSED)
+			risc_paused = 1;
+	} else if (IS_FWI2_CAPABLE(ha)) {
+		stat = RD_REG_DWORD(&reg24->host_status);
+		if (stat & HSRX_RISC_PAUSED)
+			risc_paused = 1;
+	}
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
+
+	if (risc_paused) {
+		qla_printk(KERN_INFO, ha, "RISC paused -- mmio_enabled, "
+		    "Dumping firmware!\n");
+		ha->isp_ops->fw_dump(ha, 0);
+
+		return PCI_ERS_RESULT_NEED_RESET;
+	} else
+		return PCI_ERS_RESULT_RECOVERED;
+}
+
+static pci_ers_result_t
+qla2xxx_pci_slot_reset(struct pci_dev *pdev)
+{
+	pci_ers_result_t ret = PCI_ERS_RESULT_DISCONNECT;
+	scsi_qla_host_t *ha = pci_get_drvdata(pdev);
+	int rc;
+
+	rc = pci_enable_device(pdev);
+
+	if (rc) {
+		qla_printk(KERN_WARNING, ha,
+		    "Can't re-enable PCI device after reset.\n");
+
+		return ret;
+	}
+	pci_set_master(pdev);
+
+	if (ha->isp_ops->pci_config(ha))
+		return ret;
+
+	set_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
+	if (qla2x00_abort_isp(ha)== QLA_SUCCESS)
+		ret =  PCI_ERS_RESULT_RECOVERED;
+	clear_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
+
+	return ret;
+}
+
+static void
+qla2xxx_pci_resume(struct pci_dev *pdev)
+{
+	scsi_qla_host_t *ha = pci_get_drvdata(pdev);
+	int ret;
+
+	ret = qla2x00_wait_for_hba_online(ha);
+	if (ret != QLA_SUCCESS) {
+		qla_printk(KERN_ERR, ha,
+		    "the device failed to resume I/O "
+		    "from slot/link_reset");
+	}
+}
+
+static struct pci_error_handlers qla2xxx_err_handler = {
+	.error_detected = qla2xxx_pci_error_detected,
+	.mmio_enabled = qla2xxx_pci_mmio_enabled,
+	.slot_reset = qla2xxx_pci_slot_reset,
+	.resume = qla2xxx_pci_resume,
+};
+
 static struct pci_device_id qla2xxx_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP2100) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP2200) },
@@ -2917,6 +3048,7 @@ static struct pci_driver qla2xxx_pci_driver = {
 	.id_table	= qla2xxx_pci_tbl,
 	.probe		= qla2x00_probe_one,
 	.remove		= __devexit_p(qla2x00_remove_one),
+	.err_handler    = &qla2xxx_err_handler,
 };
 
 /**