Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 89877e42827f16fa5f86b1df0c2860b1 > files > 1934

kernel-2.6.18-128.1.10.el5.src.rpm

From: Hans-Joachim Picht <hpicht@redhat.com>
Date: Thu, 31 Jan 2008 11:23:12 +0100
Subject: [s390] cio: introduce timed recovery procedure
Message-id: 20080131102312.GD16660@redhat.com
O-Subject: [RHEL5 U2 PATCH 4/4] s390 - cio: introduce timed recovery procedure
Bugzilla: 430593
RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com>

Description
============

In some cases, slow hardware response has been seen to interfere with
Linux channel path verification, leading to inaccessible devices.
To counter this problem, a timed path verification retry has been added.

Initially, path verification is retried after 3s. If that retry fails as
well, further retries follow every 30s. If a verify event arrives while the
timer for a scheduled verification is pending, the timer is stopped and the
verification is processed right away.

Bugzilla
=========

BZ 430593
https://bugzilla.redhat.com/show_bug.cgi?id=430593

Upstream status of the patch:
=============================
Patch is upstream in git commit 90ab133603d066e850fc9ed297b6eb52f888dd25

Test status:
============
A kernel with this patch was built and successfully tested.

Please ACK.

With best regards,

Hans

 drivers/s390/cio/device.c     |    2 +
 drivers/s390/cio/device.h     |    1 +
 drivers/s390/cio/device_fsm.c |   73 +++++++++++++++++++++++++++++++++++------
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index dd45ac7..287aa30 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -672,6 +672,8 @@ ccw_device_release(struct device *dev)
 	struct ccw_device *cdev;
 
 	cdev = to_ccwdev(dev);
+	/* Just to be sure this is not still running. */
+	ccw_device_schedule_verify(cdev, 0);
 	kfree(cdev->private);
 	kfree(cdev);
 }
diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h
index 00be9a5..4f936b2 100644
--- a/drivers/s390/cio/device.h
+++ b/drivers/s390/cio/device.h
@@ -82,6 +82,7 @@ int ccw_device_cancel_halt_clear(struct ccw_device *);
 int ccw_device_register(struct ccw_device *);
 void ccw_device_do_unreg_rereg(void *);
 void ccw_device_call_sch_unregister(void *);
+void ccw_device_schedule_verify(struct ccw_device *cdev, unsigned long delay);
 
 int ccw_device_recognition(struct ccw_device *);
 int ccw_device_online(struct ccw_device *);
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index beb8f64..2cfe4e3 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -59,6 +59,12 @@ device_set_disconnected(struct subchannel *sch)
 	ccw_device_set_timeout(cdev, 0);
 	cdev->private->flags.fake_irb = 0;
 	cdev->private->state = DEV_STATE_DISCONNECTED;
+
+	/*
+	 * This is called when the device was not disconnected before. So
+	 * we use 3s as an initial re-validation delay.
+	 */
+	ccw_device_schedule_verify(cdev, 3*HZ);
 }
 
 void
@@ -98,6 +104,24 @@ device_trigger_verify(struct subchannel *sch)
 }
 
 /*
+ * Set the CCW device timer: cancel it, then, if delay != 0, re-arm it to run
+ * callback after delay jiffies. (Assumes the ccw device lock is being held)
+ */
+static void device_set_timer(struct ccw_device *cdev, unsigned long delay,
+		void (*callback)(unsigned long))
+{
+	struct timer_list *t = &cdev->private->timer;
+
+	del_timer(t);
+	if (delay != 0) {
+		t->function = callback;
+		t->data = (unsigned long) cdev;
+		t->expires = jiffies + delay;
+		add_timer(t);
+	}
+}
+
+/*
  * Timeout function. It just triggers a DEV_EVENT_TIMEOUT.
  */
 static void
@@ -117,19 +141,31 @@ ccw_device_timeout(unsigned long data)
 void
 ccw_device_set_timeout(struct ccw_device *cdev, int expires)
 {
-	if (expires == 0) {
-		del_timer(&cdev->private->timer);
+	device_set_timer(cdev, expires, ccw_device_timeout);
+}
+
+void
+ccw_device_delayed_verify(unsigned long data)
+{
+	struct ccw_device *cdev = (struct ccw_device *) data;
+
+	spin_lock_irq(cdev->ccwlock);
+	if (cdev->private->state != DEV_STATE_DISCONNECTED) {
+		spin_unlock_irq(cdev->ccwlock);
 		return;
 	}
-	if (timer_pending(&cdev->private->timer)) {
-		if (mod_timer(&cdev->private->timer, jiffies + expires))
-			return;
-	}
-	cdev->private->timer.function = ccw_device_timeout;
-	cdev->private->timer.data = (unsigned long) cdev;
-	cdev->private->timer.expires = jiffies + expires;
-	add_timer(&cdev->private->timer);
+	CIO_MSG_EVENT(3, "Delayed revalidation called for device %s\n",
+		cdev->dev.bus_id);
+	device_trigger_reprobe(to_subchannel(cdev->dev.parent));
+	spin_unlock_irq(cdev->ccwlock);
+}
+
+void
+ccw_device_schedule_verify(struct ccw_device *cdev, unsigned long delay)
+{
+	device_set_timer(cdev, delay, ccw_device_delayed_verify);
 }
+EXPORT_SYMBOL_GPL(ccw_device_schedule_verify);
 
 /* Kill any pending timers after machine check. */
 void
@@ -277,6 +313,12 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 		if (state == DEV_STATE_NOT_OPER) {
 			cdev->private->flags.recog_done = 1;
 			cdev->private->state = DEV_STATE_DISCONNECTED;
+			CIO_DEBUG(KERN_DEBUG, 4, "SenseID: "
+				"device %04x on subchannel 0.%x.%04x "
+				"disconnected after verify.\n",
+				cdev->private->devno, sch->schid.ssid,
+				sch->schid.sch_no);
+			ccw_device_schedule_verify(cdev, 30*HZ);
 			return;
 		}
 		/* Boxed devices don't need extra treatment. */
@@ -550,6 +592,7 @@ ccw_device_recog_timeout(struct ccw_device *cdev, enum dev_event dev_event)
 static void
 ccw_device_nopath_notify(void *data)
 {
+	unsigned long flags;
 	struct ccw_device *cdev;
 	struct subchannel *sch;
 	int ret;
@@ -575,10 +618,13 @@ ccw_device_nopath_notify(void *data)
 				put_device(&sch->dev);
 		}
 	} else {
+		spin_lock_irqsave(cdev->ccwlock, flags);
 		cio_disable_subchannel(sch);
 		ccw_device_set_timeout(cdev, 0);
 		cdev->private->flags.fake_irb = 0;
 		cdev->private->state = DEV_STATE_DISCONNECTED;
+		ccw_device_schedule_verify(cdev, 3*HZ);
+		spin_unlock_irqrestore(cdev->ccwlock, flags);
 		wake_up(&cdev->private->wait_q);
 	}
 }
@@ -778,6 +824,7 @@ ccw_device_online_notoper(struct ccw_device *cdev, enum dev_event dev_event)
 			ccw_device_set_timeout(cdev, 0);
 			cdev->private->flags.fake_irb = 0;
 			cdev->private->state = DEV_STATE_DISCONNECTED;
+			ccw_device_schedule_verify(cdev, 3*HZ);
 			wake_up(&cdev->private->wait_q);
 			return;
 	}
@@ -1170,6 +1217,12 @@ ccw_device_start_id(struct ccw_device *cdev, enum dev_event dev_event)
 		/* Couldn't enable the subchannel for i/o. Sick device. */
 		return;
 
+	/*
+	 * This should be the only way to get out of the disconnected state.
+	 * So stopping the delayed verification here hopefully is enough.
+	 */
+	ccw_device_schedule_verify(cdev, 0);
+
 	/* After 60s the device recognition is considered to have failed. */
 	ccw_device_set_timeout(cdev, 60*HZ);