From: Hans-Joachim Picht <hpicht@redhat.com>
Date: Tue, 1 Jul 2008 15:13:07 +0200
Subject: [s390] cio: fix unusable zfcp device after vary off/on
Message-id: 20080701131307.GD20922@redhat.com
O-Subject: [RHEL5 U3 PATCH 4/6] s390 - cio: fix unusable zfcp device after vary off/on
Bugzilla: 451223
RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com>

Description
============

After several vary off/on cycles on an FCP chpid, the
zfcp adapter on that chpid shows an availability of
"no path", even though the chpid is online again.
The problem is caused by missing locking and
serialization in the chpid state change handling code.

Adding proper locking and serialization to the chpid state
change handling code fixes the problem.
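
For reference, the heart of the serialization is a new helper,
css_wait_for_slow_path(), which each chpid state change handler
(chsc_chp_offline(), s390_process_res_acc(), chsc_chp_online(),
chsc_chp_vary()) now calls before acting on its subchannels. A
condensed sketch of that helper, taken from the css.c hunk below
with explanatory comments added (illustrative only, not part of
the patch itself):

	void css_wait_for_slow_path(void)
	{
		/* Flush outstanding device notifications so no
		 * notify callback can still race with the event. */
		flush_workqueue(ccw_device_notify_work);
		/* Kill I/O on devices still in their internal grace
		 * period (DEV_STATE_WAIT4IO) and wait until they
		 * have reached a final or disconnected state. */
		bus_for_each_dev(&css_bus_type, NULL, NULL,
				 css_end_grace_period);
		/* Finally, let pending slow-path work complete
		 * before the new chpid event is processed. */
		flush_workqueue(slow_path_wq);
	}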

Bugzilla
=========

BZ 451223
https://bugzilla.redhat.com/show_bug.cgi?id=451223

Upstream status of the patch:
=============================

The patch is contained in linux-2.6, spread across the following
git commit IDs:

2470b648e17e0216922bb78c7f05b4668402459a
ee04bbccdeb11bdbc54015be8dca30a0deeca5e4
7c8427c3fa1b2e77c5bd8cf219c1d55dccd0f167
3f4cf6e72f9f6a0b046b32881acc4f829f3aaa46

Some parts of the patch are not contained upstream because the
upstream code base is different.

Test status:
============

The patch has been tested and fixes the problem.
The fix has been verified by the IBM test department.

Please ACK.

With best regards,

	-Hans

diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index 8cc32d1..2c87998 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -283,6 +283,8 @@ void chsc_chp_offline(struct chp_id chpid)
 
 	if (chp_get_status(chpid) <= 0)
 		return;
+	/* Wait until previous actions have settled. */
+	css_wait_for_slow_path();
 	bus_for_each_dev(&css_bus_type, NULL, &chpid,
 			 s390_subchannel_remove_chpid);
 
@@ -409,7 +411,8 @@ s390_process_res_acc (struct res_acc_data *res_data)
 		sprintf(dbf_txt, "fla%x", res_data->fla);
 		CIO_TRACE_EVENT( 2, dbf_txt);
 	}
-
+	/* Wait until previous actions have settled. */
+	css_wait_for_slow_path();
 	/*
 	 * I/O resources may have become accessible.
 	 * Scan through all subchannels that may be concerned and
@@ -719,6 +722,8 @@ int chsc_chp_online(struct chp_id chpid)
 
 	if (chp_get_status(chpid) == 0)
 		return 0;
+	/* Wait until previous actions have settled. */
+	css_wait_for_slow_path();
 	rc = for_each_subchannel(__chp_add, &chpid);
 	if (css_slow_subchannels_exist())
 		rc = -EAGAIN;
@@ -839,6 +844,8 @@ __s390_vary_chpid_on(struct subchannel_id schid, void *data)
  */
 int chsc_chp_vary(struct chp_id chpid, int on)
 {
+	/* Wait until previous actions have settled. */
+	css_wait_for_slow_path();
 	/*
 	 * Redo PathVerification on the devices the chpid connects to
 	 */
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 651117e..8caf089 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -241,8 +241,10 @@ static int css_evaluate_known_subchannel(struct subchannel *sch, int slow)
 			spin_unlock_irqrestore(&sch->lock, flags);
 			ret = sch->driver->notify(&sch->dev, event);
 			spin_lock_irqsave(&sch->lock, flags);
-			if (ret)
+			if (ret) {
 				action = NONE;
+				device_wake_up_wait_q(sch);
+			}
 		}
 		break;
 	case CIO_REVALIDATE:
@@ -364,6 +366,33 @@ typedef void (*workfunc)(void *);
 DECLARE_WORK(slow_path_work, (workfunc)css_trigger_slow_path, NULL);
 struct workqueue_struct *slow_path_wq;
 
+static int css_end_grace_period(struct device *dev, void *data)
+{
+	struct subchannel *sch = to_subchannel(dev);
+	int wait = 0;
+
+	spin_lock_irq(&sch->lock);
+	if (device_in_grace_period(sch)) {
+		device_kill_io(sch);
+		wait = 1;
+	}
+	spin_unlock_irq(&sch->lock);
+	if (wait)
+		device_wait_for_final_state(sch);
+
+	return 0;
+}
+
+void css_wait_for_slow_path(void)
+{
+	/* First flush outstanding notifications. */
+	flush_workqueue(ccw_device_notify_work);
+	/* Then deal with devices in the grace period. */
+	bus_for_each_dev(&css_bus_type, NULL, NULL, css_end_grace_period);
+	/* Now let pending slow work complete. */
+	flush_workqueue(slow_path_wq);
+}
+
 /* Reprobe subchannel if unregistered. */
 static int reprobe_subchannel(struct subchannel_id schid, void *data)
 {
diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h
index b999cb0..2f55aaf 100644
--- a/drivers/s390/cio/css.h
+++ b/drivers/s390/cio/css.h
@@ -177,6 +177,9 @@ void device_kill_io(struct subchannel *);
 void device_set_waiting(struct subchannel *);
 void device_set_intretry(struct subchannel *sch);
 int device_trigger_verify(struct subchannel *sch);
+int device_in_grace_period(struct subchannel *sch);
+void device_wait_for_final_state(struct subchannel *sch);
+void device_wake_up_wait_q(struct subchannel *sch);
 
 /* Machine check helper function. */
 void device_kill_pending_timer(struct subchannel *);
@@ -190,4 +193,5 @@ extern int need_rescan;
 
 extern struct workqueue_struct *slow_path_wq;
 extern struct work_struct slow_path_work;
+void css_wait_for_slow_path(void);
 #endif
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index ce01b63..a46ba66 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -848,6 +848,11 @@ io_subchannel_probe (struct subchannel *sch)
 			get_device(&cdev->dev);
 		return 0;
 	}
+	/* Check if we have a disconnected device with this devno. */
+	cdev = get_disc_ccwdev_by_devno(sch->schib.pmcw.dev, sch->schid.ssid,
+					NULL);
+	if (cdev)
+		ccw_device_remove_disconnected(cdev);
 	cdev = kzalloc (sizeof(*cdev), GFP_KERNEL);
 	if (!cdev)
 		return -ENOMEM;
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index ff267ea..842bf2c 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -121,6 +121,34 @@ static void device_set_timer(struct ccw_device *cdev, unsigned long delay,
 	}
 }
 
+int device_in_grace_period(struct subchannel *sch)
+{
+	struct ccw_device *cdev;
+
+	cdev = sch->dev.driver_data;
+	return (cdev && (cdev->private->state == DEV_STATE_WAIT4IO));
+}
+
+void device_wait_for_final_state(struct subchannel *sch)
+{
+	struct ccw_device *cdev;
+
+	cdev = sch->dev.driver_data;
+	if (cdev)
+		wait_event(cdev->private->wait_q,
+			   dev_fsm_final_state(cdev) ||
+			cdev->private->state == DEV_STATE_DISCONNECTED);
+}
+
+void device_wake_up_wait_q(struct subchannel *sch)
+{
+	struct ccw_device *cdev;
+
+	cdev = sch->dev.driver_data;
+	if (cdev)
+		wake_up(&cdev->private->wait_q);
+}
+
 /*
  * Timeout function. It just triggers a DEV_EVENT_TIMEOUT.
  */
@@ -197,8 +225,8 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev)
 	ret = stsch(sch->schid, &sch->schib);
 	if (ret || !sch->schib.pmcw.dnv)
 		return -ENODEV; 
-	if (!sch->schib.pmcw.ena || sch->schib.scsw.actl == 0)
-		/* Not operational or no activity -> done. */
+	if (!sch->schib.pmcw.ena)
+		/* Not operational -> done. */
 		return 0;
 	/* Stage 1: cancel io. */
 	if (!(sch->schib.scsw.actl & SCSW_ACTL_HALT_PEND) &&
@@ -397,19 +425,28 @@ ccw_device_oper_notify(void *data)
 	struct ccw_device *cdev;
 	struct subchannel *sch;
 	int ret;
+	unsigned long flags;
 
 	cdev = (struct ccw_device *)data;
+	spin_lock_irqsave(cdev->ccwlock, flags);
 	sch = to_subchannel(cdev->dev.parent);
-	ret = (sch->driver && sch->driver->notify) ?
-		sch->driver->notify(&sch->dev, CIO_OPER) : 0;
-	if (!ret)
-		/* Driver doesn't want device back. */
-		ccw_device_do_unreg_rereg((void *)cdev);
-	else {
+	if (sch->driver && sch->driver->notify) {
+		spin_unlock_irqrestore(cdev->ccwlock, flags);
+		ret = sch->driver->notify(&sch->dev, CIO_OPER);
+		spin_lock_irqsave(cdev->ccwlock, flags);
+	} else
+		ret = 0;
+	if (ret) {
 		/* Reenable channel measurements, if needed. */
+		spin_unlock_irqrestore(cdev->ccwlock, flags);
 		cmf_reenable(cdev);
+		spin_lock_irqsave(cdev->ccwlock, flags);
 		wake_up(&cdev->private->wait_q);
 	}
+	spin_unlock_irqrestore(cdev->ccwlock, flags);
+	if (!ret)
+		/* Driver doesn't want device back. */
+		ccw_device_do_unreg_rereg((void *)cdev);
 }
 
 /*
@@ -632,8 +669,10 @@ ccw_device_verify_done(struct ccw_device *cdev, int err)
 	default:
 		/* Reset oper notify indication after verify error. */
 		cdev->private->flags.donotify = 0;
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-		ccw_device_done(cdev, DEV_STATE_NOT_OPER);
+		if (cdev->online)
+			dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
+		else
+			ccw_device_done(cdev, DEV_STATE_NOT_OPER);
 		break;
 	}
 }
@@ -764,6 +803,7 @@ static void ccw_device_generic_notoper(struct ccw_device *cdev,
 		css_clear_subchannel_slow_list();
 		need_rescan = 1;
 	}
+	queue_work(slow_path_wq, &slow_path_work);
 }
 
 /*
@@ -945,16 +985,12 @@ ccw_device_killing_irq(struct ccw_device *cdev, enum dev_event dev_event)
 
 	sch = to_subchannel(cdev->dev.parent);
 	ccw_device_set_timeout(cdev, 0);
+	/* Start delayed path verification. */
+	ccw_device_online_verify(cdev, 0);
 	/* OK, i/o is dead now. Call interrupt handler. */
-	cdev->private->state = DEV_STATE_ONLINE;
 	if (cdev->handler)
 		cdev->handler(cdev, cdev->private->intparm,
 			      ERR_PTR(-ETIMEDOUT));
-	if (!sch->lpm)
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-	else if (cdev->private->flags.doverify)
-		/* Start delayed path verification. */
-		ccw_device_online_verify(cdev, 0);
 }
 
 static void
@@ -967,12 +1003,8 @@ ccw_device_killing_timeout(struct ccw_device *cdev, enum dev_event dev_event)
 		ccw_device_set_timeout(cdev, 3*HZ);
 		return;
 	}
-	if (ret == -ENODEV) {
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-		return;
-	}
-	//FIXME: Can we get here?
-	cdev->private->state = DEV_STATE_ONLINE;
+	/* Start delayed path verification. */
+	ccw_device_online_verify(cdev, 0);
 	if (cdev->handler)
 		cdev->handler(cdev, cdev->private->intparm,
 			      ERR_PTR(-ETIMEDOUT));
@@ -989,17 +1021,10 @@ void device_kill_io(struct subchannel *sch)
 		cdev->private->state = DEV_STATE_TIMEOUT_KILL;
 		return;
 	}
-	if (ret == -ENODEV) {
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-		return;
-	}
+	/* Start delayed path verification. */
+	ccw_device_online_verify(cdev, 0);
 	if (cdev->handler)
 		cdev->handler(cdev, cdev->private->intparm, ERR_PTR(-EIO));
-	if (!sch->lpm)
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-	else
-		/* Start delayed path verification. */
-		ccw_device_online_verify(cdev, 0);
 }
 
 static void
@@ -1023,15 +1048,15 @@ ccw_device_wait4io_irq(struct ccw_device *cdev, enum dev_event dev_event)
 
 	/* Iff device is idle, reset timeout. */
 	sch = to_subchannel(cdev->dev.parent);
-	if (!stsch(sch->schid, &sch->schib))
-		if (sch->schib.scsw.actl == 0)
+	if (!stsch(sch->schid, &sch->schib)) {
+		if (sch->schib.scsw.actl == 0) {
 			ccw_device_set_timeout(cdev, 0);
+			/* Start delayed path verification. */
+			ccw_device_online_verify(cdev, 0);
+		}
+	}
 	/* Call the handler. */
 	ccw_device_call_handler(cdev);
-	if (!sch->lpm)
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-	else
-		ccw_device_online_verify(cdev, 0);
 }
 
 static void
@@ -1048,18 +1073,11 @@ ccw_device_wait4io_timeout(struct ccw_device *cdev, enum dev_event dev_event)
 		cdev->private->state = DEV_STATE_TIMEOUT_KILL;
 		return;
 	}
-	if (ret == -ENODEV) {
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-		return;
-	}
+	/* Start delayed path verification. */
+	ccw_device_online_verify(cdev, 0);
 	if (cdev->handler)
 		cdev->handler(cdev, cdev->private->intparm,
 			      ERR_PTR(-ETIMEDOUT));
-	if (!sch->lpm)
-		dev_fsm_event(cdev, DEV_EVENT_NOTOPER);
-	else if (cdev->private->flags.doverify)
-		/* Start delayed path verification. */
-		ccw_device_online_verify(cdev, 0);
 }
 
 static void