Sophie: kernel-2.6.18-128.1.10.el5 src

kernel-2.6.18-128.1.10.el5.src.rpm

From: Mike Christie <mchristi@redhat.com>
Subject: [RHEL 5.1 PATCH] fix iscsi write handling regression (v2)
Date: Wed, 15 Aug 2007 01:52:16 -0500
Bugzilla: 247827
Message-Id: <1187160736.16777.1.camel@max>
Changelog: [scsi] fix iscsi write handling regression


This is for BZ 247827.

This is a resend of the patch from here
http://post-office.corp.redhat.com/archives/rhkernel-list/2007-July/msg00518.html

but the bug in 247827 had two issues and this patch now fixes them both.

The first bug:

In 5.0, we did not handle the iscsi command sequencing correctly and
some targets went crazy. For 5.1 beta, we implemented the command
sequencing handling, but we went too far and checked the sequencing when
we did not need to and would throttle IO when we did not need to as a
result. For heavy write workloads, this would end up causing a
performance regression if you are lucky, but the more common case would
be that we would throttle IO, it would time, scsi-ml's eh would run, we
would retry the IO, and we would throttle again and this would go on and
on until we ran out of command retries. We would then get IO errors and
file systems would be remounted.

This patch is in the scsi maintainer's tree here
http://git.kernel.org/?p=linux/kernel/git/jejb/scsi-rc-fixes-2.6.git;a=commitdiff;h=e07264071f7f2b02a2973cb28d9fdf5eb8866cc1

The second bug:

We do not want to send data if we are aborting a task. There is
a check in iscsi_xmit_ctask, but right before calling this we overwrite
the state so we always go right past the test. Sending data causes problems
because when we clean up from a successful abort the LLD assumes that
the task is not running. This was the cause of the oops seen in the bugzilla.
I was going to make two different bugzillas, but since the original patch
was not yet merged I am just combining them. Also while searching for this bug
I found a locking bug and some places where userspace could cause a problem
and fixed them.

This part of the patch was just sent to linux-scsi.

The patch was retested by doing heavy IO and making sure IO throughput
was what it should be at. Then doing heavy IO, setting the command timeout
to one second and letting the scsi eh fire over and over again. And at the
same time I would reboot the target or kill the network connection.


diff -aurp linux-2.6.18.noarch/drivers/scsi/libiscsi.c linux-2.6.18.noarch.work/drivers/scsi/libiscsi.c
--- linux-2.6.18.noarch/drivers/scsi/libiscsi.c	2007-08-14 21:37:13.000000000 -0500
+++ linux-2.6.18.noarch.work/drivers/scsi/libiscsi.c	2007-08-14 21:36:57.000000000 -0500
@@ -613,8 +613,10 @@ static void iscsi_prep_mtask(struct iscs
 	if (hdr->itt != RESERVED_ITT) {
 		hdr->itt = build_itt(mtask->itt, conn->id, session->age);
 		if (conn->c_stage == ISCSI_CONN_STARTED &&
-		    !(hdr->opcode & ISCSI_OP_IMMEDIATE))
+		    !(hdr->opcode & ISCSI_OP_IMMEDIATE)) {
+			session->queued_cmdsn++;
 			session->cmdsn++;
+		}
 	}
 
 	if (session->tt->init_mgmt_task)
@@ -658,9 +660,11 @@ static int iscsi_check_cmdsn_window_clos
 	/*
 	 * Check for iSCSI window and take care of CmdSN wrap-around
 	 */
-	if (!iscsi_sna_lte(session->cmdsn, session->max_cmdsn)) {
-		debug_scsi("iSCSI CmdSN closed. MaxCmdSN %u CmdSN %u\n",
-			   session->max_cmdsn, session->cmdsn);
+	if (!iscsi_sna_lte(session->queued_cmdsn, session->max_cmdsn)) {
+		debug_scsi("iSCSI CmdSN closed. ExpCmdSn %u MaxCmdSN %u "
+			   "CmdSN %u/%u\n", session->exp_cmdsn,
+			   session->max_cmdsn, session->cmdsn,
+			   session->queued_cmdsn);
 		return -ENOSPC;
 	}
 	return 0;
@@ -739,23 +743,25 @@ check_mgmt:
 
 	/* process command queue */
 	while (!list_empty(&conn->xmitqueue)) {
-		rc = iscsi_check_cmdsn_window_closed(conn);
-		if (rc) {
-			spin_unlock_bh(&conn->session->lock);
-			return rc;
-		}
 		/*
 		 * iscsi tcp may readd the task to the xmitqueue to send
 		 * write data
 		 */
 		conn->ctask = list_entry(conn->xmitqueue.next,
 					 struct iscsi_cmd_task, running);
-		if (conn->ctask->state == ISCSI_TASK_PENDING) {
+		switch (conn->ctask->state) {
+		case ISCSI_TASK_ABORTING:
+			break;
+		case ISCSI_TASK_PENDING:
 			iscsi_prep_scsi_cmd_pdu(conn->ctask);
 			conn->session->tt->init_cmd_task(conn->ctask);
+			/* fall through */
+		default:
+			conn->ctask->state = ISCSI_TASK_RUNNING;
+			break;
 		}
-		conn->ctask->state = ISCSI_TASK_RUNNING;
 		list_move_tail(conn->xmitqueue.next, &conn->run_list);
+
 		rc = iscsi_xmit_ctask(conn);
 		if (rc)
 			goto again;
@@ -850,12 +856,6 @@ int iscsi_queuecommand(struct scsi_cmnd 
 		goto fault;
 	}
 
-	/*
-	 * We check this here and in data xmit, because if we get to the point
-	 * that this check is hitting the window then we have enough IO in
-	 * flight and enough IO waiting to be transmitted it is better
-	 * to let the scsi/block layer queue up.
-	 */
 	if (iscsi_check_cmdsn_window_closed(conn)) {
 		reason = FAILURE_WINDOW_CLOSED;
 		goto reject;
@@ -866,6 +866,8 @@ int iscsi_queuecommand(struct scsi_cmnd 
 		reason = FAILURE_OOM;
 		goto reject;
 	}
+	session->queued_cmdsn++;
+
 	sc->SCp.phase = session->age;
 	sc->SCp.ptr = (char *)ctask;
 
@@ -1066,7 +1068,9 @@ static int iscsi_exec_abort_task(struct 
 	ctask->mtask = __iscsi_conn_send_pdu(conn, (struct iscsi_hdr *)hdr,
 					    NULL, 0);
 	if (!ctask->mtask) {
+		spin_unlock_bh(&session->lock);
 		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+		spin_lock_bh(&session->lock)
 		debug_scsi("abort sent failure [itt 0x%x]\n", ctask->itt);
 		return -EPERM;
 	}
@@ -1083,6 +1087,7 @@ static int iscsi_exec_abort_task(struct 
 		debug_scsi("abort set timeout [itt 0x%x]\n", ctask->itt);
 	}
 	spin_unlock_bh(&session->lock);
+	mutex_unlock(&session->eh_mutex);
 	scsi_queue_work(session->host, &conn->xmitwork);
 
 	/*
@@ -1100,6 +1105,7 @@ static int iscsi_exec_abort_task(struct 
 	if (signal_pending(current))
 		flush_signals(current);
 	del_timer_sync(&conn->tmabort_timer);
+	mutex_lock(&session->eh_mutex);
 	spin_lock_bh(&session->lock);
 	return 0;
 }
@@ -1157,7 +1163,13 @@ static void fail_command(struct iscsi_co
 	if (!sc)
 		return;
 
-	if (ctask->state != ISCSI_TASK_PENDING)
+	if (ctask->state == ISCSI_TASK_PENDING)
+		/*
+		 * cmd never made it to the xmit thread, so we should not count
+		 * the cmd in the sequencing
+		 */
+		conn->session->queued_cmdsn--;
+	else
 		conn->session->tt->cleanup_cmd_task(conn, ctask);
 	iscsi_ctask_mtask_cleanup(ctask);
 
@@ -1169,31 +1181,45 @@ static void fail_command(struct iscsi_co
 	__iscsi_put_ctask(ctask);
 }
 
+static void iscsi_suspend_tx(struct iscsi_conn *conn)
+{
+	set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx);
+	scsi_flush_work(conn->session->host);
+}
+
+static void iscsi_start_tx(struct iscsi_conn *conn)
+{
+	clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx);
+	scsi_queue_work(conn->session->host, &conn->xmitwork);
+}
+
 int iscsi_eh_abort(struct scsi_cmnd *sc)
 {
+	struct Scsi_Host *host = sc->device->host;
+	struct iscsi_session *session = iscsi_hostdata(host->hostdata);
 	struct iscsi_cmd_task *ctask;
 	struct iscsi_conn *conn;
-	struct iscsi_session *session;
 	int rc;
 
+	mutex_lock(&session->eh_mutex);
+	spin_lock_bh(&session->lock);
 	/*
 	 * if session was ISCSI_STATE_IN_RECOVERY then we may not have
 	 * got the command.
 	 */
 	if (!sc->SCp.ptr) {
 		debug_scsi("sc never reached iscsi layer or it completed.\n");
+		spin_unlock_bh(&session->lock);
+		mutex_unlock(&session->eh_mutex);
 		return SUCCESS;
 	}
 
 	ctask = (struct iscsi_cmd_task *)sc->SCp.ptr;
 	conn = ctask->conn;
-	session = conn->session;
 
 	conn->eh_abort_cnt++;
 	debug_scsi("aborting [sc %p itt 0x%x]\n", sc, ctask->itt);
 
-	spin_lock_bh(&session->lock);
-
 	/*
 	 * If we are not logged in or we have started a new session
 	 * then let the host reset code handle this
@@ -1230,6 +1256,7 @@ int iscsi_eh_abort(struct scsi_cmnd *sc)
 	switch (conn->tmabort_state) {
 	case TMABORT_SUCCESS:
 		spin_unlock_bh(&session->lock);
+		iscsi_suspend_tx(conn);
 		/*
 		 * clean up task if aborted. grab the recv lock as a writer
 		 */
@@ -1238,11 +1265,7 @@ int iscsi_eh_abort(struct scsi_cmnd *sc)
 		fail_command(conn, ctask, DID_ABORT << 16);
 		spin_unlock(&session->lock);
 		write_unlock_bh(conn->recv_lock);
-		/*
-		 * make sure xmit thread is not still touching the
-		 * ctask/scsi_cmnd
-		 */
-		scsi_flush_work(session->host);
+		iscsi_start_tx(conn);
 		goto success_unlocked;
 	case TMABORT_NOT_FOUND:
 		if (!ctask->sc) {
@@ -1262,12 +1285,14 @@ success:
 	spin_unlock_bh(&session->lock);
 success_unlocked:
 	debug_scsi("abort success [sc %lx itt 0x%x]\n", (long)sc, ctask->itt);
+	mutex_unlock(&session->eh_mutex);
 	return SUCCESS;
 
 failed:
 	spin_unlock_bh(&session->lock);
 failed_unlocked:
 	debug_scsi("abort failed [sc %lx itt 0x%x]\n", (long)sc, ctask->itt);
+	mutex_unlock(&session->eh_mutex);
 	return FAILED;
 }
 EXPORT_SYMBOL_GPL(iscsi_eh_abort);
@@ -1385,11 +1410,12 @@ iscsi_session_setup(struct iscsi_transpo
 	session->state = ISCSI_STATE_FREE;
 	session->mgmtpool_max = ISCSI_MGMT_CMDS_MAX;
 	session->cmds_max = ISCSI_XMIT_CMDS_MAX;
-	session->cmdsn = initial_cmdsn;
+	session->queued_cmdsn = session->cmdsn = initial_cmdsn;
 	session->exp_cmdsn = initial_cmdsn + 1;
 	session->max_cmdsn = initial_cmdsn + 1;
 	session->max_r2t = 1;
 	session->tt = iscsit;
+	mutex_init(&session->eh_mutex);
 
 	/* initialize SCSI PDU commands pool */
 	if (iscsi_pool_init(&session->cmdpool, session->cmds_max,
@@ -1620,11 +1646,8 @@ void iscsi_conn_teardown(struct iscsi_cl
 	kfree(conn->persistent_address);
 	__kfifo_put(session->mgmtpool.queue, (void*)&conn->login_mtask,
 		    sizeof(void*));
-	if (session->leadconn == conn) {
+	if (session->leadconn == conn)
 		session->leadconn = NULL;
-		/* no connections exits.. reset sequencing */
-		session->cmdsn = session->max_cmdsn = session->exp_cmdsn = 1;
-	}
 	spin_unlock_bh(&session->lock);
 
 	kfifo_free(conn->mgmtqueue);
@@ -1654,6 +1677,7 @@ int iscsi_conn_start(struct iscsi_cls_co
 	spin_lock_bh(&session->lock);
 	conn->c_stage = ISCSI_CONN_STARTED;
 	session->state = ISCSI_STATE_LOGGED_IN;
+	session->queued_cmdsn = session->cmdsn;
 
 	switch(conn->stop_stage) {
 	case STOP_CONN_RECOVER:
@@ -1736,9 +1760,22 @@ static void iscsi_start_session_recovery
 {
 	int old_stop_stage;
 
+	mutex_lock(&session->eh_mutex);
 	spin_lock_bh(&session->lock);
 	if (conn->stop_stage == STOP_CONN_TERM) {
 		spin_unlock_bh(&session->lock);
+		mutex_unlock(&session->eh_mutex);
+		return;
+	}
+
+	/*
+	 * The LLD either freed/unset the lock on us, or userspace called
+	 * stop but did not create a proper connection (connection was never
+	 * bound or it was unbound then stop was called).
+	 */
+	if (!conn->recv_lock) {
+		spin_unlock_bh(&session->lock);
+		mutex_unlock(&session->eh_mutex);
 		return;
 	}
 
@@ -1755,9 +1792,9 @@ static void iscsi_start_session_recovery
 	old_stop_stage = conn->stop_stage;
 	conn->stop_stage = flag;
 	conn->c_stage = ISCSI_CONN_STOPPED;
-	set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx);
 	spin_unlock_bh(&session->lock);
-	scsi_flush_work(session->host);
+
+	iscsi_suspend_tx(conn);
 
 	write_lock_bh(conn->recv_lock);
 	set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx);
@@ -1786,6 +1823,7 @@ static void iscsi_start_session_recovery
 	fail_all_commands(conn);
 	flush_control_queues(session, conn);
 	spin_unlock_bh(&session->lock);
+	mutex_unlock(&session->eh_mutex);
 }
 
 void iscsi_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
diff -aurp linux-2.6.18.noarch/include/scsi/libiscsi.h linux-2.6.18.noarch.work/include/scsi/libiscsi.h
--- linux-2.6.18.noarch/include/scsi/libiscsi.h	2007-08-14 21:37:13.000000000 -0500
+++ linux-2.6.18.noarch.work/include/scsi/libiscsi.h	2007-08-14 13:54:37.000000000 -0500
@@ -259,6 +259,15 @@ struct iscsi_session {
 	struct iscsi_mgmt_task	**mgmt_cmds;	/* Original mgmt arr */
 	struct iscsi_queue	mgmtpool;	/* Mgmt PDU's pool */
 #ifndef __GENKSYMS__
+	/* This tracks the reqs queued into the initiator */
+	uint32_t		queued_cmdsn;
+	/*
+	 * Syncs up the scsi eh thread with the iscsi eh thread when sending
+	 * task management functions. This must be taken before the session
+	 * and recv lock.
+	 */
+	struct mutex		eh_mutex;
+
 	char			*username;
 	char			*username_in;
 	char			*password;