Sophie: kernel-2.6.18-128.1.10.el5 src

kernel-2.6.18-128.1.10.el5.src.rpm

From: Jonathan Brassow <jbrassow@redhat.com>
Subject: [PATCH RHEL 5.1] bz236271 1/2  dm-raid1.c: fix EIO on writes after 	log failure
Date: Tue, 08 May 2007 13:14:30 -0500
Bugzilla: 236271
Message-Id: <1178648070.10582.18.camel@hydrogen.msp.redhat.com>
Changelog: [md] fix EIO on writes after log failure


Bug #236271: dm-mirror: Writes to mirror after log failure return EIO

Without the ability to requeue bio's in core device-mapper, mirroring
was forced to return EIO during writes when the log device died.

Since we can now requeue, we don't need to return EIO.  We can requeue
the writes and reconfigure the mirror (in userspace).

If a write to the log produces and error, the pending writes are
put on the "failures" list.  Since the log is marked as failed,
they will stay on the failures list until a suspend happens.

Suspends come in two phases, presuspend and postsuspend.  We must
make sure that all the writes on the failures list are requeued
in the presuspend phase (a requirement of dm core).  This means
that recovery must be complete
(because writes may be delayed behind it) and the failures list
must be requeued before we return from presuspend.

The mechanisms to ensure recovery is complete (or stopped) was
already in place, but needed to be moved from postsuspend to
presuspend.  We rely on 'flush_workqueue' to ensure that the
mirror thread is complete and therefore, has requeued all writes
in the failures list.

Because we are using flush_workqueue, we must ensure that no
additional 'queue_work' calls will produce additional I/O
that we need to requeue (because once we return from
presuspend, we are unable to do anything about it).  'queue_work'
is called in response to the following functions:
- complete_resync_work = NA, recovery is stopped
- rh_dec (mirror_end_io) = NA, only calls 'queue_work' if it
                           is ready to recover the region
                           (recovery is stopped) or it needs
                           to clear the region in the log*
                           **this doesn't get called while
                           suspending**
- rh_recovery_end = NA, recovery is stopped
- rh_recovery_start = NA, recovery is stopped
- write_callback = 1) Writes w/o failures simply call
                   bio_endio -> mirror_end_io -> rh_dec
                   (see rh_dec above)
                   2) Writes with failures are put on
                   the failures list and queue_work is
                   called**
                   ** write_callbacks don't happen
                   during suspend **
- do_failures = NA, 'queue_work' not called if suspending
- add_mirror (initialization) = NA, only done on mirror creation
- queue_bio = NA, 1) delayed I/O scheduled before flush_workqueue
              is called.  2) No more I/Os are being issued.
              3) Re-attempted READs can still be handled.
              (Write completions are handled through rh_dec/
              write_callback - mention above - and do not
              use queue_bio.)

Index: linux-rhel5/drivers/md/dm-raid1.c
===================================================================
--- linux-rhel5.orig/drivers/md/dm-raid1.c
+++ linux-rhel5/drivers/md/dm-raid1.c
@@ -135,6 +135,8 @@ struct mirror_set {
 	/* recovery */
 	region_t nr_regions;
 	int in_sync;
+	int log_failure;
+	atomic_t suspend;
 
 	struct mirror *default_mirror;	/* Default mirror */
 
@@ -363,9 +365,19 @@ static void complete_resync_work(struct 
 	struct region_hash *rh = reg->rh;
 
 	rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+	/*
+	 * Dispatch the bios before we call 'wake_up_all'.
+	 * This is important because if we are suspending,
+	 * we want to know that recovery is complete and
+	 * the work queue is flushed.  If we wake_up_all
+	 * before we dispatch_bios (queue bios and call wake()),
+	 * then we risk suspending before the work queue
+	 * has been properly flushed.
+	 */
+	dispatch_bios(rh->ms, &reg->delayed_bios);
 	if (atomic_dec_and_test(&rh->recovery_in_flight))
 		wake_up_all(&recovery_stopped_event);
-	dispatch_bios(rh->ms, &reg->delayed_bios);
 	up(&rh->recovery_count);
 }
 
@@ -1062,8 +1074,6 @@ static void write_callback(unsigned long
 	 * regions with the same code.
 	 */
 	if (unlikely(error)) {
-		DMERR("Error during write occurred.");
-
 		/*
 		 * If the log is intact, we can play around with trying
 		 * to handle the failure.  Otherwise, we have to report
@@ -1147,7 +1157,7 @@ static void do_write(struct mirror_set *
 
 static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 {
-	int state, r;
+	int state;
 	struct bio *bio;
 	struct bio_list sync, nosync, recover, *this_list = NULL;
 	struct bio_list requeue;
@@ -1209,13 +1219,19 @@ static void do_writes(struct mirror_set 
 	rh_inc_pending(&ms->rh, &sync);
 	rh_inc_pending(&ms->rh, &nosync);
 
-	r = rh_flush(&ms->rh);
+	ms->log_failure = rh_flush(&ms->rh);
 
 	/*
 	 * Dispatch io.
 	 */
-	while ((bio = bio_list_pop(&sync)))
-		do_write(ms, bio, r ? 1 : 0);
+	if (ms->log_failure) {
+		spin_lock_irq(&ms->lock);
+		bio_list_merge(&ms->failures, &sync);
+		spin_unlock_irq(&ms->lock);
+	} else {
+		while ((bio = bio_list_pop(&sync)))
+			do_write(ms, bio, 0);
+	}
 
 	while ((bio = bio_list_pop(&recover)))
 		rh_delay(&ms->rh, bio);
@@ -1234,6 +1250,39 @@ static void do_failures(struct mirror_se
 	if (!failures->head)
 		return;
 
+	if (ms->log_failure) {
+		/*
+		 * If the log has failed, unattempted writes are being
+		 * put on the failures list.  We can't issue those writes
+		 * until a log has been marked, so we must store them.
+		 *
+		 * If a 'noflush' suspend is in progress, we can requeue
+		 * the I/O's to the core.  This give userspace a chance
+		 * to reconfigure the mirror, at which point the core
+		 * will reissue the writes.  If the 'noflush' flag is
+		 * not set, we have no choice but to return errors.
+		 *
+		 * Some writes on the failures list may have been
+		 * submitted before the log failure and represent a
+		 * failure to write to one of the devices.  It is ok
+		 * for us to treat them the same and requeue them
+		 * as well.
+		 */
+		if (dm_noflush_suspending(ms->ti)) {
+			while ((bio = bio_list_pop(failures)))
+				bio_endio(bio, bio->bi_size, DM_ENDIO_REQUEUE);
+		} else if (atomic_read(&ms->suspend)) {
+			while ((bio = bio_list_pop(failures)))
+				bio_endio(bio, bio->bi_size, -EIO);
+		} else {
+			spin_lock_irq(&ms->lock);
+			bio_list_merge(&ms->failures, failures);
+			spin_unlock_irq(&ms->lock);
+			wake(ms);
+		}
+		return;
+	}
+
 	if (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK)
 		dm_table_event(ms->ti->table);
 
@@ -1303,6 +1352,8 @@ static struct mirror_set *alloc_context(
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
+	ms->log_failure = 0;
+	atomic_set(&ms->suspend, 0);
 	ms->read_mirror = &ms->mirror[DEFAULT_MIRROR];
 	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
 
@@ -1648,6 +1699,25 @@ static void mirror_presuspend(struct dm_
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dirty_log *log = ms->rh.log;
 
+	atomic_set(&ms->suspend, 1);
+
+	/*
+	 * We must finish up all the work that we've
+	 * generated (i.e. recovery work).
+	 */
+	rh_stop_recovery(&ms->rh);
+
+	wait_event(recovery_stopped_event,
+		   !atomic_read(&ms->rh.recovery_in_flight));
+
+	/*
+	 * Now that recovery is complete/stopped and the
+	 * delayed bios are queued, we need to wait for
+	 * the worker thread to complete.  This way,
+	 * we know that all of our I/O has been pushed.
+	 */
+	flush_workqueue(ms->kmirrord_wq);
+
 	if (log->type->presuspend && log->type->presuspend(log))
 		/* FIXME: need better error handling */
 		DMWARN("log presuspend failed");
@@ -1658,12 +1728,6 @@ static void mirror_postsuspend(struct dm
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dirty_log *log = ms->rh.log;
 
-	rh_stop_recovery(&ms->rh);
-
-	/* Wait for all I/O we generated to complete */
-	wait_event(recovery_stopped_event,
-		   !atomic_read(&ms->rh.recovery_in_flight));
-
 	if (log->type->postsuspend && log->type->postsuspend(log))
 		/* FIXME: need better error handling */
 		DMWARN("log postsuspend failed");
@@ -1673,6 +1737,8 @@ static void mirror_resume(struct dm_targ
 {
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dirty_log *log = ms->rh.log;
+
+	atomic_set(&ms->suspend, 0);
 	if (log->type->resume && log->type->resume(log))
 		/* FIXME: need better error handling */
 		DMWARN("log resume failed");

Index: linux-rhel5/drivers/md/dm-raid1.c
===================================================================
--- linux-rhel5.orig/drivers/md/dm-raid1.c
+++ linux-rhel5/drivers/md/dm-raid1.c
@@ -26,8 +26,6 @@
 
 DECLARE_WAIT_QUEUE_HEAD(recovery_stopped_event);
 
-static int dm_mirror_error_on_log_failure = 1;
-
 /*-----------------------------------------------------------------
  * Region hash
  *
@@ -1056,7 +1054,7 @@ static void __bio_mark_nosync(struct mir
 		complete_resync_work(reg, 0);
 }
 
-static void write_callback(unsigned long error, void *context, int log_failure)
+static void write_callback(unsigned long error, void *context)
 {
 	unsigned int i, ret = 0;
 	struct bio *bio = (struct bio *) context;
@@ -1074,18 +1072,11 @@ static void write_callback(unsigned long
 	 * regions with the same code.
 	 */
 	if (unlikely(error)) {
-		/*
-		 * If the log is intact, we can play around with trying
-		 * to handle the failure.  Otherwise, we have to report
-		 * the I/O as failed.
-		 */
-		if (!log_failure) {
-			for (i = 0; i < ms->nr_mirrors; i++) {
-				if (test_bit(i, &error))
-					fail_mirror(ms->mirror + i);
-				else
-					uptodate = 1;
-			}
+		for (i = 0; i < ms->nr_mirrors; i++) {
+			if (test_bit(i, &error))
+				fail_mirror(ms->mirror + i);
+			else
+				uptodate = 1;
 		}
 
 		if (likely(uptodate)) {
@@ -1112,17 +1103,7 @@ static void write_callback(unsigned long
 	bio_endio(bio, bio->bi_size, ret);
 }
 
-static void write_callback_good_log(unsigned long error, void *context)
-{
-	write_callback(error, context, 0);
-}
-
-static void write_callback_bad_log(unsigned long error, void *context)
-{
-	write_callback(error, context, 1);
-}
-
-static void do_write(struct mirror_set *ms, struct bio *bio, int log_failure)
+static void do_write(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned int i;
 	struct io_region io[ms->nr_mirrors], *dest = io;
@@ -1131,16 +1112,11 @@ static void do_write(struct mirror_set *
 		.bi_rw = WRITE,
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
-		.notify.fn = NULL,
+		.notify.fn = write_callback,
 		.notify.context = bio,
 		.client = ms->io_client,
 	};
 
-	if (log_failure && dm_mirror_error_on_log_failure) {
-		bio_endio(bio, bio->bi_size, -EIO);
-		return;
-	}
-
 	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
 		map_region(dest++, m, bio);
 
@@ -1150,8 +1126,6 @@ static void do_write(struct mirror_set *
 	 * to the mirror set in write_callback().
 	 */
 	bio_set_m(bio, ms->default_mirror);
-	io_req.notify.fn = log_failure ? write_callback_bad_log :
-					 write_callback_good_log;
 	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
 }
 
@@ -1230,7 +1204,7 @@ static void do_writes(struct mirror_set 
 		spin_unlock_irq(&ms->lock);
 	} else {
 		while ((bio = bio_list_pop(&sync)))
-			do_write(ms, bio, 0);
+			do_write(ms, bio);
 	}
 
 	while ((bio = bio_list_pop(&recover)))
@@ -1811,15 +1785,6 @@ static int __init dm_mirror_init(void)
 		DMERR("%s: Failed to register mirror target",
 		      mirror_target.name);
 		dm_dirty_log_exit();
-	} else if (!dm_mirror_error_on_log_failure) {
-		DMWARN("Warning: dm_mirror_error_on_log_failure = 0");
-		DMWARN("In this mode, the following fault sequence could cause corruption:");
-		DMWARN("  1) Log device failure");
-		DMWARN("  2) Write I/O issued");
-		DMWARN("  3) Machine failure");
-		DMWARN("  4) Log device restored");
-		DMWARN("  5) Machine reboots");
-		DMWARN("If this happens, you must resync your mirror.");
 	}
 
 	return r;
@@ -1840,8 +1805,6 @@ static void __exit dm_mirror_exit(void)
 module_init(dm_mirror_init);
 module_exit(dm_mirror_exit);
 
-module_param(dm_mirror_error_on_log_failure, int, 1);
-MODULE_PARM_DESC(dm_mirror_error_on_log_failure, "Set to '0' if you want writes to succeed on log device failure");
 MODULE_DESCRIPTION(DM_NAME " mirror target");
 MODULE_AUTHOR("Joe Thornber");
 MODULE_LICENSE("GPL");