Sophie: kernel-2.6.18-238.el5 src

kernel-2.6.18-238.el5.src.rpm

From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 06:53:50 -0500
Subject: [md] fix a race in dm-raid1
Message-id: <Pine.LNX.4.64.0912100147320.2811@hs20-bc2-1.build.redhat.com>
Patchwork-id: 21834
O-Subject: Re: [RHEL5.5 PATCH] bz502927: fix a race in dm-raid1
Bugzilla: 502927
RH-Acked-by: Jonathan E Brassow <jbrassow@redhat.com>

This patch fixes a race condition in dm-raid1 that results in reverting
already finished writes.

Suppose that the primary mirror leg fail. The data is written to the
secondary leg and signaled as completed to the application, dmeventd
attempts to remove the primary leg, bug the computer crashes before
dmeventd does so. On next reboot, the primary leg comes online back again
and dm-raid1 synchronizes the mirror, copying the stale data from the
primary leg to the secondary leg. --- the result: the write request,
signaled as finished is reverted. It may, for example, cause a revert of
transaction that was already committed.

The patch changes the behavior so that all writes are held until dmeventd
does its job.

For description and discussion of the problem, see:
https://bugzilla.redhat.com/show_bug.cgi?id=502927
https://www.redhat.com/archives/dm-devel/2009-April/msg00178.html
https://www.redhat.com/archives/dm-devel/2009-November/msg00176.html

Upstream status: to be submitted in the next cycle, in
ftp://ftp.kernel.org/pub/linux/kernel/people/agk/patches/2.6/editing

This patch is a combination of these upstream patches:
dm-raid1-add-framework-to-hold-bios-during-suspend.patch # 60997
dm-raid1-use-hold-framework-in-do_failures.patch # 60998
dm-raid1-abstract-get_valid_mirror-function.patch # 61000
dm-raid1-remove-bio_endio-from-dm_rh_mark_nosync.patch # 61001
dm-raid1-hold-write-bios-when-errors-are-handled.patch # 61004
dm-raid1-hold-all-write-bios-when-leg-fails.patch # 63219

Testing: I reproduced the race condition described in the bug report (by
manually stopping dmeventd with -STOP signal) and got my writes reverted.
When applying the patch, the writes were blocked until dmeventd did its
job.

Signed-off-by: Don Zickus <dzickus@redhat.com>

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f76c2e0..3f7ba40 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -134,6 +134,7 @@ struct mirror_set {
 	struct bio_list reads;
 	struct bio_list writes;
 	struct bio_list failures;
+	struct bio_list holds;	/* bios are waiting until suspend */
 
 	struct dm_io_client *io_client;
 
@@ -141,6 +142,7 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 	int log_failure;
+	int leg_failure;
 	atomic_t suspend;
 
 	struct mirror *default_mirror;	/* Default mirror */
@@ -874,6 +876,17 @@ use_mirror:
 	return ret;
 }
 
+static struct mirror *get_valid_mirror(struct mirror_set *ms)
+{
+	struct mirror *m;
+
+	for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
+		if (!atomic_read(&m->error_count))
+			return m;
+
+	return NULL;
+}
+
 /* fail_mirror
  * @m: mirror device to fail
  * @error_type: one of the enum's, DM_RAID1_*_ERROR
@@ -894,6 +907,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror *new;
 	struct dm_dirty_log *log = ms->rh.log;
 
+	ms->leg_failure = 1;
+
 	atomic_inc(&m->error_count);
 
 	if (test_and_set_bit(error_type, &m->error_type))
@@ -922,15 +937,12 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 		goto out;
 	}
 
-	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
-		if (!atomic_read(&new->error_count)) {
-			ms->default_mirror = new;
-			break;
-		}
-
-	if (unlikely(new == ms->mirror + ms->nr_mirrors))
-		DMWARN("All sides of mirror have failed.");
-
+	new = get_valid_mirror(ms);
+	if (new)
+		ms->default_mirror = new;
+	else
+  		DMWARN("All sides of mirror have failed.");
+  
 out:
 	schedule_work(&ms->trigger_event);
 }
@@ -972,6 +984,27 @@ static void map_region(struct io_region *io, struct mirror *m,
 	io->count = bio->bi_size >> 9;
 }
 
+static void hold_bio(struct mirror_set *ms, struct bio *bio)
+{
+	/*
+	 * If device is suspended, complete the bio.
+	 */
+	if (atomic_read(&ms->suspend)) {
+		if (dm_noflush_suspending(ms->ti))
+			bio_endio(bio, bio->bi_size, DM_ENDIO_REQUEUE);
+		else
+			bio_endio(bio, bio->bi_size, -EIO);
+		return;
+	}
+
+	/*
+	 * Hold bio until the suspend is complete.
+	 */
+	spin_lock_irq(&ms->lock);
+	bio_list_add(&ms->holds, bio);
+	spin_unlock_irq(&ms->lock);
+}
+
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
@@ -1065,8 +1098,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
  *
  * This function is _not_ interrupt safe!
  */
-static void __bio_mark_nosync(struct mirror_set *ms,
-			      struct bio *bio, unsigned int done, int error)
+static void __bio_mark_nosync(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned long flags;
 	struct region_hash *rh = &ms->rh;
@@ -1100,7 +1132,6 @@ static void __bio_mark_nosync(struct mirror_set *ms,
 	BUG_ON(!list_empty(&reg->list));
 	spin_unlock_irqrestore(&rh->region_lock, flags);
 
-	bio_endio(bio, done, error);
 	if (recovering)
 		complete_resync_work(reg, 0);
 }
@@ -1110,7 +1141,6 @@ static void write_callback(unsigned long error, void *context)
 	unsigned int i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-	int uptodate = 0;
 	int should_wake = 0;
 	unsigned long flags;
 
@@ -1123,36 +1153,27 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
-	if (unlikely(error)) {
-		for (i = 0; i < ms->nr_mirrors; i++) {
-			if (test_bit(i, &error))
-				fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-			else
-				uptodate = 1;
-		}
-
-		if (likely(uptodate)) {
-			/*
-			 * Need to raise event.  Since raising
-			 * events can block, we need to do it in
-			 * the main thread.
-			 */
-			spin_lock_irqsave(&ms->lock, flags);
-			if (!ms->failures.head)
-				should_wake = 1;
-			bio_list_add(&ms->failures, bio);
-			spin_unlock_irqrestore(&ms->lock, flags);
-			if (should_wake)
-				wake(ms);
-			return;
-		} else {
-			DMERR("All replicated volumes dead, failing I/O");
-			/* None of the writes succeeded, fail the I/O. */
-			ret = -EIO;
-		}
+	if (likely(!error)) {
+		bio_endio(bio, bio->bi_size, ret);
+		return;
 	}
 
-	bio_endio(bio, bio->bi_size, ret);
+	for (i = 0; i < ms->nr_mirrors; i++)
+		if (test_bit(i, &error))
+			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
+
+	/*
+	 * Need to raise event.  Since raising
+	 * events can block, we need to do it in
+	 * the main thread.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	if (!ms->failures.head)
+		should_wake = 1;
+	bio_list_add(&ms->failures, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
+	if (should_wake)
+		wake(ms);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -1271,8 +1292,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms->default_mirror, bio);
-		generic_make_request(bio);
+		if (unlikely(ms->leg_failure) &&
+		    log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK)
+			hold_bio(ms, bio);
+		else {
+			map_bio(ms->default_mirror, bio);
+			generic_make_request(bio);
+		}
 	}
 }
 
@@ -1280,44 +1306,48 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 {
 	struct bio *bio;
 
-	if (!failures->head)
+	if (likely(!failures->head))
 		return;
 
-	if (ms->log_failure) {
+	/*
+	 * If the log has failed, unattempted writes are being
+	 * put on the holds list.  We can't issue those writes
+	 * until a log has been marked, so we must store them.
+	 *
+	 * If a 'noflush' suspend is in progress, we can requeue
+	 * the I/O's to the core.  This give userspace a chance
+	 * to reconfigure the mirror, at which point the core
+	 * will reissue the writes.  If the 'noflush' flag is
+	 * not set, we have no choice but to return errors.
+	 *
+	 * Some writes on the failures list may have been
+	 * submitted before the log failure and represent a
+	 * failure to write to one of the devices.  It is ok
+	 * for us to treat them the same and requeue them
+	 * as well.
+	 */
+
+	while ((bio = bio_list_pop(failures))) {
+		struct dm_dirty_log *log = ms->rh.log;
+
+		if (!ms->log_failure)
+			__bio_mark_nosync(ms, bio);
+
 		/*
-		 * If the log has failed, unattempted writes are being
-		 * put on the failures list.  We can't issue those writes
-		 * until a log has been marked, so we must store them.
-		 *
-		 * If a 'noflush' suspend is in progress, we can requeue
-		 * the I/O's to the core.  This give userspace a chance
-		 * to reconfigure the mirror, at which point the core
-		 * will reissue the writes.  If the 'noflush' flag is
-		 * not set, we have no choice but to return errors.
-		 *
-		 * Some writes on the failures list may have been
-		 * submitted before the log failure and represent a
-		 * failure to write to one of the devices.  It is ok
-		 * for us to treat them the same and requeue them
-		 * as well.
+		 * If all the legs are dead, fail the I/O.
+		 * If we have been told to handle errors, hold the bio
+		 * and wait for userspace to deal with the problem.
+		 * Otherwise pretend that the I/O succeeded. (This would
+		 * be wrong if the failed leg returned after reboot and
+		 * got replicated back to the good legs.)
 		 */
-		if (dm_noflush_suspending(ms->ti)) {
-			while ((bio = bio_list_pop(failures)))
-				bio_endio(bio, bio->bi_size, DM_ENDIO_REQUEUE);
-		} else if (atomic_read(&ms->suspend)) {
-			while ((bio = bio_list_pop(failures)))
-				bio_endio(bio, bio->bi_size, -EIO);
-		} else {
-			spin_lock_irq(&ms->lock);
-			bio_list_merge(&ms->failures, failures);
-			spin_unlock_irq(&ms->lock);
-			delayed_wake(ms);
-		}
-		return;
-	}
-
-	while ((bio = bio_list_pop(failures)))
-		__bio_mark_nosync(ms, bio, bio->bi_size, 0);
+		if (!get_valid_mirror(ms))
+			bio_endio(bio, bio->bi_size, -EIO);
+		else if (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK)
+			hold_bio(ms, bio);
+		else
+			bio_endio(bio, bio->bi_size, 0);
+  	}
 }
 
 static void trigger_event(void *data)
@@ -1384,6 +1414,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
 	ms->log_failure = 0;
+	ms->leg_failure = 0;
 	atomic_set(&ms->suspend, 0);
 
 	spin_lock_init(&ms->choose_lock);
@@ -1738,6 +1769,9 @@ static void mirror_presuspend(struct dm_target *ti)
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = ms->rh.log;
 
+	struct bio_list holds;
+	struct bio *bio;
+
 	atomic_set(&ms->suspend, 1);
 
 	/*
@@ -1760,6 +1794,22 @@ static void mirror_presuspend(struct dm_target *ti)
 	 * we know that all of our I/O has been pushed.
 	 */
 	flush_workqueue(ms->kmirrord_wq);
+
+	/*
+	 * Now set ms->suspend is set and the workqueue flushed, no more
+	 * entries can be added to ms->hold list, so process it.
+	 *
+	 * Bios can still arrive concurrently with or after this
+	 * presuspend function, but they cannot join the hold list
+	 * because ms->suspend is set.
+	 */
+	spin_lock_irq(&ms->lock);
+	holds = ms->holds;
+	bio_list_init(&ms->holds);
+	spin_unlock_irq(&ms->lock);
+
+	while ((bio = bio_list_pop(&holds)))
+		hold_bio(ms, bio);
 }
 
 static void mirror_postsuspend(struct dm_target *ti)