From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 12 May 2009 22:54:34 +0200
Subject: [md] dm: raid45 target doesn't create parity as expected
Message-id: 1242161674.28404.57.camel@o
O-Subject: [RHEL 5.4 PATCH] dm: raid45 target doesn't create parity as expected by dmraid (isw)
Bugzilla: 499406
RH-Acked-by: Jonathan Brassow <jbrassow@redhat.com>

RHEL5.4 device mapper: raid45 target regression

When a RAID5 array with a small number of disks (3-4) is being used,
dmraid doesn't enforce a complete resynchronization of the RAID array
but expects the dm-raid45 target to always create parity, whereas the
new raid45 target (introduced with the patches for bz#479383 and
bz#480733 on April 23) expects synchronization to be enforced by
userspace for such arrays.
The patch also fixes a potential error-message flood with dead arrays.
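
For illustration only (not part of the patch itself), the enforcement
decision boils down to a threshold on the member count. A minimal
user-space sketch, where enforce_parity_creation() and the constant 3
for RAID5's minimal_devs are assumptions of this sketch:

#include <stdbool.h>
#include <stdio.h>

/* RAID5 needs at least 3 members; enforce parity creation for 3-4 disks. */
static bool enforce_parity_creation(unsigned raid_devs, unsigned minimal_devs)
{
	return raid_devs <= minimal_devs + 1;
}

int main(void)
{
	unsigned devs;

	for (devs = 3; devs <= 6; devs++)
		printf("%u disks: enforce parity creation = %d\n",
		       devs, enforce_parity_creation(devs, 3));

	return 0;
}

In the target itself this corresponds to setting the new
RS_ENFORCE_PARITY_CREATION flag in raid_ctr() and taking the
reconstruction read path even when the dirty log claims the region is
in sync (see the stripe_check_reconstruct() hunk below).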

Resolves: BZ 499406

Don,
this patch also addresses the missing ()'s you found in my initial patch
submission on April 23.
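
As background for the endio hunk below: the message-flood fix is just a
test-and-set "log once" flag on the RAID set. A stand-alone sketch of
that pattern (the real code uses the BITOPS()-generated
TestSetRSDeadEndioMessage() helper; atomic_flag here is only an
illustration):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag dead_msg_emitted = ATOMIC_FLAG_INIT;

static void fail_io_on_dead_set(void)
{
	/* Only the first caller finds the flag clear and logs the error. */
	if (!atomic_flag_test_and_set(&dead_msg_emitted))
		fprintf(stderr, "RAID set dead: failing ios to dead devices\n");

	/* ... fail the stripe's pending io here ... */
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		fail_io_on_dead_set();	/* message printed only once */

	return 0;
}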

diff --git a/drivers/md/dm-raid45.c b/drivers/md/dm-raid45.c
index bd32c7a..89374f9 100644
--- a/drivers/md/dm-raid45.c
+++ b/drivers/md/dm-raid45.c
@@ -37,7 +37,7 @@
  * FIXME: recovery bandwidth
  */ 
 
-static const char *version = "v0.2594b";
+static const char *version = "v0.2594l";
 
 #include "dm.h"
 #include "dm-bio-list.h"
@@ -365,8 +365,10 @@ BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
 enum raid_set_flags {
 	RS_CHECK_OVERWRITE,	/* Check for chunk overwrites. */
 	RS_DEAD,		/* RAID set inoperational. */
+	RS_DEAD_ENDIO_MESSAGE,	/* RAID set dead endio one-off message. */
 	RS_DEGRADED,		/* Io errors on RAID device. */
 	RS_DEVEL_STATS,		/* REMOVEME: display status information. */
+	RS_ENFORCE_PARITY_CREATION,/* Enforce parity creation. */
 	RS_RECOVER,		/* Do recovery. */
 	RS_RECOVERY_BANDWIDTH,	/* Allow recovery bandwidth (delayed bios). */
 	RS_SC_BUSY,		/* Stripe cache busy -> send an event. */
@@ -571,8 +573,10 @@ struct raid_set {
 BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
 BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
 BITOPS(RS, Dead, raid_set, RS_DEAD)
+BITOPS(RS, DeadEndioMessage, raid_set, RS_DEAD_ENDIO_MESSAGE)
 BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
 BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
+BITOPS(RS, EnforceParityCreation, raid_set, RS_ENFORCE_PARITY_CREATION)
 BITOPS(RS, Recover, raid_set, RS_RECOVER)
 BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
 BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
@@ -2310,7 +2314,9 @@ static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
 
 	BUG_ON(!io.count);
 	io.sector += dev->start;	/* Add <offset>. */
-	recover_io_count(stripe);	/* Recovery io accounting. */
+
+	if (RSRecover(rs))
+		recover_io_count(stripe);	/* Recovery io accounting. */
 
 	/* REMOVEME: statistics. */
 	atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
@@ -2526,9 +2532,11 @@ static int stripe_check_reconstruct(struct stripe *stripe)
 		 * Reconstruction mode (ie. a particular (replaced) device or
 		 * some (rotating) parity chunk is being resynchronized) ->
 		 *   o make sure all needed chunks are read in
-		 *   o writes are allowed to go through
-		 */
-		if (!sync) {
+		 *   o cope with 3/4 disk array special case where it
+		 *     doesn't make a difference to read in parity
+		 *     to xor data in/out
+                 */
+		if (RSEnforceParityCreation(rs) || !sync) {
 			/* REMOVEME: statistics. */
 			atomic_inc(rs->stats + S_NOSYNC);
 			/* Allow IO on all devs but the one to reconstruct. */
@@ -2965,7 +2973,7 @@ static int _do_recovery(struct stripe *stripe)
 		goto err;
 
 	/* Recovery end required. */
-	if (!RSRecover(rs))
+	if (unlikely(RSDegraded(rs)))
 		goto err;
 
 	/* Get a region to recover. */
@@ -3018,11 +3026,14 @@ err:
 	/* Check state of partially recovered array. */
 	if (RSDegraded(rs) && !RSDead(rs) &&
 	    rs->set.dev_to_init != -1 &&
-	    rs->set.ei != rs->set.dev_to_init)
+	    rs->set.ei != rs->set.dev_to_init) {
 		/* Broken drive != drive to recover -> FATAL. */
 		SetRSDead(rs);
+		DMERR("FATAL: failed device != device to initialize -> "
+		      "RAID set broken");
+	}
 
-	if (StripeError(stripe)) {
+	if (StripeError(stripe) || RSDegraded(rs)) {
 		char buf[BDEVNAME_SIZE];
 
 		DMERR("stopping recovery due to "
@@ -3121,7 +3132,9 @@ static void _do_endios(struct raid_set *rs, struct stripe *stripe,
 
 	/* If RAID set is dead -> fail any ios to dead drives. */
 	if (RSDead(rs)) {
-		DMERR_LIMIT("RAID set dead: failing ios to dead devices");
+		if (!TestSetRSDeadEndioMessage(rs))
+			DMERR("RAID set dead: failing ios to dead devices");
+
 		stripe_fail_io(stripe);
 	}
 
@@ -4079,6 +4092,15 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	/*
+	 * Enable parity chunk creation enforcement for
+	 * small numbers of array members where it doesn't
+	 * gain us performance to xor parity out and back in as
+	 * with larger array member numbers.
+	 */
+	if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
+		SetRSEnforceParityCreation(rs);
+
+	/*
 	 * Make sure that dm core only hands maximum io size
 	 * length down and pays attention to io boundaries.
 	 */
@@ -4224,7 +4246,7 @@ static void raid_devel_stats(struct dm_target *ti, char *result,
 	struct recover *rec = &rs->recover;
 	struct timespec ts;
 
-	DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks);
+	DMEMIT("%s %s=%u\n", version, rs->xor.f->name, rs->xor.chunks);
 	DMEMIT("act_ios=%d ", io_ref(rs));
 	DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
 	DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
@@ -4261,6 +4283,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 	unsigned p, sz = 0;
 	char buf[BDEVNAME_SIZE];
 	struct raid_set *rs = ti->private;
+	struct dm_dirty_log *dl = rs->recover.dl;
 	int raid_parms[] = {
 		rs->set.chunk_size_parm,
 		rs->sc.stripes_parm,
@@ -4283,7 +4306,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			DMEMIT("%s ",
 			       format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
 
-		DMEMIT("1 ");
+		DMEMIT("2 ");
 		for (p = 0; p < rs->set.raid_devs; p++) {
 			DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
 
@@ -4294,12 +4317,15 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 				DMEMIT("i");
 		}
 
+		DMEMIT(" %llu/%llu ",
+		       (unsigned long long) dl->type->get_sync_count(dl),
+		       (unsigned long long) rs->recover.nr_regions);
+		sz += dl->type->status(dl, type, result+sz, maxlen-sz);
 		break;
 	case STATUSTYPE_TABLE:
 		sz = rs->recover.dl->type->status(rs->recover.dl, type,
 						  result, maxlen);
-		DMEMIT("%s %u ", rs->set.raid_type->name,
-		       rs->set.raid_parms);
+		DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);
 
 		for (p = 0; p < rs->set.raid_parms; p++) {
 			if (raid_parms[p] > -2)